Documentation
¶
Overview ¶
Package mediawiki provides utilities for processing Wikipedia and Wikidata dumps.
Index ¶
- Variables
- func DecodeImageMetadata(metadata interface{}) (map[string]interface{}, errors.E)
- func LatestCommonsEntitiesRun(ctx context.Context, client *retryablehttp.Client) (string, errors.E)
- func LatestCommonsImageMetadataRun(ctx context.Context, client *retryablehttp.Client) (string, errors.E)
- func LatestWikidataEntitiesRun(ctx context.Context, client *retryablehttp.Client) (string, errors.E)
- func LatestWikipediaImageMetadataRun(ctx context.Context, client *retryablehttp.Client, language string) (string, errors.E)
- func LatestWikipediaRun(ctx context.Context, client *retryablehttp.Client, language string, ...) (string, errors.E)
- func Process[T any](ctx context.Context, config *ProcessConfig[T]) errors.E
- func ProcessCommonsEntitiesDump(ctx context.Context, config *ProcessDumpConfig, ...) errors.E
- func ProcessWikidataDump(ctx context.Context, config *ProcessDumpConfig, ...) errors.E
- func ProcessWikipediaDump(ctx context.Context, config *ProcessDumpConfig, ...) errors.E
- type Amount
- type Article
- type ArticleBody
- type CalendarModel
- type Category
- type Compression
- type DataType
- type DataValue
- type Editor
- type Entity
- type EntityRef
- type EntityType
- type ErrorValue
- type Event
- type FileType
- type GlobeCoordinateValue
- type Image
- type InLanguage
- type InfoBox
- type IsPartOf
- type LanguageValue
- type License
- type Link
- type MaintenanceTags
- type MonolingualTextValue
- type Namespace
- type Probability
- type ProcessConfig
- type ProcessDumpConfig
- type Protection
- type QuantityValue
- type Redirect
- type Reference
- type Score
- type Scores
- type SiteLink
- type Size
- type Snak
- type SnakType
- type Statement
- type StatementRank
- type StatementType
- type StringValue
- type Template
- type TimePrecision
- type TimeValue
- type Version
- type Visibility
- type WikiBaseEntityIDValue
- type WikiBaseEntityType
Constants ¶
This section is empty.
Variables ¶
Functions ¶
func DecodeImageMetadata ¶
DecodeImageMetadata decodes the metadata column of the image table, which holds metadata for images and other uploaded files. See: https://www.mediawiki.org/wiki/Manual:Image_table
func LatestCommonsEntitiesRun ¶
LatestCommonsEntitiesRun returns URL of the latest run of Wikimedia Commons entities JSON dump.
func LatestCommonsImageMetadataRun ¶
func LatestCommonsImageMetadataRun(ctx context.Context, client *retryablehttp.Client) (string, errors.E)
LatestCommonsImageMetadataRun returns URL of the latest run of Wikimedia Commons image table dump.
func LatestWikidataEntitiesRun ¶
func LatestWikidataEntitiesRun(ctx context.Context, client *retryablehttp.Client) (string, errors.E)
LatestWikidataEntitiesRun returns URL of the latest run of Wikidata entities JSON dump.
func LatestWikipediaImageMetadataRun ¶
func LatestWikipediaImageMetadataRun(ctx context.Context, client *retryablehttp.Client, language string) (string, errors.E)
LatestWikipediaImageMetadataRun returns URL of the latest run of Wikipedia image table dump. Use "enwiki" for English Wikipedia.
func LatestWikipediaRun ¶
func LatestWikipediaRun(ctx context.Context, client *retryablehttp.Client, language string, namespace int) (string, errors.E)
LatestWikipediaRun returns URL of the latest run of Wikimedia Enterprise HTML dump. Use "enwiki" for English Wikipedia and namespace 0 for its articles.
func Process ¶
Process is a low-level function which decompresses a file (supports Compression compressions), extracts JSONs or SQL statements from it (stored in FileType types), decodes JSONs or SQL statements, and calls Process callback on each decoded JSON or SQL statement. All that is done in parallel fashion, controlled by DecompressionThreads, DecodingThreads, and ItemsProcessingThreads. The file is downloaded from an HTTP URL and is processed already during download. The downloaded file is optionally saved (to a file at Path) and followup calls to Process can use a saved file (if the same Path is provided).
func ProcessCommonsEntitiesDump ¶
func ProcessCommonsEntitiesDump( ctx context.Context, config *ProcessDumpConfig, processEntity func(context.Context, Entity) errors.E, ) errors.E
ProcessCommonsEntitiesDump downloads (unless already saved), decompresses, decodes JSON, and calls processEntity on every entity in a Wikimedia Commons entities JSON dump.
func ProcessWikidataDump ¶
func ProcessWikidataDump( ctx context.Context, config *ProcessDumpConfig, processEntity func(context.Context, Entity) errors.E, ) errors.E
ProcessWikidataDump downloads (unless already saved), decompresses, decodes JSON, and calls processEntity on every entity in a Wikidata entities JSON dump.
func ProcessWikipediaDump ¶
func ProcessWikipediaDump( ctx context.Context, config *ProcessDumpConfig, processArticle func(context.Context, Article) errors.E, ) errors.E
ProcessWikipediaDump downloads (unless already saved), decompresses, decodes JSON, and calls processArticle on every article in a Wikimedia Enterprise HTML dump.
Types ¶
type Amount ¶
Amount is an arbitrary precision number and extends big.Rat.
func (Amount) MarshalJSON ¶
MarshalJSON implements json.Marshaler interface for Amount.
func (*Amount) UnmarshalJSON ¶
UnmarshalJSON implements json.Unmarshaler interface for Amount.
type Article ¶
type Article struct { Name string `json:"name"` Identifier int64 `json:"identifier"` Abstract string `json:"abstract,omitempty"` WatchersCount int64 `json:"watchers_count,omitempty"` DateCreated time.Time `json:"date_created"` DateModified time.Time `json:"date_modified"` DatePreviouslyModified *time.Time `json:"date_previously_modified,omitempty"` Protection []Protection `json:"protection,omitempty"` Version Version `json:"version"` PreviousVersion *Version `json:"previous_version,omitempty"` URL string `json:"url"` Namespace Namespace `json:"namespace"` InLanguage InLanguage `json:"in_language"` MainEntity *EntityRef `json:"main_entity,omitempty"` AdditionalEntities []EntityRef `json:"additional_entities,omitempty"` Categories []Category `json:"categories,omitempty"` Templates []Template `json:"templates,omitempty"` Redirects []Redirect `json:"redirects,omitempty"` IsPartOf IsPartOf `json:"is_part_of"` ArticleBody ArticleBody `json:"article_body"` License []License `json:"license,omitempty"` Visibility *Visibility `json:"visibility,omitempty"` Image *Image `json:"image,omitempty"` Event Event `json:"event"` InfoBox []InfoBox `json:"infobox,omitempty"` }
Article is a Wikimedia Enterprise HTML dump article.
type ArticleBody ¶
type CalendarModel ¶
type CalendarModel int
const ( Gregorian CalendarModel = iota Julian )
func (CalendarModel) MarshalJSON ¶
func (t CalendarModel) MarshalJSON() ([]byte, error)
MarshalJSON implements json.Marshaler interface for CalendarModel.
Go enumeration values are converted to corresponding calendar Wikidata URIs. Those might be different (but equivalent) than what it was in the source dump.
func (*CalendarModel) UnmarshalJSON ¶
func (t *CalendarModel) UnmarshalJSON(b []byte) error
UnmarshalJSON implements json.Unmarshaler interface for CalendarModel.
It normalizes calendar Wikidata URIs to Go enumeration values.
type Compression ¶
type Compression int
const ( NoCompression Compression = iota Tar BZIP2 BZIP2Tar GZIP GZIPTar )
type DataValue ¶
type DataValue struct {
Value interface{} `json:"value"`
}
DataValue provides parsed value as Go value in Value.
Value can be one of ErrorValue, StringValue, WikiBaseEntityIDValue, GlobeCoordinateValue, MonolingualTextValue, QuantityValue, and TimeValue.
func (DataValue) MarshalJSON ¶
MarshalJSON implements json.Marshaler interface for DataValue.
JSON representation of Go values might be different (but equivalent) than what it was in the source dump.
func (*DataValue) UnmarshalJSON ¶
UnmarshalJSON implements json.Unmarshaler interface for DataValue.
It normalizes JSON representation to Go values.
type Editor ¶
type Editor struct { Identifier int64 `json:"identifier,omitempty"` IsAnonymous bool `json:"is_anonymous,omitempty"` IsBot bool `json:"is_bot,omitempty"` IsAdmin bool `json:"is_admin,omitempty"` IsPatroller bool `json:"is_patroller,omitempty"` HasAdvancedRights bool `json:"has_advanced_rights,omitempty"` Name string `json:"name,omitempty"` EditCount int64 `json:"edit_count,omitempty"` DateStarted *time.Time `json:"date_started,omitempty"` Groups []string `json:"groups,omitempty"` }
type Entity ¶
type Entity struct { ID string `json:"id"` PageID int64 `json:"pageid"` Namespace int `json:"ns"` Title string `json:"title"` Modified time.Time `json:"modified"` Type EntityType `json:"type"` DataType *DataType `json:"datatype,omitempty"` Labels map[string]LanguageValue `json:"labels,omitempty"` Descriptions map[string]LanguageValue `json:"descriptions,omitempty"` Aliases map[string][]LanguageValue `json:"aliases,omitempty"` Claims map[string][]Statement `json:"claims,omitempty"` SiteLinks map[string]SiteLink `json:"sitelinks,omitempty"` LastRevID int64 `json:"lastrevid"` }
Entity is a Wikidata entities JSON dump entity.
type EntityType ¶
type EntityType int
const ( Item EntityType = iota Property MediaInfo )
func (EntityType) MarshalJSON ¶
func (t EntityType) MarshalJSON() ([]byte, error)
func (*EntityType) UnmarshalJSON ¶
func (t *EntityType) UnmarshalJSON(b []byte) error
type ErrorValue ¶
type ErrorValue string
ErrorValue represents an error with the value.
When JSON representation contains an error, only the error is provided as a Go value because any other field might fail to parse.
type GlobeCoordinateValue ¶
type InLanguage ¶
type InLanguage struct {
Identifier string `json:"identifier"`
}
type LanguageValue ¶
type MaintenanceTags ¶
type MonolingualTextValue ¶
type Probability ¶
type ProcessConfig ¶
type ProcessConfig[T any] struct { URL string Path string Client *retryablehttp.Client DecompressionThreads int DecodingThreads int ItemsProcessingThreads int Process func(context.Context, T) errors.E Progress func(context.Context, x.Progress) FileType FileType Compression Compression }
ProcessConfig is a configuration for low-level Process function.
URL or Path, Process, FileType, and Compression are required. If URL is provided and Path does not already exist, Client is required, too.
If just URL is provided, but not Path, then Process downloads and processes the file at URL, but does not save it. If both URL and Path are provided, and the file at Path does not exist, then Process saves the file at Path while downloading and processing the file at URL. If the file at Path already exists, then Process just uses it as-is and does not download anything from URL.
Client should set User-Agent header with contact information, e.g.:
client := retryablehttp.NewClient() client.RequestLogHook = func(logger retryablehttp.Logger, req *http.Request, retry int) { req.Header.Set("User-Agent", "My bot (user@example.com)") }
type ProcessDumpConfig ¶
type ProcessDumpConfig struct { URL string Path string Client *retryablehttp.Client DecompressionThreads int DecodingThreads int ItemsProcessingThreads int Progress func(context.Context, x.Progress) }
ProcessDumpConfig is a configuration for high-level Process*Dump functions.
URL or Path is required. If URL is provided and Path does not already exist, Client is required, too.
Client should set User-Agent header with contact information, e.g.:
client := retryablehttp.NewClient() client.RequestLogHook = func(logger retryablehttp.Logger, req *http.Request, retry int) { req.Header.Set("User-Agent", "My bot (user@example.com)") }
type Protection ¶
type QuantityValue ¶
type Score ¶
type Score struct { Prediction bool `json:"prediction"` Probability Probability `json:"probability"` }
type Statement ¶
type Statement struct { ID string `json:"id"` Type StatementType `json:"type"` MainSnak Snak `json:"mainsnak"` Rank StatementRank `json:"rank"` Qualifiers map[string][]Snak `json:"qualifiers,omitempty"` QualifiersOrder []string `json:"qualifiers-order,omitempty"` //nolint:tagliatelle References []Reference `json:"references,omitempty"` }
type StatementRank ¶
type StatementRank int
const ( Preferred StatementRank = iota Normal Deprecated )
func (StatementRank) MarshalJSON ¶
func (r StatementRank) MarshalJSON() ([]byte, error)
func (*StatementRank) UnmarshalJSON ¶
func (r *StatementRank) UnmarshalJSON(b []byte) error
type StatementType ¶
type StatementType int
const (
StatementT StatementType = iota
)
func (StatementType) MarshalJSON ¶
func (t StatementType) MarshalJSON() ([]byte, error)
func (*StatementType) UnmarshalJSON ¶
func (t *StatementType) UnmarshalJSON(b []byte) error
type StringValue ¶
type StringValue string
type TimePrecision ¶
type TimePrecision int
const ( BillionYears TimePrecision = iota HoundredMillionYears TenMillionYears MillionYears HoundredMillenniums TenMillenniums Millennium Century Decade Year Month Day Hour Minute Second )
type TimeValue ¶
type TimeValue struct { Time time.Time `json:"time"` Precision TimePrecision `json:"precision"` Calendar CalendarModel `json:"calendar"` }
TimeValue represents a time value.
While Time is a regular time.Time struct with nanosecond precision, its real precision is given by Precision.
Note that Wikidata uses historical numbering, in which year 0 is undefined and 1 BCE is represented by -1, but time.Time uses astronomical numbering, in which 1 BCE is represented by 0.
func (TimeValue) MarshalJSON ¶
MarshalJSON implements json.Marshaler interface for TimeValue.
func (*TimeValue) UnmarshalJSON ¶
UnmarshalJSON implements json.Unmarshaler interface for TimeValue.
type Version ¶
type Version struct { Identifier int64 `json:"identifier"` Editor *Editor `json:"editor,omitempty"` Comment string `json:"comment,omitempty"` Tags []string `json:"tags,omitempty"` HasTagNeedsCitation bool `json:"has_tag_needs_citation,omitempty"` IsMinorEdit bool `json:"is_minor_edit,omitempty"` IsFlaggedStable bool `json:"is_flagged_stable,omitempty"` Scores *Scores `json:"scores,omitempty"` Size *Size `json:"size,omitempty"` NumberOfCharacters int64 `json:"number_of_characters,omitempty"` Event Event `json:"event"` IsBreakingMews bool `json:"is_breaking_news,omitempty"` NoIndex bool `json:"noindex,omitempty"` MaintenanceTags *MaintenanceTags `json:"maintenance_tags,omitempty"` }
type Visibility ¶
type WikiBaseEntityIDValue ¶
type WikiBaseEntityIDValue struct { Type WikiBaseEntityType `json:"entity-type"` //nolint:tagliatelle ID string `json:"id"` }
type WikiBaseEntityType ¶
type WikiBaseEntityType int
const ( ItemType WikiBaseEntityType = iota PropertyType LexemeType FormType SenseType EntitySchemaType )
func (WikiBaseEntityType) MarshalJSON ¶
func (t WikiBaseEntityType) MarshalJSON() ([]byte, error)
func (*WikiBaseEntityType) UnmarshalJSON ¶
func (t *WikiBaseEntityType) UnmarshalJSON(b []byte) error