Documentation
¶
Index ¶
- func ConfigurationsForAllSubpages(opts ConfigOptions, pageConfigs map[string]*scrape.Config, ...) (map[string]*scrape.Config, map[string]*goquery.Document, error)
- func ConfigurationsForGQDocument(opts ConfigOptions, gqdoc *goquery.Document, minOcc int, ...) (map[string]*scrape.Config, map[string]*goquery.Document, error)
- func ConfigurationsForPage(opts ConfigOptions, gqdocsByURL map[string]*goquery.Document) (map[string]*scrape.Config, map[string]*goquery.Document, error)
- func ConfigurationsForPageWithMinOccurrences(opts ConfigOptions, gqdoc *goquery.Document, ...) (map[string]*scrape.Config, map[string]*goquery.Document, error)
- func ConfigurationsForSubpages(opts ConfigOptions, pjs []*pageJoin, gqdocsByURL map[string]*goquery.Document) (map[string]*scrape.Config, map[string]*goquery.Document, error)
- func ExtendPageConfigsWithNexts(opts ConfigOptions, pageConfigs map[string]*scrape.Config, ...) error
- type Analyzer
- type ConfigOptions
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ConfigurationsForPage ¶
func ConfigurationsForSubpages ¶
func ConfigurationsForSubpages(opts ConfigOptions, pjs []*pageJoin, gqdocsByURL map[string]*goquery.Document) (map[string]*scrape.Config, map[string]*goquery.Document, error)
ConfigurationsForSubpages collects the URL values for a candidate subpage field, retrieves the pages at those URLs, concatenates them, trains a scraper to extract from those subpages, and merges the resulting ItemMap into the parent page, outputting the result.
Types ¶
type Analyzer ¶
type Analyzer struct {
	Tokenizer   *html.Tokenizer
	LocMan      locationManager
	PagMan      locationManager
	NextPaths   locationManager
	NumChildren map[string]int    // the number of children a node (represented by a path) has, including non-html-tag nodes (i.e. text)
	ChildNodes  map[string][]node // the children of the node at the specified nodePath; used for :nth-child() logic
	NodePath    path
	Depth       int
	InBody      bool
	FindNext    bool
}
Analyzer contains the config parameters and structs needed to analyze the webpage.
type ConfigOptions ¶
type ConfigOptions struct {
	Batch           bool
	CacheInputDir   string
	CacheOutputDir  string
	ConfigOutputDir string
	DoSubpages      bool
	MinOccs         []int
	ModelName       string
	Offline         bool
	OnlyVarying     bool
	RenderJS        bool
	URL             string
	WordsDir        string
	// contains filtered or unexported fields
}
func InitOpts ¶
func InitOpts(opts ConfigOptions) (ConfigOptions, error)
Click to show internal directories.
Click to hide internal directories.