generate

package
v0.5.38 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 12, 2024 License: GPL-3.0 Imports: 25 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ConfigurationsForAllSubpages

func ConfigurationsForAllSubpages(opts ConfigOptions, pageConfigs map[string]*scrape.Config, gqdocsByURL map[string]*goquery.Document) (map[string]*scrape.Config, map[string]*goquery.Document, error)

func ConfigurationsForGQDocument

func ConfigurationsForGQDocument(opts ConfigOptions, gqdoc *goquery.Document, minOcc int, gqdocsByURL map[string]*goquery.Document) (map[string]*scrape.Config, map[string]*goquery.Document, error)

func ConfigurationsForPage

func ConfigurationsForPage(opts ConfigOptions, gqdocsByURL map[string]*goquery.Document) (map[string]*scrape.Config, map[string]*goquery.Document, error)

func ConfigurationsForPageWithMinOccurrences

func ConfigurationsForPageWithMinOccurrences(opts ConfigOptions, gqdoc *goquery.Document, gqdocsByURL map[string]*goquery.Document) (map[string]*scrape.Config, map[string]*goquery.Document, error)

func ConfigurationsForSubpages

func ConfigurationsForSubpages(opts ConfigOptions, pjs []*pageJoin, gqdocsByURL map[string]*goquery.Document) (map[string]*scrape.Config, map[string]*goquery.Document, error)

ConfigurationsForSubpages collects the URL values for a candidate subpage field, retrieves the pages at those URLs, and concatenates them. It then trains a scraper to extract from those subpages, merges the resulting ItemMap into the parent page, and outputs the result.

func ExtendPageConfigsWithNexts

func ExtendPageConfigsWithNexts(opts ConfigOptions, pageConfigs map[string]*scrape.Config, gqdocsByURL map[string]*goquery.Document) error

Types

type Analyzer

// Analyzer contains all the necessary config parameters and structs needed to
// analyze the webpage. Its exported methods are Parse and ParseToken.
//
// NOTE(review): only the field declarations are visible here, so the notes
// below are assumptions to confirm against the implementation:
//   - Tokenizer is presumably the HTML token stream that Parse/ParseToken
//     consume (ParseToken takes an html.TokenType).
//   - LocMan, PagMan and NextPaths all share the unexported locationManager
//     type; judging by the names they look like general, pagination and
//     "next page" locations respectively — TODO confirm.
//   - NodePath, Depth, InBody and FindNext look like traversal state and
//     flags maintained while parsing — TODO confirm.
type Analyzer struct {
	Tokenizer   *html.Tokenizer
	LocMan      locationManager
	PagMan      locationManager
	NextPaths   locationManager
	NumChildren map[string]int    // the number of children a node (represented by a path) has, including non-html-tag nodes (i.e. text)
	ChildNodes  map[string][]node // the children of the node at the specified nodePath; used for :nth-child() logic
	NodePath    path
	Depth       int
	InBody      bool
	FindNext    bool
}

Analyzer contains all the necessary config parameters and structs needed to analyze the webpage.

func (*Analyzer) Parse

func (a *Analyzer) Parse()

func (*Analyzer) ParseToken

func (a *Analyzer) ParseToken(tt html.TokenType) bool

type ConfigOptions

// ConfigOptions is the option set passed by value to the package's
// ConfigurationsFor* functions and to ExtendPageConfigsWithNexts. InitOpts
// accepts a ConfigOptions and returns a ConfigOptions together with an error.
//
// NOTE(review): the meaning of individual fields is not documented in this
// view; the readings below are inferred from names only and must be
// confirmed against the implementation:
//   - CacheInputDir, CacheOutputDir, ConfigOutputDir and WordsDir are
//     presumably filesystem directories.
//   - MinOccs is presumably a list of minimum-occurrence thresholds (compare
//     the minOcc parameter of ConfigurationsForGQDocument).
//   - DoSubpages, Offline, OnlyVarying, RenderJS and Batch are boolean
//     switches whose exact effect is not visible here.
//   - URL is presumably the page to generate configurations for.
type ConfigOptions struct {
	Batch           bool
	CacheInputDir   string
	CacheOutputDir  string
	ConfigOutputDir string
	DoSubpages      bool
	MinOccs         []int
	ModelName       string
	Offline         bool
	OnlyVarying     bool
	RenderJS        bool
	URL             string
	WordsDir        string
	// contains filtered or unexported fields
}

func InitOpts

func InitOpts(opts ConfigOptions) (ConfigOptions, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL