Documentation
¶
Index ¶
- Constants
- Variables
- func DisableImages() func(*jsOptions)
- func Headfull() func(*jsOptions)
- func WithBrowserReuseLimit(limit int) func(*Config) error
- func WithCache(cacheType, cachePath string) func(*Config) error
- func WithConcurrency(concurrency int) func(*Config) error
- func WithExitOnInactivity(duration time.Duration) func(*Config) error
- func WithInitJob(job scrapemate.IJob) func(*Config) error
- func WithJS(opts ...func(*jsOptions)) func(*Config) error
- func WithPageReuseLimit(limit int) func(*Config) error
- func WithProvider(provider scrapemate.JobProvider) func(*Config) error
- func WithProxies(proxies []string) func(*Config) error
- func WithStealth(browser string) func(*Config) error
- func WithUA(ua string) func(*jsOptions)
- type Config
- type RetryConfig
- type ScrapemateApp
Constants ¶
const ( DefaultConcurrency = 1 DefaultProvider = "memory" )
Variables ¶
var DefaultRetryConfig = RetryConfig{ MaxAttempts: 3, InitialInterval: 100 * time.Millisecond, MaxInterval: 2 * time.Second, Multiplier: 2.0, }
DefaultRetryConfig provides reasonable defaults for retries.
Functions ¶
func DisableImages ¶
func DisableImages() func(*jsOptions)
DisableImages is a helper function to disable images in the browser. Use it as a parameter to WithJS.
func Headfull ¶
func Headfull() func(*jsOptions)
Headfull is a helper function to run the browser in headful (non-headless) mode. Use it as a parameter to WithJS.
func WithBrowserReuseLimit ¶
func WithBrowserReuseLimit(limit int) func(*Config) error
WithBrowserReuseLimit sets the browser reuse limit of the app.
func WithConcurrency ¶
func WithConcurrency(concurrency int) func(*Config) error
WithConcurrency sets the concurrency of the app.
func WithExitOnInactivity ¶
func WithExitOnInactivity(duration time.Duration) func(*Config) error
WithExitOnInactivity sets the duration after which the app will exit if there are no more jobs to run.
func WithInitJob ¶
func WithInitJob(job scrapemate.IJob) func(*Config) error
WithInitJob sets the initial job of the app.
func WithPageReuseLimit ¶
func WithPageReuseLimit(limit int) func(*Config) error
WithPageReuseLimit sets the page reuse limit of the app.
func WithProvider ¶
func WithProvider(provider scrapemate.JobProvider) func(*Config) error
WithProvider sets the provider of the app.
func WithProxies ¶
func WithProxies(proxies []string) func(*Config) error
WithProxies sets the proxies of the app.
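func WithStealth ¶
func WithStealth(browser string) func(*Config) error
WithStealth configures the app to use stealth mode with the given browser.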
Types ¶
type Config ¶
type Config struct { // Concurrency is the number of concurrent scrapers to run. // If not set, it defaults to 1. Concurrency int `validate:"required,gte=1"` // Cache is the cache to use for storing scraped data. // If left empty then no caching will be used. // Otherwise the CacheType must be one of file or leveldb. CacheType string `validate:"omitempty,oneof=file leveldb"` // CachePath is the path to the cache file or directory. // It is required to be a valid path if CacheType is set. CachePath string `validate:"required_with=CacheType"` // UseJS is whether to use JavaScript to render the page. UseJS bool `validate:"omitempty"` // UseStealth is whether to use stealth mode to scrape the page. // uses a special http client to scrape the page. UseStealth bool `validate:"omitempty"` // StealthBrowser is the browser to use for stealth mode. StealthBrowser string `validate:"omitempty"` // JSOpts are the options for the JavaScript renderer. JSOpts jsOptions // ProviderType is the type of provider to use. // It is required to be a valid type if Provider is set. // If not set the memory provider will be used. Provider scrapemate.JobProvider // Writers are the writers to use for writing the results. // At least one writer must be provided. Writers []scrapemate.ResultWriter `validate:"required,gt=0"` // InitJob is the job to initialize the app with. InitJob scrapemate.IJob // ExitOnInactivityDuration is whether to exit the app when there are no more jobs to run. ExitOnInactivityDuration time.Duration // Proxies are the proxies to use for the app. Proxies []string // BrowserReuseLimit is the limit of browser reuse. // Only applicable when using JavaScript renderer. // By default it is 0, which means the browser will be reused indefinitely. BrowserReuseLimit int // PageReuseLimit is the limit of page reuse. // Only applicable when using JavaScript renderer. // By default it is 0, which means the page will not be reused. PageReuseLimit int }
func NewConfig ¶
func NewConfig(writers []scrapemate.ResultWriter, options ...func(*Config) error) (*Config, error)
NewConfig creates a new config with default values.
type RetryConfig ¶ added in v1.6.1
type RetryConfig struct { MaxAttempts int InitialInterval time.Duration MaxInterval time.Duration Multiplier float64 }
RetryConfig holds configuration for retry operations.
type ScrapemateApp ¶
type ScrapemateApp struct {
// contains filtered or unexported fields
}
func NewScrapeMateApp ¶
func NewScrapeMateApp(cfg *Config, externalFetcher scrapemate.HTTPFetcher) (*ScrapemateApp, error)
NewScrapeMateApp creates a new ScrapemateApp, optionally accepting an external fetcher.
func (*ScrapemateApp) IsHealthy ¶ added in v1.6.1
func (app *ScrapemateApp) IsHealthy() (bool, error)
IsHealthy returns the current health status of the app.
func (*ScrapemateApp) Start ¶
func (app *ScrapemateApp) Start(ctx context.Context, seedJobs ...scrapemate.IJob) error
Start starts the app.