scrapemateapp

package

Version: v1.6.3 (latest) | Published: Feb 28, 2025 | License: MIT | Imports: 18 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/EmreKaplaner/scrapemate

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func DisableImages() func(*jsOptions)
func Headfull() func(*jsOptions)
func WithBrowserReuseLimit(limit int) func(*Config) error
func WithCache(cacheType, cachePath string) func(*Config) error
func WithConcurrency(concurrency int) func(*Config) error
func WithExitOnInactivity(duration time.Duration) func(*Config) error
func WithInitJob(job scrapemate.IJob) func(*Config) error
func WithJS(opts ...func(*jsOptions)) func(*Config) error
func WithPageReuseLimit(limit int) func(*Config) error
func WithProvider(provider scrapemate.JobProvider) func(*Config) error
func WithProxies(proxies []string) func(*Config) error
func WithStealth(browser string) func(*Config) error
func WithUA(ua string) func(*jsOptions)
type Config
- func NewConfig(writers []scrapemate.ResultWriter, options ...func(*Config) error) (*Config, error)
type RetryConfig
type ScrapemateApp
- func NewScrapeMateApp(cfg *Config, externalFetcher scrapemate.HTTPFetcher) (*ScrapemateApp, error)
- func (app *ScrapemateApp) Close() error
- func (app *ScrapemateApp) IsHealthy() (bool, error)
- func (app *ScrapemateApp) Start(ctx context.Context, seedJobs ...scrapemate.IJob) error

Constants ¶

View Source

// Default values for Config settings that are left unset.
const (
	// DefaultConcurrency is the default number of concurrent scrapers
	// (Config.Concurrency is documented as defaulting to 1).
	DefaultConcurrency = 1
	// DefaultProvider names the default job provider
	// (Config.Provider is documented as falling back to the memory provider).
	DefaultProvider    = "memory"
)

Variables ¶

View Source

// DefaultRetryConfig provides reasonable defaults for retries.
var DefaultRetryConfig = RetryConfig{
	MaxAttempts:     3,
	InitialInterval: 100 * time.Millisecond,
	MaxInterval:     2 * time.Second,
	// NOTE(review): a Multiplier of 2.0 suggests the interval doubles between
	// attempts, bounded by MaxInterval — confirm against the retry implementation.
	Multiplier:      2.0,
}

DefaultRetryConfig provides reasonable defaults for retries.

Functions ¶

func DisableImages ¶

func DisableImages() func(*jsOptions)

DisableImages is a helper function to disable images in the browser. Use it as a parameter to WithJS.

func Headfull ¶

func Headfull() func(*jsOptions)

Headfull is a helper function to run the browser in headful (non-headless) mode. Use it as a parameter to WithJS.

func WithBrowserReuseLimit ¶

func WithBrowserReuseLimit(limit int) func(*Config) error

WithBrowserReuseLimit sets the browser reuse limit of the app.

func WithCache ¶

func WithCache(cacheType, cachePath string) func(*Config) error

WithCache sets the cache type and path of the app.

func WithConcurrency ¶

func WithConcurrency(concurrency int) func(*Config) error

WithConcurrency sets the concurrency of the app.

func WithExitOnInactivity ¶

func WithExitOnInactivity(duration time.Duration) func(*Config) error

WithExitOnInactivity sets the duration after which the app will exit if there are no more jobs to run.

func WithInitJob ¶

func WithInitJob(job scrapemate.IJob) func(*Config) error

WithInitJob sets the initial job of the app.

func WithJS ¶

func WithJS(opts ...func(*jsOptions)) func(*Config) error

WithJS sets the app to use JavaScript to render the pages.

func WithPageReuseLimit ¶

func WithPageReuseLimit(limit int) func(*Config) error

WithPageReuseLimit sets the page reuse limit of the app.

func WithProvider ¶

func WithProvider(provider scrapemate.JobProvider) func(*Config) error

WithProvider sets the provider of the app.

func WithProxies ¶

func WithProxies(proxies []string) func(*Config) error

WithProxies sets the proxies of the app.

func WithStealth ¶

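func WithStealth(browser string) func(*Config) error

WithStealth sets the app to use stealth mode; browser is the browser to use for stealth mode (see Config.UseStealth and Config.StealthBrowser).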

func WithUA ¶

func WithUA(ua string) func(*jsOptions)

WithUA sets the user agent of the browser. Use it as a parameter to WithJS.

Types ¶

type Config ¶

// Config holds the settings used to build a ScrapemateApp via NewScrapeMateApp.
// Create one with NewConfig, which applies default values.
type Config struct {
	// Concurrency is the number of concurrent scrapers to run.
	// If not set, it defaults to 1.
	Concurrency int `validate:"required,gte=1"`

	// CacheType is the type of cache to use for storing scraped data.
	// If left empty, no caching is used.
	// Otherwise it must be one of file or leveldb.
	CacheType string `validate:"omitempty,oneof=file leveldb"`
	// CachePath is the path to the cache file or directory.
	// It is required when CacheType is set.
	CachePath string `validate:"required_with=CacheType"`

	// UseJS is whether to use JavaScript to render the page.
	UseJS bool `validate:"omitempty"`
	// UseStealth is whether to use stealth mode to scrape the page.
	// Stealth mode uses a special HTTP client to scrape the page.
	UseStealth bool `validate:"omitempty"`
	// StealthBrowser is the browser to use for stealth mode.
	StealthBrowser string `validate:"omitempty"`
	// JSOpts are the options for the JavaScript renderer.
	JSOpts jsOptions

	// Provider is the job provider to use.
	// If not set, the memory provider will be used.
	Provider scrapemate.JobProvider

	// Writers are the writers to use for writing the results.
	// At least one writer must be provided.
	Writers []scrapemate.ResultWriter `validate:"required,gt=0"`
	// InitJob is the job to initialize the app with.
	InitJob scrapemate.IJob
	// ExitOnInactivityDuration is the duration after which the app exits
	// if there are no more jobs to run.
	ExitOnInactivityDuration time.Duration
	// Proxies are the proxies to use for the app.
	Proxies []string
	// BrowserReuseLimit is the limit of browser reuse.
	// Only applicable when using the JavaScript renderer.
	// By default it is 0, which means the browser will be reused indefinitely.
	BrowserReuseLimit int
	// PageReuseLimit is the limit of page reuse.
	// Only applicable when using the JavaScript renderer.
	// By default it is 0, which means the page will not be reused.
	PageReuseLimit int
}

func NewConfig ¶

func NewConfig(writers []scrapemate.ResultWriter, options ...func(*Config) error) (*Config, error)

NewConfig creates a new config with default values.

type RetryConfig ¶ added in v1.6.1

// RetryConfig holds configuration for retry operations.
// See DefaultRetryConfig for the default values.
type RetryConfig struct {
	// MaxAttempts is presumably the maximum number of attempts
	// (3 in DefaultRetryConfig) — TODO confirm against the retry loop.
	MaxAttempts     int
	// InitialInterval is presumably the wait before the first retry
	// (100ms in DefaultRetryConfig) — TODO confirm.
	InitialInterval time.Duration
	// MaxInterval is presumably the upper bound on the wait between retries
	// (2s in DefaultRetryConfig) — TODO confirm.
	MaxInterval     time.Duration
	// Multiplier is presumably the factor applied to the interval after each
	// attempt (2.0 in DefaultRetryConfig) — TODO confirm.
	Multiplier      float64
}

RetryConfig holds configuration for retry operations.

type ScrapemateApp ¶

// ScrapemateApp is the application type returned by NewScrapeMateApp.
// Its exported methods are Start, Close and IsHealthy.
type ScrapemateApp struct {
	// contains filtered or unexported fields
}

func NewScrapeMateApp ¶

func NewScrapeMateApp(cfg *Config, externalFetcher scrapemate.HTTPFetcher) (*ScrapemateApp, error)

NewScrapeMateApp creates a new ScrapemateApp, optionally accepting an external fetcher.

func (*ScrapemateApp) Close ¶

func (app *ScrapemateApp) Close() error

Close closes the app.

func (*ScrapemateApp) IsHealthy ¶ added in v1.6.1

func (app *ScrapemateApp) IsHealthy() (bool, error)

IsHealthy returns the current health status of the app.

func (*ScrapemateApp) Start ¶

func (app *ScrapemateApp) Start(ctx context.Context, seedJobs ...scrapemate.IJob) error

Start starts the app.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL