crawler

package
v0.0.11
Published: Jul 31, 2025 License: Apache-2.0 Imports: 14 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Callback

type Callback func(ctx context.Context, result *Result)

Callback is called with the result of crawling a page, including the fetched response and the parsed result (if any).

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

Crawler is used to crawl the web.

func New

func New(opts Options) (*Crawler, error)

New creates a new crawler.

func (*Crawler) AddFetcherRules

func (c *Crawler) AddFetcherRules(rules ...*FetcherRule) error

AddFetcherRules adds new fetcher rules to the crawler. The rules will be re-sorted by priority after adding.

func (*Crawler) AddParserRules

func (c *Crawler) AddParserRules(rule ...*ParserRule) error

AddParserRules adds new parser rules to the crawler. The rules will be re-sorted by priority after adding.

func (*Crawler) Crawl

func (c *Crawler) Crawl(ctx context.Context, urls []string, callback Callback) error

Crawl the provided URLs and call the callback for each processed page. Links may be followed depending on the configured follow behavior.

func (*Crawler) GetStats

func (c *Crawler) GetStats() *CrawlerStats

GetStats returns the current crawling statistics.

func (*Crawler) Stop

func (c *Crawler) Stop()

Stop stops the crawler.

type CrawlerStats

type CrawlerStats struct {
	// contains filtered or unexported fields
}

CrawlerStats tracks crawling statistics. All methods are thread-safe.

func (*CrawlerStats) GetFailed

func (s *CrawlerStats) GetFailed() int64

GetFailed returns the number of URLs that failed to process.

func (*CrawlerStats) GetProcessed

func (s *CrawlerStats) GetProcessed() int64

GetProcessed returns the number of URLs processed.

func (*CrawlerStats) GetSucceeded

func (s *CrawlerStats) GetSucceeded() int64

GetSucceeded returns the number of URLs successfully processed.

func (*CrawlerStats) IncrementFailed

func (s *CrawlerStats) IncrementFailed()

IncrementFailed atomically increments the failed counter.

func (*CrawlerStats) IncrementProcessed

func (s *CrawlerStats) IncrementProcessed()

IncrementProcessed atomically increments the processed counter.

func (*CrawlerStats) IncrementSucceeded

func (s *CrawlerStats) IncrementSucceeded()

IncrementSucceeded atomically increments the succeeded counter.
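
The increment methods above are typically backed by lock-free atomic counters. The sketch below is not the package's implementation — the stats type and run helper are invented for the demo — but it illustrates the same thread-safe counting pattern using sync/atomic:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// stats mirrors the shape of CrawlerStats: independent counters that many
// worker goroutines can update concurrently without a mutex.
type stats struct {
	processed, succeeded, failed atomic.Int64
}

// run simulates n workers, each recording one processed URL that either
// succeeded or failed (here, every fourth URL "fails").
func run(n int) *stats {
	var s stats
	var wg sync.WaitGroup
	for i := 0; i < n; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			s.processed.Add(1)
			if i%4 == 0 {
				s.failed.Add(1)
			} else {
				s.succeeded.Add(1)
			}
		}(i)
	}
	wg.Wait()
	return &s
}

func main() {
	s := run(100)
	// Counts are exact despite concurrent updates.
	fmt.Println(s.processed.Load(), s.succeeded.Load(), s.failed.Load()) // 100 75 25
}
```

Because each counter is independent, a reader may observe processed ahead of succeeded+failed mid-crawl; only after all workers finish do the totals reconcile.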

type FetcherRule

type FetcherRule struct {
	MatchRule
	Fetcher fetch.Fetcher // The fetcher to use for matching domains
}

FetcherRule defines a rule for matching domains to fetchers.

func NewFetcherRule

func NewFetcherRule(pattern string, fetcher fetch.Fetcher, opts ...FetcherRuleOption) *FetcherRule

NewFetcherRule creates a new fetcher rule with the given pattern and fetcher. By default, it uses exact matching with priority 0. Use functional options to customize behavior.

Example:

rule := NewFetcherRule("example.com", fetcher, WithFetcherPriority(10))
rule := NewFetcherRule("*.example.com", fetcher, WithFetcherMatchType(MatchGlob), WithFetcherPriority(5))

type FetcherRuleOption

type FetcherRuleOption func(*FetcherRule)

FetcherRuleOption defines a function that modifies a FetcherRule.

func WithFetcherMatchType

func WithFetcherMatchType(matchType MatchType) FetcherRuleOption

WithFetcherMatchType sets the match type for a fetcher rule.

func WithFetcherPriority

func WithFetcherPriority(priority int) FetcherRuleOption

WithFetcherPriority sets the priority for a fetcher rule.

type FollowBehavior

type FollowBehavior string

FollowBehavior is used to determine how to follow links.

const (
	FollowAny               FollowBehavior = "any"
	FollowSameDomain        FollowBehavior = "same-domain"
	FollowRelatedSubdomains FollowBehavior = "related-subdomains"
	FollowNone              FollowBehavior = "none"
)

type MatchRule

type MatchRule struct {
	Pattern  string    // The pattern to match against
	Type     MatchType // The type of matching to perform
	Priority int       // Priority for rule evaluation (higher = first)
	// contains filtered or unexported fields
}

MatchRule defines the core matching logic that can be used by different rule types.

func (*MatchRule) Compile

func (r *MatchRule) Compile() error

Compile compiles regex patterns for the match rule if needed.

func (*MatchRule) Matches

func (r *MatchRule) Matches(value string) bool

Matches checks if the given value matches the rule.

type MatchType

type MatchType string

MatchType defines the type of pattern matching for rules.

const (
	MatchExact  MatchType = "exact"  // Exact domain match
	MatchRegex  MatchType = "regex"  // Regular expression match
	MatchSuffix MatchType = "suffix" // Domain suffix match (e.g., ".com")
	MatchPrefix MatchType = "prefix" // Domain prefix match (e.g., "blog.")
	MatchGlob   MatchType = "glob"   // Glob pattern match (e.g., "*.example.com")
)
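
Each match type corresponds to a familiar string-matching primitive. The sketch below shows one plausible standard-library mapping; the matches helper is invented for illustration, and the package's own Matches method may differ in detail (for example, in how it handles invalid patterns):

```go
package main

import (
	"fmt"
	"path"
	"regexp"
	"strings"
)

// matches maps each MatchType value to a stdlib primitive:
// exact -> ==, suffix/prefix -> strings, glob -> path.Match, regex -> regexp.
func matches(matchType, pattern, host string) bool {
	switch matchType {
	case "exact":
		return host == pattern
	case "suffix":
		return strings.HasSuffix(host, pattern)
	case "prefix":
		return strings.HasPrefix(host, pattern)
	case "glob":
		ok, _ := path.Match(pattern, host) // hostnames contain no '/', so '*' spans one label run
		return ok
	case "regex":
		ok, _ := regexp.MatchString(pattern, host)
		return ok
	}
	return false
}

func main() {
	fmt.Println(matches("glob", "*.example.com", "blog.example.com")) // true
	fmt.Println(matches("suffix", ".com", "example.com"))             // true
	fmt.Println(matches("regex", `^docs\.`, "docs.example.com"))      // true
}
```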

type MockParser

type MockParser struct {
	mock.Mock
	// contains filtered or unexported fields
}

MockParser implements the Parser interface for testing.

func NewMockParser

func NewMockParser() *MockParser

func (*MockParser) Parse

func (m *MockParser) Parse(ctx context.Context, page *fetch.Response) (any, error)

func (*MockParser) SetParseFunc

func (m *MockParser) SetParseFunc(fn func(ctx context.Context, page *fetch.Response) (any, error))

type Options

type Options struct {
	MaxURLs              int
	Workers              int
	Cache                cache.Cache
	RequestDelay         time.Duration
	KnownURLs            []string
	ParserRules          []*ParserRule
	DefaultParser        Parser
	FetcherRules         []*FetcherRule
	DefaultFetcher       fetch.Fetcher
	FollowBehavior       FollowBehavior
	Logger               *slog.Logger
	ShowProgress         bool
	ShowProgressInterval time.Duration
	QueueSize            int
}

Options used to configure a crawler.

type Parser

type Parser interface {
	Parse(ctx context.Context, page *fetch.Response) (any, error)
}

Parser is an interface describing a webpage parser. It accepts the fetched page and returns a parsed object.

type ParserRule

type ParserRule struct {
	MatchRule
	Parser Parser // The parser to use for matching domains
}

ParserRule defines a rule for matching domains to parsers.

func NewParserRule

func NewParserRule(pattern string, parser Parser, opts ...ParserRuleOption) *ParserRule

NewParserRule creates a new parser rule with the given pattern and parser. By default, it uses exact matching with priority 0. Use functional options to customize behavior.

Example:

rule := NewParserRule("example.com", parser, WithParserPriority(10))
rule := NewParserRule("*.example.com", parser, WithParserMatchType(MatchGlob), WithParserPriority(5))
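
Because rules are re-sorted by priority (higher evaluated first), a specific high-priority rule can shadow a broader low-priority one. The sketch below mimics that selection order with sort.SliceStable; the rule type and pick helper are invented for the demo and do not reflect the package's internals:

```go
package main

import (
	"fmt"
	"sort"
	"strings"
)

// rule pairs a suffix pattern with a priority, mimicking MatchRule's
// "higher priority = evaluated first" ordering.
type rule struct {
	pattern  string
	priority int
}

// pick sorts rules by descending priority and returns the first whose
// suffix matches the host.
func pick(rules []rule, host string) (rule, bool) {
	sort.SliceStable(rules, func(i, j int) bool {
		return rules[i].priority > rules[j].priority
	})
	for _, r := range rules {
		if strings.HasSuffix(host, r.pattern) {
			return r, true
		}
	}
	return rule{}, false
}

func main() {
	rules := []rule{
		{pattern: ".com", priority: 0},          // broad fallback
		{pattern: ".example.com", priority: 10}, // specific override
	}
	r, _ := pick(rules, "blog.example.com")
	fmt.Println(r.pattern) // .example.com
}
```

With equal priorities a stable sort preserves registration order, which is one reasonable tie-breaking choice; the package does not document its own.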

type ParserRuleOption

type ParserRuleOption func(*ParserRule)

ParserRuleOption defines a function that modifies a ParserRule.

func WithParserMatchType

func WithParserMatchType(matchType MatchType) ParserRuleOption

WithParserMatchType sets the match type for a parser rule.

func WithParserPriority

func WithParserPriority(priority int) ParserRuleOption

WithParserPriority sets the priority for a parser rule.

type Result

type Result struct {
	URL      *url.URL
	Parsed   any
	Links    []string
	Response *fetch.Response
	Error    error
}

Result represents the result of one page being crawled.
