bluesnake

package module
v0.0.0-...-b5dec86 Latest
Published: Oct 11, 2025 License: Apache-2.0 Imports: 48 Imported by: 0

README

Bluesnake

Download

Documentation

Index

Constants

const (
	ProxyURLKey key = iota
	CheckRevisitKey
)

ProxyURLKey is the context key for the request proxy address.

Variables

var (
	// ErrForbiddenDomain is the error thrown if visiting
	// a domain which is not allowed in AllowedDomains
	ErrForbiddenDomain = errors.New("Forbidden domain")
	// ErrMissingURL is the error type for missing URL errors
	ErrMissingURL = errors.New("Missing URL")
	// ErrMaxDepth is the error type for exceeding max depth
	ErrMaxDepth = errors.New("Max depth limit reached")
	// ErrForbiddenURL is the error thrown if visiting
	// a URL which is not allowed by DisallowedURLFilters
	ErrForbiddenURL = errors.New("ForbiddenURL")

	// ErrNoURLFiltersMatch is the error thrown if visiting
	// a URL which is not allowed by URLFilters
	ErrNoURLFiltersMatch = errors.New("No URLFilters match")
	// ErrRobotsTxtBlocked is the error type for robots.txt errors
	ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt")
	// ErrNoCookieJar is the error type for missing cookie jar
	ErrNoCookieJar = errors.New("Cookie jar is not available")
	// ErrNoPattern is the error type for LimitRules without patterns
	ErrNoPattern = errors.New("No pattern defined in LimitRule")
	// ErrEmptyProxyURL is the error type for empty Proxy URL list
	ErrEmptyProxyURL = errors.New("Proxy URL list is empty")
	// ErrAbortedAfterHeaders is the error returned when OnResponseHeaders aborts the transfer.
	ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers")
	// ErrAbortedBeforeRequest is the error returned when the request is aborted before it is sent.
	ErrAbortedBeforeRequest = errors.New("Aborted before Do Request")
	// ErrQueueFull is the error returned when the queue is full
	ErrQueueFull = errors.New("Queue MaxSize reached")
	// ErrMaxRequests is the error returned when exceeding max requests
	ErrMaxRequests = errors.New("Max Requests limit reached")
	// ErrRetryBodyUnseekable is the error when retry with not seekable body
	ErrRetryBodyUnseekable = errors.New("Retry Body Unseekable")
)
var ErrMockNotFound = errors.New("no mock response registered for URL")

ErrMockNotFound is returned when no mock is registered for a URL

Functions

func CloseGlobalRenderer

func CloseGlobalRenderer()

CloseGlobalRenderer closes the global renderer instance. It should be called when the application exits.

func ComputeContentHash

func ComputeContentHash(content []byte, algorithm string) (string, error)

ComputeContentHash computes a hash of the normalized content using the specified algorithm

func ComputeContentHashWithConfig

func ComputeContentHashWithConfig(html []byte, algorithm string, config *ContentHashConfig) (string, error)

ComputeContentHashWithConfig is a convenience function that normalizes content and computes its hash

func FetchSitemapURLs

func FetchSitemapURLs(sitemapURL string) ([]string, error)

FetchSitemapURLs fetches a specific sitemap URL and returns all discovered URLs. It handles both regular sitemaps and sitemap indexes automatically. Returns an empty slice if the sitemap cannot be fetched or parsed.

func NormalizeContent

func NormalizeContent(html []byte, config *ContentHashConfig) ([]byte, error)

NormalizeContent normalizes HTML content based on the provided configuration to make content hashing more reliable by removing dynamic elements

func SanitizeFileName

func SanitizeFileName(fileName string) string

SanitizeFileName replaces dangerous characters in a string so the return value can be used as a safe file name.

func SetFetch

func SetFetch(f func(URL string, options interface{}) ([]byte, error))

SetFetch changes the fetch closure.

func SetInterval

func SetInterval(time time.Duration)

SetInterval changes the time interval used in Index.get.

func TryDefaultSitemaps

func TryDefaultSitemaps(baseURL string) []string

TryDefaultSitemaps tries to fetch sitemaps from common default locations. It tries /sitemap.xml first, then /sitemap_index.xml. Returns all discovered URLs from available sitemaps (empty slice if none found). This function does not return errors - it returns empty slice if no sitemaps found.
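A minimal sketch of combining these helpers, assuming this package's documented signatures; the import path and target site are placeholders:

package main

import (
	"fmt"
	"log"

	bluesnake "example.com/bluesnake" // placeholder import path; use this module's actual path
)

func main() {
	// Try the default locations (/sitemap.xml, /sitemap_index.xml) first.
	urls := bluesnake.TryDefaultSitemaps("https://example.com")

	// Fall back to an explicit sitemap URL if nothing was found.
	if len(urls) == 0 {
		var err error
		urls, err = bluesnake.FetchSitemapURLs("https://example.com/sitemap.xml")
		if err != nil {
			log.Fatal(err)
		}
	}
	fmt.Printf("discovered %d sitemap URLs\n", len(urls))
}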

func UnmarshalHTML

func UnmarshalHTML(v interface{}, s *goquery.Selection, structMap map[string]string) error

UnmarshalHTML declaratively extracts text or attributes to a struct from HTML response using struct tags composed of css selectors. Allowed struct tags:

  • "selector" (required): CSS (goquery) selector of the desired data
  • "attr" (optional): Selects the matching element's attribute's value. Leave it blank or omit to get the text of the element.

Example struct declaration:

type Nested struct {
	String  string   `selector:"div > p"`
	Classes []string `selector:"li" attr:"class"`
	Struct  *Nested  `selector:"div > div"`
}

Supported types: struct, *struct, string, []string
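As an illustration, a hedged sketch of using such a struct through the HTMLElement.Unmarshal shorthand inside an OnHTML callback; the selectors, struct fields, and the collector variable c are made up for the example, and the usual fmt/log imports are assumed:

type product struct {
	Name  string   `selector:"h1.title"`
	Tags  []string `selector:"ul.tags > li"`
	Image string   `selector:"img.cover" attr:"src"`
}

c.OnHTML("div.product", func(e *bluesnake.HTMLElement) {
	var p product
	if err := e.Unmarshal(&p); err != nil {
		log.Println("unmarshal failed:", err)
		return
	}
	fmt.Printf("%+v\n", p)
})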

Types

type AlreadyVisitedError

type AlreadyVisitedError struct {
	// Destination is the URL that was attempted to be visited.
	// It might not match the URL passed to Visit if redirect
	// was followed.
	Destination *url.URL
}

AlreadyVisitedError is the error type for already visited URLs.

It is returned synchronously by Visit when the URL passed to Visit has already been visited.

When an already visited URL is encountered after following redirects, this error appears in the OnError callback and, if Async mode is not enabled, is also returned by Visit.

func (*AlreadyVisitedError) Error

func (e *AlreadyVisitedError) Error() string

Error implements error interface.

type Collector

type Collector struct {
	// UserAgent is the User-Agent string used by HTTP requests
	UserAgent string
	// Custom headers for the request
	Headers *http.Header
	// MaxDepth limits the recursion depth of visited URLs.
	// Set it to 0 for infinite recursion (default).
	MaxDepth int
	// AllowedDomains is a domain whitelist.
	// Leave it blank to allow any domains to be visited
	AllowedDomains []string
	// DisallowedDomains is a domain blacklist.
	DisallowedDomains []string
	// DisallowedURLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches to a URL the
	// request will be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters
	// Leave it blank to allow any URLs to be visited
	DisallowedURLFilters []*regexp.Regexp

	// URLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches to a URL the
	// request won't be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters
	// Leave it blank to allow any URLs to be visited
	URLFilters []*regexp.Regexp

	// AllowURLRevisit allows multiple downloads of the same URL
	AllowURLRevisit bool
	// MaxBodySize is the limit of the retrieved response body in bytes.
	// 0 means unlimited.
	// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
	MaxBodySize int
	// CacheDir specifies a location where GET requests are cached as files.
	// When it's not defined, caching is disabled.
	CacheDir string
	// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
	// the target host's robots.txt file.  See http://www.robotstxt.org/ for more
	// information.
	IgnoreRobotsTxt bool
	// Async turns on asynchronous network communication. Use Collector.Wait() to
	// be sure all requests have been finished.
	Async bool
	// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
	// By default, BlueSnake parses only successful HTTP responses. Set ParseHTTPErrorResponse
	// to true to enable it.
	ParseHTTPErrorResponse bool
	// ID is the unique identifier of a collector
	ID uint32
	// DetectCharset can enable character encoding detection for non-utf8 response bodies
	// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
	DetectCharset bool

	// CheckHead performs a HEAD request before every GET to pre-validate the response
	CheckHead bool
	// TraceHTTP enables capturing and reporting request performance for crawler tuning.
	// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
	TraceHTTP bool
	// Context is the context that will be used for HTTP requests. You can set this
	// to support clean cancellation of scraping.
	Context context.Context
	// MaxRequests limit the number of requests done by the instance.
	// Set it to 0 for infinite requests (default).
	MaxRequests uint32
	// EnableRendering enables JavaScript rendering using headless Chrome.
	// When set to true, pages will be rendered with chromedp before parsing.
	EnableRendering bool
	// EnableContentHash enables content-based duplicate detection
	EnableContentHash bool
	// ContentHashAlgorithm specifies the hash algorithm to use ("xxhash", "md5", "sha256")
	ContentHashAlgorithm string
	// ContentHashConfig contains detailed configuration for content hashing
	ContentHashConfig *ContentHashConfig

	// CacheExpiration sets the maximum age for cache files.
	// If a cached file is older than this duration, it will be ignored and refreshed.
	CacheExpiration time.Duration
	// contains filtered or unexported fields
}

Collector provides the scraper instance for a scraping job

func NewCollector

func NewCollector(config *CollectorConfig) *Collector

NewCollector creates a new Collector instance with the provided configuration. If config is nil, default configuration is used.
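A minimal sketch of building a Collector from a config, assuming a placeholder import path; the User-Agent, domain, and URL are illustrative only:

package main

import (
	"fmt"
	"log"

	bluesnake "example.com/bluesnake" // placeholder import path; use this module's actual path
)

func main() {
	// Start from the defaults and override only what is needed.
	cfg := bluesnake.NewDefaultConfig()
	cfg.UserAgent = "bluesnake-example/1.0"
	cfg.AllowedDomains = []string{"example.com"}
	cfg.MaxDepth = 2

	c := bluesnake.NewCollector(cfg)

	c.OnResponse(func(r *bluesnake.Response) {
		fmt.Println("visited", r.Request.URL, "status", r.StatusCode)
	})

	if err := c.Visit("https://example.com"); err != nil {
		log.Fatal(err)
	}
}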

func (*Collector) Appengine

func (c *Collector) Appengine(ctx context.Context)

Appengine will replace the Collector's backend http.Client with an http.Client provided by appengine/urlfetch. This function should be used when the scraper is run on Google App Engine. Example:

func startScraper(w http.ResponseWriter, r *http.Request) {
	ctx := appengine.NewContext(r)
	c := bluesnake.NewCollector(nil)
	c.Appengine(ctx)
	// ...
	c.Visit("https://google.ca")
}

func (*Collector) Clone

func (c *Collector) Clone() *Collector

Clone creates an exact copy of a Collector without callbacks. HTTP backend, robots.txt cache and cookie jar are shared between collectors.

func (*Collector) Cookies

func (c *Collector) Cookies(URL string) []*http.Cookie

Cookies returns the cookies to send in a request for the given URL.

func (*Collector) DisableCookies

func (c *Collector) DisableCookies()

DisableCookies turns off cookie handling

func (*Collector) HasPosted

func (c *Collector) HasPosted(URL string, requestData map[string]string) (bool, error)

HasPosted checks if the provided URL and requestData have already been visited. This method is useful for avoiding a re-visit of the same URL and POST body.

func (*Collector) HasVisited

func (c *Collector) HasVisited(URL string) (bool, error)

HasVisited checks if the provided URL has been visited

func (*Collector) Head

func (c *Collector) Head(URL string) error

Head starts a collector job by creating a HEAD request.

func (*Collector) Init

func (c *Collector) Init()

Init initializes the Collector's private variables and sets default configuration for the Collector

func (*Collector) IsCancelled

func (c *Collector) IsCancelled() bool

IsCancelled returns true if the collector's context is cancelled

func (*Collector) Limit

func (c *Collector) Limit(rule *LimitRule) error

Limit adds a new LimitRule to the collector

func (*Collector) Limits

func (c *Collector) Limits(rules []*LimitRule) error

Limits adds new LimitRules to the collector

func (*Collector) OnError

func (c *Collector) OnError(f ErrorCallback)

OnError registers a function. Function will be executed if an error occurs during the HTTP request.

func (*Collector) OnHTML

func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback)

OnHTML registers a function. Function will be executed on every HTML element matched by the GoQuery Selector parameter. GoQuery Selector is a selector used by https://github.com/PuerkitoBio/goquery
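For example, a hedged sketch of an OnHTML callback that extracts anchors and follows them (assuming c is a *Collector as above and the standard fmt import):

// Print every anchor found on the page and visit it.
c.OnHTML("a[href]", func(e *bluesnake.HTMLElement) {
	link := e.Attr("href")
	fmt.Printf("link %q -> %s\n", e.Text, link)
	// AbsoluteURL resolves relative links against the current page.
	e.Request.Visit(e.Request.AbsoluteURL(link))
})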

func (*Collector) OnHTMLDetach

func (c *Collector) OnHTMLDetach(goquerySelector string)

OnHTMLDetach deregisters a function. The function will not be executed after it is detached.

func (*Collector) OnRequest

func (c *Collector) OnRequest(f RequestCallback)

OnRequest registers a function. Function will be executed on every request made by the Collector

func (*Collector) OnRequestHeaders

func (c *Collector) OnRequestHeaders(f RequestCallback)

OnRequestHeaders registers a function. Function will be executed on every request made by the Collector before Request Do

func (*Collector) OnResponse

func (c *Collector) OnResponse(f ResponseCallback)

OnResponse registers a function. Function will be executed on every response

func (*Collector) OnResponseHeaders

func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback)

OnResponseHeaders registers a function. Function will be executed on every response when headers and status are already received, but body is not yet read.

Like in OnRequest, you can call Request.Abort to abort the transfer. This might be useful if, for example, you're following all hyperlinks, but want to avoid downloading files.

Be aware that using this will prevent HTTP/1.1 connection reuse, as the only way to abort a download is to immediately close the connection. HTTP/2 doesn't suffer from this problem, as it's possible to close a specific stream inside the connection.
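A sketch of that pattern, aborting transfers whose Content-Type is not HTML (assuming c is a *Collector and the standard strings import; the check itself is illustrative):

c.OnResponseHeaders(func(r *bluesnake.Response) {
	// Skip downloading bodies that are not HTML (e.g. large binaries).
	if ct := r.Headers.Get("Content-Type"); !strings.HasPrefix(ct, "text/html") {
		r.Request.Abort()
	}
})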

func (*Collector) OnScraped

func (c *Collector) OnScraped(f ScrapedCallback)

OnScraped registers a function that will be executed as the final part of the scraping, after OnHTML and OnXML have finished.

func (*Collector) OnXML

func (c *Collector) OnXML(xpathQuery string, f XMLCallback)

OnXML registers a function. Function will be executed on every XML element matched by the xpath Query parameter. xpath Query is used by https://github.com/antchfx/xmlquery

func (*Collector) OnXMLDetach

func (c *Collector) OnXMLDetach(xpathQuery string)

OnXMLDetach deregisters a function. The function will not be executed after it is detached.

func (*Collector) Post

func (c *Collector) Post(URL string, requestData map[string]string) error

Post starts a collector job by creating a POST request. Post also calls the previously provided callbacks

func (*Collector) PostMultipart

func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error

PostMultipart starts a collector job by creating a Multipart POST request with raw binary data. PostMultipart also calls the previously provided callbacks

func (*Collector) PostRaw

func (c *Collector) PostRaw(URL string, requestData []byte) error

PostRaw starts a collector job by creating a POST request with raw binary data. PostRaw also calls the previously provided callbacks

func (*Collector) Request

func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error

Request starts a collector job by creating a custom HTTP request where method, context, headers and request data can be specified. Set requestData, ctx, hdr parameters to nil if you don't want to use them. Valid methods:

  • "GET"
  • "HEAD"
  • "POST"
  • "PUT"
  • "DELETE"
  • "PATCH"
  • "OPTIONS"

func (*Collector) SetClient

func (c *Collector) SetClient(client *http.Client)

SetClient will override the previously set http.Client

func (*Collector) SetCookieJar

func (c *Collector) SetCookieJar(j http.CookieJar)

SetCookieJar overrides the previously set cookie jar

func (*Collector) SetCookies

func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error

SetCookies handles the receipt of the cookies in a reply for the given URL

func (*Collector) SetDebugger

func (c *Collector) SetDebugger(d debug.Debugger)

SetDebugger attaches a debugger to the collector

func (*Collector) SetProxy

func (c *Collector) SetProxy(proxyURL string) error

SetProxy sets a proxy for the collector. This method overrides the previously used http.Transport if the type of the transport is not *http.Transport. The proxy type is determined by the URL scheme. "http" and "socks5" are supported. If the scheme is empty, "http" is assumed.
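For instance, a hedged one-liner routing all collector traffic through a local SOCKS5 proxy (the address is a placeholder; assumes c is a *Collector and the log import):

if err := c.SetProxy("socks5://127.0.0.1:1080"); err != nil {
	log.Fatal(err)
}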

func (*Collector) SetProxyFunc

func (c *Collector) SetProxyFunc(p ProxyFunc)

SetProxyFunc sets a custom proxy setter/switcher function. See built-in ProxyFuncs for more details. This method overrides the previously used http.Transport if the type of the transport is not *http.Transport. The proxy type is determined by the URL scheme. "http" and "socks5" are supported. If the scheme is empty, "http" is assumed.

func (*Collector) SetRedirectHandler

func (c *Collector) SetRedirectHandler(f func(req *http.Request, via []*http.Request) error)

SetRedirectHandler sets a custom handler that is invoked when the Collector follows HTTP redirects.

func (*Collector) SetRequestTimeout

func (c *Collector) SetRequestTimeout(timeout time.Duration)

SetRequestTimeout overrides the default timeout (10 seconds) for this collector

func (*Collector) SetStorage

func (c *Collector) SetStorage(s storage.Storage) error

SetStorage overrides the default in-memory storage. Storage stores scraping-related data like cookies and visited URLs

func (*Collector) String

func (c *Collector) String() string

String is the text representation of the collector. It contains useful debug information about the collector's internals

func (*Collector) UnmarshalRequest

func (c *Collector) UnmarshalRequest(r []byte) (*Request, error)

UnmarshalRequest creates a Request from serialized data

func (*Collector) Visit

func (c *Collector) Visit(URL string) error

Visit starts Collector's collecting job by creating a request to the URL specified in parameter. Visit also calls the previously provided callbacks

func (*Collector) Wait

func (c *Collector) Wait()

Wait returns when the collector jobs are finished

func (*Collector) WithTransport

func (c *Collector) WithTransport(transport http.RoundTripper)

WithTransport allows you to set a custom http.RoundTripper (transport)

type CollectorConfig

type CollectorConfig struct {
	// UserAgent is the User-Agent string used by HTTP requests
	UserAgent string
	// Headers contains custom headers for HTTP requests
	Headers map[string]string
	// MaxDepth limits the recursion depth of visited URLs.
	// Set it to 0 for infinite recursion (default).
	MaxDepth int
	// AllowedDomains is a domain whitelist.
	// Leave it blank to allow any domains to be visited
	AllowedDomains []string
	// DisallowedDomains is a domain blacklist.
	DisallowedDomains []string
	// DisallowedURLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches to a URL the
	// request will be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters
	DisallowedURLFilters []*regexp.Regexp
	// URLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches to a URL the
	// request won't be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters
	URLFilters []*regexp.Regexp
	// AllowURLRevisit allows multiple downloads of the same URL
	AllowURLRevisit bool
	// MaxBodySize is the limit of the retrieved response body in bytes.
	// 0 means unlimited.
	// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
	MaxBodySize int
	// CacheDir specifies a location where GET requests are cached as files.
	// When it's not defined, caching is disabled.
	CacheDir string
	// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
	// the target host's robots.txt file.
	IgnoreRobotsTxt bool
	// Async turns on asynchronous network communication.
	Async bool
	// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
	ParseHTTPErrorResponse bool
	// ID is the unique identifier of a collector (auto-assigned if 0)
	ID uint32
	// DetectCharset can enable character encoding detection for non-utf8 response bodies
	DetectCharset bool
	// CheckHead performs a HEAD request before every GET to pre-validate the response
	CheckHead bool
	// TraceHTTP enables capturing and reporting request performance.
	TraceHTTP bool
	// Context is the context that will be used for HTTP requests.
	Context context.Context
	// MaxRequests limit the number of requests done by the instance.
	// Set it to 0 for infinite requests (default).
	MaxRequests uint32
	// EnableRendering enables JavaScript rendering using headless Chrome.
	EnableRendering bool
	// CacheExpiration sets the maximum age for cache files.
	CacheExpiration time.Duration
	// Debugger is the debugger instance to use
	Debugger debug.Debugger
	// DiscoveryMechanisms specifies which mechanisms to use for URL discovery.
	// Can be any combination: ["spider"], ["sitemap"], or ["spider", "sitemap"].
	// Default is ["spider"].
	DiscoveryMechanisms []DiscoveryMechanism
	// SitemapURLs specifies custom sitemap URLs to fetch (optional).
	// If nil/empty when sitemap discovery is enabled, tries default locations
	// (/sitemap.xml, /sitemap_index.xml).
	SitemapURLs []string
	// EnableContentHash enables content-based duplicate detection
	// When true, pages with identical content will be detected even if URLs differ
	EnableContentHash bool
	// ContentHashAlgorithm specifies the hash algorithm to use
	// Options: "xxhash" (fastest, default), "md5", "sha256"
	ContentHashAlgorithm string
	// ContentHashConfig contains detailed configuration for content hashing
	ContentHashConfig *ContentHashConfig
}

CollectorConfig contains all configuration options for a Collector

func NewDefaultConfig

func NewDefaultConfig() *CollectorConfig

NewDefaultConfig returns a CollectorConfig with sensible defaults

type ContentHashConfig

type ContentHashConfig struct {
	// ExcludeTags specifies HTML tags to exclude from content hashing
	// Default: ["script", "style", "nav", "footer"]
	ExcludeTags []string
	// IncludeOnlyTags specifies to only include specific tags in content hashing
	// If empty, all content (minus ExcludeTags) is included
	// Example: ["article", "main"] to focus only on main content
	IncludeOnlyTags []string
	// StripTimestamps removes timestamp patterns from content before hashing
	StripTimestamps bool
	// StripAnalytics removes analytics and tracking code from content
	StripAnalytics bool
	// StripComments removes HTML comments from content
	StripComments bool
	// CollapseWhitespace normalizes whitespace (multiple spaces/newlines to single)
	CollapseWhitespace bool
}

ContentHashConfig contains configuration for content-based duplicate detection
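A sketch of enabling content-based duplicate detection through CollectorConfig using the fields documented above; all values are illustrative:

cfg := bluesnake.NewDefaultConfig()
cfg.EnableContentHash = true
cfg.ContentHashAlgorithm = "xxhash" // or "md5", "sha256"
cfg.ContentHashConfig = &bluesnake.ContentHashConfig{
	ExcludeTags:        []string{"script", "style", "nav", "footer"},
	StripTimestamps:    true,
	StripComments:      true,
	CollapseWhitespace: true,
}
c := bluesnake.NewCollector(cfg)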

type Context

type Context struct {
	// contains filtered or unexported fields
}

Context provides a tiny layer for passing data between callbacks
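As a sketch, a value stored in an OnRequest callback can be read back in OnResponse through the shared Ctx (the key name is arbitrary; assumes c is a *Collector and the fmt import):

c.OnRequest(func(r *bluesnake.Request) {
	// Stash a value on the request's context...
	r.Ctx.Put("label", "landing-page")
})

c.OnResponse(func(r *bluesnake.Response) {
	// ...and read it back when the response arrives.
	fmt.Println("label:", r.Ctx.Get("label"))
})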

func NewContext

func NewContext() *Context

NewContext initializes a new Context instance

func (*Context) Clone

func (c *Context) Clone() *Context

Clone clones context

func (*Context) ForEach

func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{}

ForEach iterates over the values stored in the Context, calling fn for each key-value pair and collecting the results.

func (*Context) Get

func (c *Context) Get(key string) string

Get retrieves a string value from Context. Get returns an empty string if key not found

func (*Context) GetAny

func (c *Context) GetAny(key string) interface{}

GetAny retrieves a value from Context. GetAny returns nil if key not found

func (*Context) MarshalBinary

func (c *Context) MarshalBinary() (_ []byte, _ error)

MarshalBinary encodes the Context value. This function is used by request caching.

func (*Context) Put

func (c *Context) Put(key string, value interface{})

Put stores a value of any type in Context

func (*Context) UnmarshalBinary

func (c *Context) UnmarshalBinary(_ []byte) error

UnmarshalBinary decodes the Context value to nil. This function is used by request caching.

type Crawler

type Crawler struct {
	// Collector is the underlying low-level collector (exported for advanced configuration)
	Collector *Collector
	// contains filtered or unexported fields
}

Crawler provides a high-level interface for web crawling with callbacks for page results

func NewCrawler

func NewCrawler(config *CollectorConfig) *Crawler

NewCrawler creates a high-level crawler with the specified collector configuration. The returned crawler must have its callbacks set via SetOnPageCrawled and SetOnCrawlComplete before calling Start. If config is nil, default configuration is used.
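A minimal end-to-end sketch of the high-level Crawler, assuming a placeholder import path; the starting URL is illustrative:

package main

import (
	"fmt"
	"log"

	bluesnake "example.com/bluesnake" // placeholder import path; use this module's actual path
)

func main() {
	cr := bluesnake.NewCrawler(nil) // nil uses the default configuration

	cr.SetOnPageCrawled(func(p *bluesnake.PageResult) {
		if p.Error != "" {
			log.Printf("failed %s: %s", p.URL, p.Error)
			return
		}
		fmt.Printf("%d %s %q\n", p.Status, p.URL, p.Title)
	})

	cr.SetOnCrawlComplete(func(wasStopped bool, totalPages, totalDiscovered int) {
		fmt.Printf("done (stopped=%v): %d pages crawled, %d URLs discovered\n",
			wasStopped, totalPages, totalDiscovered)
	})

	if err := cr.Start("https://example.com"); err != nil {
		log.Fatal(err)
	}
	cr.Wait() // only needed when the underlying collector runs in Async mode
}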

func (*Crawler) SetOnCrawlComplete

func (cr *Crawler) SetOnCrawlComplete(f OnCrawlCompleteFunc)

SetOnCrawlComplete registers a callback function that will be called when the crawl finishes. This callback receives summary statistics about the completed crawl.

func (*Crawler) SetOnPageCrawled

func (cr *Crawler) SetOnPageCrawled(f OnPageCrawledFunc)

SetOnPageCrawled registers a callback function that will be called after each page is crawled. This callback receives complete page information including discovered URLs.

func (*Crawler) Start

func (cr *Crawler) Start(url string) error

Start begins crawling from the specified starting URL. It returns immediately if the crawler is in Async mode, or blocks until completion otherwise.

func (*Crawler) Wait

func (cr *Crawler) Wait()

Wait blocks until all crawling operations complete. This is primarily useful when the crawler is in Async mode.

type DiscoveryMechanism

type DiscoveryMechanism string

DiscoveryMechanism specifies how URLs are discovered during crawling

const (
	// DiscoverySpider discovers URLs by following links in HTML pages
	DiscoverySpider DiscoveryMechanism = "spider"
	// DiscoverySitemap discovers URLs from sitemap.xml files
	DiscoverySitemap DiscoveryMechanism = "sitemap"
)

type ErrorCallback

type ErrorCallback func(*Response, error)

ErrorCallback is a type alias for OnError callback functions

type HTMLCallback

type HTMLCallback func(*HTMLElement)

HTMLCallback is a type alias for OnHTML callback functions

type HTMLElement

type HTMLElement struct {
	// Name is the name of the tag
	Name string
	Text string

	// Request is the request object of the element's HTML document
	Request *Request
	// Response is the Response object of the element's HTML document
	Response *Response
	// DOM is the goquery parsed DOM object of the page. DOM is relative
	// to the current HTMLElement
	DOM *goquery.Selection
	// Index stores the position of the current element within all the elements matched by an OnHTML callback
	Index int
	// contains filtered or unexported fields
}

HTMLElement is the representation of an HTML tag.

func NewHTMLElementFromSelectionNode

func NewHTMLElementFromSelectionNode(resp *Response, s *goquery.Selection, n *html.Node, idx int) *HTMLElement

NewHTMLElementFromSelectionNode creates an HTMLElement from a goquery.Selection Node.

func (*HTMLElement) Attr

func (h *HTMLElement) Attr(k string) string

Attr returns the selected attribute of an HTMLElement, or an empty string if the attribute is not found.

func (*HTMLElement) ChildAttr

func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string

ChildAttr returns the stripped text content of the first matching element's attribute.

func (*HTMLElement) ChildAttrs

func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string

ChildAttrs returns the stripped text content of all the matching element's attributes.

func (*HTMLElement) ChildText

func (h *HTMLElement) ChildText(goquerySelector string) string

ChildText returns the concatenated and stripped text content of the matching elements.

func (*HTMLElement) ChildTexts

func (h *HTMLElement) ChildTexts(goquerySelector string) []string

ChildTexts returns the stripped text content of all the matching elements.

func (*HTMLElement) ForEach

func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement))

ForEach iterates over the elements matched by the first argument and calls the callback function on every HTMLElement match.

func (*HTMLElement) ForEachWithBreak

func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool)

ForEachWithBreak iterates over the elements matched by the first argument and calls the callback function on every HTMLElement match. It is identical to ForEach except that the loop can be broken by returning false from the callback function.

func (*HTMLElement) Unmarshal

func (h *HTMLElement) Unmarshal(v interface{}) error

Unmarshal is a shorthand for bluesnake.UnmarshalHTML

func (*HTMLElement) UnmarshalWithMap

func (h *HTMLElement) UnmarshalWithMap(v interface{}, structMap map[string]string) error

UnmarshalWithMap is a shorthand for bluesnake.UnmarshalHTML, extended to allow maps to be passed in.

type HTTPTrace

type HTTPTrace struct {
	ConnectDuration   time.Duration
	FirstByteDuration time.Duration
	// contains filtered or unexported fields
}

HTTPTrace provides a data structure for storing an HTTP trace.

func (*HTTPTrace) WithTrace

func (ht *HTTPTrace) WithTrace(req *http.Request) *http.Request

WithTrace returns the given HTTP Request with this HTTPTrace added to its context.

type Index

type Index struct {
	XMLName xml.Name `xml:"sitemapindex"`
	Sitemap []parts  `xml:"sitemap"`
}

Index is a structure of <sitemapindex>

func ParseIndex

func ParseIndex(data []byte) (Index, error)

ParseIndex creates Index data from text.

func ReadSitemapIndex

func ReadSitemapIndex(path string) (Index, error)

ReadSitemapIndex reads a file and returns an Index structure.

type LimitRule

type LimitRule struct {
	// DomainRegexp is a regular expression to match against domains
	DomainRegexp string
	// DomainGlob is a glob pattern to match against domains
	DomainGlob string
	// Delay is the duration to wait before creating a new request to the matching domains
	Delay time.Duration
	// RandomDelay is the extra randomized duration to wait added to Delay before creating a new request
	RandomDelay time.Duration
	// Parallelism is the number of the maximum allowed concurrent requests of the matching domains
	Parallelism int
	// contains filtered or unexported fields
}

LimitRule provides connection restrictions for domains. Both DomainRegexp and DomainGlob can be used to specify the included domain patterns, but at least one is required. There can be two kinds of limitations:

  • Parallelism: Set limit for the number of concurrent requests to matching domains
  • Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
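A hedged sketch of attaching a LimitRule to a collector; the glob and timings are illustrative, and the time and log imports plus a *Collector c are assumed:

// Allow at most 2 parallel requests to example.com hosts,
// with up to 1s of extra random delay between them.
err := c.Limit(&bluesnake.LimitRule{
	DomainGlob:  "*example.com*",
	Parallelism: 2,
	RandomDelay: 1 * time.Second,
})
if err != nil {
	log.Fatal(err)
}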

func (*LimitRule) Init

func (r *LimitRule) Init() error

Init initializes the private members of LimitRule

func (*LimitRule) Match

func (r *LimitRule) Match(domain string) bool

Match checks that the domain parameter triggers the rule

type Link

type Link struct {
	// URL is the target URL
	URL string `json:"url"`
	// Type is the link type: "anchor", "image", "script", "stylesheet", "iframe", "canonical", "video", "audio"
	Type string `json:"type"`
	// Text is the anchor text, alt text, or empty for other link types
	Text string `json:"text"`
	// Context is the surrounding text context where the link appears
	Context string `json:"context,omitempty"`
	// IsInternal indicates if this link points to the same domain/subdomain
	IsInternal bool `json:"isInternal"`
	// Status is the HTTP status code if this URL has been crawled (200, 404, 301, etc.)
	Status *int `json:"status,omitempty"`
	// Title is the page title if this URL has been crawled
	Title string `json:"title,omitempty"`
	// ContentType is the MIME type if this URL has been crawled
	ContentType string `json:"contentType,omitempty"`
	// Position indicates the semantic location of the link on the page
	// Values: "content", "navigation", "header", "footer", "sidebar", "breadcrumbs", "pagination", "unknown"
	Position string `json:"position,omitempty"`
	// DOMPath is a simplified DOM path showing the link's location in the HTML structure
	// Example: "body > main > article > p > a"
	DOMPath string `json:"domPath,omitempty"`
}

Link represents a single outbound link discovered on a page

type Links

type Links struct {
	// Internal links point to same domain/subdomain
	Internal []Link `json:"internal"`
	// External links point to different domains
	External []Link `json:"external"`
}

Links contains outbound links from a page

type MockResponse

type MockResponse struct {
	// StatusCode is the HTTP status code to return (default: 200)
	StatusCode int
	// Body is the response body content (used if BodyFunc is nil)
	Body string
	// BodyFunc is a function that generates the body dynamically based on the request
	// If set, this takes precedence over Body
	BodyFunc func(*http.Request) string
	// Headers are the HTTP headers to include in the response
	Headers http.Header
	// Delay simulates network latency before returning the response
	Delay time.Duration
	// Error simulates a network error
	Error error
}

MockResponse represents a mock HTTP response

type MockTransport

type MockTransport struct {
	// contains filtered or unexported fields
}

MockTransport implements http.RoundTripper for testing purposes. It allows you to register mock responses for specific URLs or URL patterns without needing to run an actual HTTP server.
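A hedged sketch of wiring a MockTransport into a Collector for a test, using the registration helpers documented below; the URLs and markup are made up, and the errors/fmt imports are assumed:

mt := bluesnake.NewMockTransport()
mt.RegisterHTML("https://example.com/", `<html><body><a href="/next">next</a></body></html>`)
mt.RegisterError("https://example.com/broken", errors.New("connection refused"))

c := bluesnake.NewCollector(nil)
c.WithTransport(mt) // requests now hit the mock instead of the network

c.OnHTML("a[href]", func(e *bluesnake.HTMLElement) {
	fmt.Println("found link:", e.Attr("href"))
})
c.Visit("https://example.com/")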

func NewMockTransport

func NewMockTransport() *MockTransport

NewMockTransport creates a new MockTransport instance

func (*MockTransport) RegisterError

func (m *MockTransport) RegisterError(url string, err error)

RegisterError registers a mock error for a URL (simulates network failure)

func (*MockTransport) RegisterHTML

func (m *MockTransport) RegisterHTML(url, html string)

RegisterHTML is a convenience method to register an HTML response with status 200

func (*MockTransport) RegisterJSON

func (m *MockTransport) RegisterJSON(url, json string)

RegisterJSON is a convenience method to register a JSON response with status 200

func (*MockTransport) RegisterPattern

func (m *MockTransport) RegisterPattern(pattern string, response *MockResponse) error

RegisterPattern registers a mock response for URLs matching a regex pattern

func (*MockTransport) RegisterResponse

func (m *MockTransport) RegisterResponse(url string, response *MockResponse)

RegisterResponse registers a mock response for an exact URL match

func (*MockTransport) Reset

func (m *MockTransport) Reset()

Reset clears all registered responses and patterns

func (*MockTransport) RoundTrip

func (m *MockTransport) RoundTrip(req *http.Request) (*http.Response, error)

RoundTrip implements the http.RoundTripper interface

func (*MockTransport) SetFallback

func (m *MockTransport) SetFallback(fallback http.RoundTripper)

SetFallback sets a fallback RoundTripper to use when no mock is registered for a URL. This is useful for testing scenarios where you want to mock some URLs but allow real HTTP requests for others.

type OnCrawlCompleteFunc

type OnCrawlCompleteFunc func(wasStopped bool, totalPages int, totalDiscovered int)

OnCrawlCompleteFunc is called when the entire crawl finishes, either naturally or due to cancellation. Parameters:

  • wasStopped: true if the crawl was stopped via context cancellation, false if it completed naturally
  • totalPages: total number of pages that were successfully crawled (excludes errors)
  • totalDiscovered: total number of unique URLs discovered during the crawl

type OnPageCrawledFunc

type OnPageCrawledFunc func(*PageResult)

OnPageCrawledFunc is called after each individual page is successfully crawled or encounters an error. It receives the complete result of crawling that page including all discovered URLs.

type PageMetadata

type PageMetadata struct {
	Status      int
	Title       string
	ContentType string
}

PageMetadata stores cached metadata for crawled pages

type PageResult

type PageResult struct {
	// URL is the URL that was crawled
	URL string
	// Status is the HTTP status code (e.g., 200, 404, 500)
	Status int
	// Title is the page title extracted from the <title> tag (for HTML pages)
	Title string
	// MetaDescription is the content of the <meta name="description"> tag
	MetaDescription string
	// Indexable indicates if search engines can index this page
	// Values: "Yes", "No", or "-" for non-HTML resources
	Indexable string
	// ContentType is the Content-Type header value (e.g., "text/html", "application/json")
	ContentType string
	// Error contains any error message if the crawl failed, empty otherwise
	Error string
	// Links contains all outbound links from this page (internal and external)
	Links *Links
	// ContentHash is the hash of the normalized page content (empty if content hashing is disabled)
	ContentHash string
	// IsDuplicateContent indicates if this content hash has been seen before on a different URL
	IsDuplicateContent bool
	// contains filtered or unexported fields
}

PageResult contains all data collected from a single crawled page

func (*PageResult) GetHTML

func (pr *PageResult) GetHTML() string

GetHTML returns the full HTML content of the page. Returns empty string if the response is not available.

func (*PageResult) GetTextContent

func (pr *PageResult) GetTextContent() string

GetTextContent returns text from the main content area only (excluding navigation, headers, footers). Extracts text from semantic HTML5 elements like <article>, <main>, or [role="main"]. Returns empty string if the response is not available or is not HTML.

func (*PageResult) GetTextFull

func (pr *PageResult) GetTextFull() string

GetTextFull returns all visible text from the entire page (including navigation, headers, footers). HTML tags are stripped, leaving only the text content. Returns empty string if the response is not available or is not HTML.

type ProxyFunc

type ProxyFunc func(*http.Request) (*url.URL, error)

ProxyFunc is a type alias for proxy setter functions.

type Request

type Request struct {
	// URL is the parsed URL of the HTTP request
	URL *url.URL
	// Headers contains the Request's HTTP headers
	Headers *http.Header
	// the Host header
	Host string
	// Ctx is a context between a Request and a Response
	Ctx *Context
	// Depth is the number of the parents of the request
	Depth int
	// Method is the HTTP method of the request
	Method string
	// Body is the request body which is used on POST/PUT requests
	Body io.Reader
	// ResponseCharacterEncoding is the character encoding of the response body.
	// Leave it blank to allow automatic character encoding of the response body.
	// It is empty by default and it can be set in OnRequest callback.
	ResponseCharacterEncoding string
	// ID is the Unique identifier of the request
	ID uint32

	// ProxyURL is the proxy address that handles the request
	ProxyURL string
	// contains filtered or unexported fields
}

Request is the representation of an HTTP request made by a Collector

func (*Request) Abort

func (r *Request) Abort()

Abort cancels the HTTP request when called in an OnRequest callback

func (*Request) AbsoluteURL

func (r *Request) AbsoluteURL(u string) string

AbsoluteURL returns the resolved absolute URL of a URL chunk. AbsoluteURL returns an empty string if the URL chunk is a fragment or could not be parsed.

func (*Request) Do

func (r *Request) Do() error

Do submits the request

func (*Request) HasVisited

func (r *Request) HasVisited(URL string) (bool, error)

HasVisited checks if the provided URL has been visited

func (*Request) IsAbort

func (r *Request) IsAbort() bool

IsAbort returns true if the request has been aborted

func (*Request) Marshal

func (r *Request) Marshal() ([]byte, error)

Marshal serializes the Request

func (*Request) New

func (r *Request) New(method, URL string, body io.Reader) (*Request, error)

New creates a new request with the context of the original request

func (*Request) Post

func (r *Request) Post(URL string, requestData map[string]string) error

Post continues a collector job by creating a POST request and preserves the Context of the previous request. Post also calls the previously provided callbacks

func (*Request) PostMultipart

func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error

PostMultipart starts a collector job by creating a Multipart POST request with raw binary data. PostMultipart also calls the previously provided callbacks

func (*Request) PostRaw

func (r *Request) PostRaw(URL string, requestData []byte) error

PostRaw starts a collector job by creating a POST request with raw binary data. PostRaw preserves the Context of the previous request and calls the previously provided callbacks

func (*Request) Retry

func (r *Request) Retry() error

Retry submits HTTP request again with the same parameters

func (*Request) Visit

func (r *Request) Visit(URL string) error

Visit continues Collector's collecting job by creating a request and preserves the Context of the previous request. Visit also calls the previously provided callbacks

type RequestCallback

type RequestCallback func(*Request)

RequestCallback is a type alias for OnRequest callback functions

type Response

type Response struct {
	// StatusCode is the status code of the Response
	StatusCode int
	// Body is the content of the Response
	Body []byte
	// Ctx is a context between a Request and a Response
	Ctx *Context
	// Request is the Request object of the response
	Request *Request
	// Headers contains the Response's HTTP headers
	Headers *http.Header
	// Trace contains the HTTPTrace for the request. Will only be set by the
	// collector if Collector.TraceHTTP is set to true.
	Trace *HTTPTrace
}

Response is the representation of an HTTP response made by a Collector

func (*Response) FileName

func (r *Response) FileName() string

FileName returns the sanitized file name parsed from "Content-Disposition" header or from URL

func (*Response) Save

func (r *Response) Save(fileName string) error

Save writes response body to disk

type ResponseCallback

type ResponseCallback func(*Response)

ResponseCallback is a type alias for OnResponse callback functions

type ResponseHeadersCallback

type ResponseHeadersCallback func(*Response)

ResponseHeadersCallback is a type alias for OnResponseHeaders callback functions

type ScrapedCallback

type ScrapedCallback func(*Response)

ScrapedCallback is a type alias for OnScraped callback functions

type Sitemap

type Sitemap struct {
	XMLName xml.Name `xml:"urlset"`
	URL     []URL    `xml:"url"`
}

Sitemap is a structure of <urlset>

func ForceGet

func ForceGet(URL string, options interface{}) (Sitemap, error)

ForceGet fetches and parses sitemap.xml/sitemapindex.xml. The difference from the Get function is that it ignores some errors.

Errors to ignore:

  • When sitemapindex.xml contains a sitemap.xml URL that cannot be retrieved.
  • When sitemapindex.xml contains a sitemap.xml that is empty.
  • When sitemapindex.xml contains a sitemap.xml that has format problems.

Errors not to ignore:

  • When sitemap.xml/sitemapindex.xml could not be retrieved.
  • When sitemap.xml/sitemapindex.xml is empty.
  • When sitemap.xml/sitemapindex.xml has format problems.

If you do not want to ignore these errors, use the Get function.

func Get

func Get(URL string, options interface{}) (Sitemap, error)

Get fetches and parses sitemap.xml/sitemapindex.xml.

If sitemap.xml or sitemapindex.xml has problems, this function returns an error:

  • When sitemap.xml/sitemapindex.xml could not be retrieved.
  • When sitemap.xml/sitemapindex.xml is empty.
  • When sitemap.xml/sitemapindex.xml has format problems.
  • When sitemapindex.xml contains a sitemap.xml URL that cannot be retrieved.
  • When sitemapindex.xml contains a sitemap.xml that is empty.
  • When sitemapindex.xml contains a sitemap.xml that has format problems.

If you want to ignore these errors, use the ForceGet function.
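A small sketch of fetching a sitemap with Get and falling back to ForceGet when the index is partially broken; the URL is illustrative and the fmt/log imports are assumed:

sm, err := bluesnake.Get("https://example.com/sitemap.xml", nil)
if err != nil {
	// Tolerate partially broken sitemap indexes.
	sm, err = bluesnake.ForceGet("https://example.com/sitemap.xml", nil)
	if err != nil {
		log.Fatal(err)
	}
}
for _, u := range sm.URL {
	fmt.Println(u.Loc, u.LastMod)
}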

func Parse

func Parse(data []byte) (Sitemap, error)

Parse creates Sitemap data from text.

func ReadSitemap

func ReadSitemap(path string) (Sitemap, error)

ReadSitemap is a function that reads a file and returns a Sitemap structure.

type URL

type URL struct {
	Loc        string  `xml:"loc"`
	LastMod    string  `xml:"lastmod"`
	ChangeFreq string  `xml:"changefreq"`
	Priority   float32 `xml:"priority"`
}

URL is a structure of <url> in <sitemap>

type XMLCallback

type XMLCallback func(*XMLElement)

XMLCallback is a type alias for OnXML callback functions

type XMLElement

type XMLElement struct {
	// Name is the name of the tag
	Name string
	Text string

	// Request is the request object of the element's HTML document
	Request *Request
	// Response is the Response object of the element's HTML document
	Response *Response
	// DOM is the DOM object of the page. DOM is relative
	// to the current XMLElement and is either a html.Node or xmlquery.Node
	// based on how the XMLElement was created.
	DOM interface{}

	// Index stores the position of the current element within all the elements matched by an OnXML callback
	Index int
	// contains filtered or unexported fields
}

XMLElement is the representation of an XML tag.

func NewXMLElementFromHTMLNode

func NewXMLElementFromHTMLNode(resp *Response, s *html.Node) *XMLElement

NewXMLElementFromHTMLNode creates an XMLElement from an html.Node.

func NewXMLElementFromXMLNode

func NewXMLElementFromXMLNode(resp *Response, s *xmlquery.Node) *XMLElement

NewXMLElementFromXMLNode creates an XMLElement from an xmlquery.Node.

func (*XMLElement) Attr

func (h *XMLElement) Attr(k string) string

Attr returns the selected attribute of an XMLElement, or an empty string if the attribute is not found.

func (*XMLElement) ChildAttr

func (h *XMLElement) ChildAttr(xpathQuery, attrName string) string

ChildAttr returns the stripped text content of the first matching element's attribute.

func (*XMLElement) ChildAttrs

func (h *XMLElement) ChildAttrs(xpathQuery, attrName string) []string

ChildAttrs returns the stripped text content of all the matching element's attributes.

func (*XMLElement) ChildText

func (h *XMLElement) ChildText(xpathQuery string) string

ChildText returns the concatenated and stripped text content of the matching elements.

func (*XMLElement) ChildTexts

func (h *XMLElement) ChildTexts(xpathQuery string) []string

ChildTexts returns an array of strings corresponding to child elements that match the xpath query. Each item in the array is the stripped text content of the corresponding matching child element.

Directories

Path	Synopsis
cmd
    desktop	command
    server	command
extensions	Package extensions implements various helper addons for BlueSnake
internal
    app
