From 9bfee6b9251f9515cd079fc493b63398cd46c2bb Mon Sep 17 00:00:00 2001
From: Link
Date: Sun, 23 Oct 2022 22:46:46 +0800
Subject: [PATCH] add cache-only mode && optimize cache index layout for huge
 caches

---
 colly.go        | 36 +++++++++++++++++++++++++++++++-------
 http_backend.go | 12 ++++++++----
 2 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/colly.go b/colly.go
index 97febe516..4eb246a19 100644
--- a/colly.go
+++ b/colly.go
@@ -112,6 +112,8 @@ type Collector struct {
 	// TraceHTTP enables capturing and reporting request performance for crawler tuning.
 	// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
 	TraceHTTP bool
+	// CacheOnly restricts the Collector to cached responses; requests missing from the cache fail with ErrCacheNotFound
+	CacheOnly bool
 	// Context is the context that will be used for HTTP requests. You can set this
 	// to support clean cancellation of scraping.
 	Context context.Context
@@ -228,6 +230,8 @@ var (
 	ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers")
 	// ErrQueueFull is the error returned when the queue is full
 	ErrQueueFull = errors.New("Queue MaxSize reached")
+	// ErrCacheNotFound is the error returned when CacheOnly is enabled and the response is not found in the cache
+	ErrCacheNotFound = errors.New("Cache Not Found")
 )
 
 var envMap = map[string]func(*Collector, string){
@@ -277,6 +281,9 @@
 	"USER_AGENT": func(c *Collector, val string) {
 		c.UserAgent = val
 	},
+	"CACHE_ONLY": func(c *Collector, val string) {
+		c.CacheOnly = isYesString(val)
+	},
 }
 
 var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())
@@ -443,6 +450,13 @@ func CheckHead() CollectorOption {
 	}
 }
 
+// CacheOnly instructs the Collector to serve responses from its cache only
+func CacheOnly() CollectorOption {
+	return func(c *Collector) {
+		c.CacheOnly = true
+	}
+}
+
 // Init initializes the Collector's private variables and sets default
 // configuration for the Collector
 func (c *Collector) Init() {
@@ -456,6 +470,7 @@ func (c *Collector) Init() {
 	jar, _ := cookiejar.New(nil)
 	c.backend.Init(jar)
 	c.backend.Client.CheckRedirect = c.checkRedirectFunc()
+	c.backend.CacheOnly = c.CacheOnly
 	c.wg = &sync.WaitGroup{}
 	c.lock = &sync.RWMutex{}
 	c.robotsMap = make(map[string]*robotstxt.RobotsData)
@@ -469,13 +484,14 @@
 // With an Http.Client that is provided by appengine/urlfetch
 // This function should be used when the scraper is run on
 // Google App Engine. Example:
-// func startScraper(w http.ResponseWriter, r *http.Request) {
-//   ctx := appengine.NewContext(r)
-//   c := colly.NewCollector()
-//   c.Appengine(ctx)
-//   ...
-//   c.Visit("https://google.ca")
-// }
+//
+//	func startScraper(w http.ResponseWriter, r *http.Request) {
+//		ctx := appengine.NewContext(r)
+//		c := colly.NewCollector()
+//		c.Appengine(ctx)
+//		...
+// c.Visit("https://google.ca") +// } func (c *Collector) Appengine(ctx context.Context) { client := urlfetch.Client(ctx) client.Jar = c.backend.Client.Jar @@ -972,6 +987,11 @@ func (c *Collector) SetClient(client *http.Client) { c.backend.Client = client } +// SetCacheOnly will only returned cached request as required +func (c *Collector) SetCacheOnly(val bool) { + c.backend.CacheOnly = val +} + // WithTransport allows you to set a custom http.RoundTripper (transport) func (c *Collector) WithTransport(transport http.RoundTripper) { c.backend.Client.Transport = transport diff --git a/http_backend.go b/http_backend.go index 926046b2d..d6db30e41 100644 --- a/http_backend.go +++ b/http_backend.go @@ -38,6 +38,7 @@ type httpBackend struct { LimitRules []*LimitRule Client *http.Client lock *sync.RWMutex + CacheOnly bool } type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header) bool @@ -46,8 +47,8 @@ type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header // Both DomainRegexp and DomainGlob can be used to specify // the included domains patterns, but at least one is required. // There can be two kind of limitations: -// - Parallelism: Set limit for the number of concurrent requests to matching domains -// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case) +// - Parallelism: Set limit for the number of concurrent requests to matching domains +// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case) type LimitRule struct { // DomainRegexp is a regular expression to match against domains DomainRegexp string @@ -130,12 +131,12 @@ func (h *httpBackend) GetMatchingRule(domain string) *LimitRule { } func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc, cacheDir string) (*Response, error) { - if cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" { + if (cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache") && !h.CacheOnly { return h.Do(request, bodySize, checkHeadersFunc) } sum := sha1.Sum([]byte(request.URL.String())) hash := hex.EncodeToString(sum[:]) - dir := path.Join(cacheDir, hash[:2]) + dir := path.Join(cacheDir, hash[39:40], hash[37:39]) filename := path.Join(dir, hash) if file, err := os.Open(filename); err == nil { resp := new(Response) @@ -146,6 +147,9 @@ func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFun return resp, err } } + if h.CacheOnly { + return nil, ErrCacheNotFound + } resp, err := h.Do(request, bodySize, checkHeadersFunc) if err != nil || resp.StatusCode >= 500 { return resp, err