From 9bfee6b9251f9515cd079fc493b63398cd46c2bb Mon Sep 17 00:00:00 2001
From: Link
Date: Sun, 23 Oct 2022 22:46:46 +0800
Subject: [PATCH] add cache-only mode && optimize cache index layout for huge
 caches

---
 colly.go        | 36 +++++++++++++++++++++++++++++++-------
 http_backend.go | 12 ++++++++----
 2 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/colly.go b/colly.go
index 97febe516..4eb246a19 100644
--- a/colly.go
+++ b/colly.go
@@ -112,6 +112,8 @@ type Collector struct {
 	// TraceHTTP enables capturing and reporting request performance for crawler tuning.
 	// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
 	TraceHTTP bool
+	// CacheOnly restricts the Collector to cached responses; requests missing from the cache fail with ErrCacheNotFound
+	CacheOnly bool
 	// Context is the context that will be used for HTTP requests. You can set this
 	// to support clean cancellation of scraping.
 	Context context.Context
@@ -228,6 +230,8 @@ var (
 	ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers")
 	// ErrQueueFull is the error returned when the queue is full
 	ErrQueueFull = errors.New("Queue MaxSize reached")
+	// ErrCacheNotFound is the error returned when CacheOnly is enabled and the response is not found in the cache
+	ErrCacheNotFound = errors.New("Cache Not Found")
 )
 
 var envMap = map[string]func(*Collector, string){
@@ -277,6 +281,9 @@
 	"USER_AGENT": func(c *Collector, val string) {
 		c.UserAgent = val
 	},
+	"CACHE_ONLY": func(c *Collector, val string) {
+		c.CacheOnly = isYesString(val)
+	},
 }
 
 var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())
@@ -443,6 +450,13 @@ func CheckHead() CollectorOption {
 	}
 }
 
+// CacheOnly instructs the Collector to serve responses from its cache only
+func CacheOnly() CollectorOption {
+	return func(c *Collector) {
+		c.CacheOnly = true
+	}
+}
+
 // Init initializes the Collector's private variables and sets default
 // configuration for the Collector
 func (c *Collector) Init() {
@@ -456,6 +470,7 @@ func (c *Collector) Init() {
 	jar, _ := cookiejar.New(nil)
 	c.backend.Init(jar)
 	c.backend.Client.CheckRedirect = c.checkRedirectFunc()
+	c.backend.CacheOnly = c.CacheOnly
 	c.wg = &sync.WaitGroup{}
 	c.lock = &sync.RWMutex{}
 	c.robotsMap = make(map[string]*robotstxt.RobotsData)
@@ -469,13 +484,14 @@
 // With an Http.Client that is provided by appengine/urlfetch
 // This function should be used when the scraper is run on
 // Google App Engine. Example:
-// func startScraper(w http.ResponseWriter, r *http.Request) {
-//   ctx := appengine.NewContext(r)
-//   c := colly.NewCollector()
-//   c.Appengine(ctx)
-//   ...
-//   c.Visit("https://google.ca")
-// }
+//
+//	func startScraper(w http.ResponseWriter, r *http.Request) {
+//		ctx := appengine.NewContext(r)
+//		c := colly.NewCollector()
+//		c.Appengine(ctx)
+//		...
+// c.Visit("https://google.ca") +// } func (c *Collector) Appengine(ctx context.Context) { client := urlfetch.Client(ctx) client.Jar = c.backend.Client.Jar @@ -972,6 +987,11 @@ func (c *Collector) SetClient(client *http.Client) { c.backend.Client = client } +// SetCacheOnly will only returned cached request as required +func (c *Collector) SetCacheOnly(val bool) { + c.backend.CacheOnly = val +} + // WithTransport allows you to set a custom http.RoundTripper (transport) func (c *Collector) WithTransport(transport http.RoundTripper) { c.backend.Client.Transport = transport diff --git a/http_backend.go b/http_backend.go index 926046b2d..d6db30e41 100644 --- a/http_backend.go +++ b/http_backend.go @@ -38,6 +38,7 @@ type httpBackend struct { LimitRules []*LimitRule Client *http.Client lock *sync.RWMutex + CacheOnly bool } type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header) bool @@ -46,8 +47,8 @@ type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header // Both DomainRegexp and DomainGlob can be used to specify // the included domains patterns, but at least one is required. // There can be two kind of limitations: -// - Parallelism: Set limit for the number of concurrent requests to matching domains -// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case) +// - Parallelism: Set limit for the number of concurrent requests to matching domains +// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case) type LimitRule struct { // DomainRegexp is a regular expression to match against domains DomainRegexp string @@ -130,12 +131,12 @@ func (h *httpBackend) GetMatchingRule(domain string) *LimitRule { } func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc, cacheDir string) (*Response, error) { - if cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" { + if (cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache") && !h.CacheOnly { return h.Do(request, bodySize, checkHeadersFunc) } sum := sha1.Sum([]byte(request.URL.String())) hash := hex.EncodeToString(sum[:]) - dir := path.Join(cacheDir, hash[:2]) + dir := path.Join(cacheDir, hash[39:40], hash[37:39]) filename := path.Join(dir, hash) if file, err := os.Open(filename); err == nil { resp := new(Response) @@ -146,6 +147,9 @@ func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFun return resp, err } } + if h.CacheOnly { + return nil, ErrCacheNotFound + } resp, err := h.Do(request, bodySize, checkHeadersFunc) if err != nil || resp.StatusCode >= 500 { return resp, err