add cache-only mode && optimize cache index for huge caches #734

Open · wants to merge 1 commit into base: master
colly.go: 34 changes (27 additions, 7 deletions)

@@ -112,6 +112,8 @@ type Collector struct {
// TraceHTTP enables capturing and reporting request performance for crawler tuning.
// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
TraceHTTP bool
// CacheOnly makes the Collector return cached responses only; a request with no cache entry fails with ErrCacheNotFound
CacheOnly bool
// Context is the context that will be used for HTTP requests. You can set this
// to support clean cancellation of scraping.
Context context.Context
@@ -228,6 +230,8 @@ var (
ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers")
// ErrQueueFull is the error returned when the queue is full
ErrQueueFull = errors.New("Queue MaxSize reached")
// ErrCacheNotFound is the error returned when CacheOnly is enabled and no cache file exists for the request
ErrCacheNotFound = errors.New("Cache Not Found")
)

var envMap = map[string]func(*Collector, string){
@@ -277,6 +281,9 @@ var envMap = map[string]func(*Collector, string){
"USER_AGENT": func(c *Collector, val string) {
c.UserAgent = val
},
"CACHE_ONLY": func(c *Collector, val string) {
c.CacheOnly = isYesString(val)
},
}

var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())
@@ -443,6 +450,12 @@ func CheckHead() CollectorOption {
}
}

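// CacheOnly is a CollectorOption that enables cache-only mode,
// restricting the Collector to responses already present in the cache.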
func CacheOnly() CollectorOption {
return func(c *Collector) {
c.CacheOnly = true
}
}

// Init initializes the Collector's private variables and sets default
// configuration for the Collector
func (c *Collector) Init() {
@@ -456,6 +469,7 @@ func (c *Collector) Init() {
jar, _ := cookiejar.New(nil)
c.backend.Init(jar)
c.backend.Client.CheckRedirect = c.checkRedirectFunc()
c.backend.CacheOnly = c.CacheOnly
c.wg = &sync.WaitGroup{}
c.lock = &sync.RWMutex{}
c.robotsMap = make(map[string]*robotstxt.RobotsData)
@@ -469,13 +483,14 @@ func (c *Collector) Init() {
// With an Http.Client that is provided by appengine/urlfetch
// This function should be used when the scraper is run on
// Google App Engine. Example:
//
// func startScraper(w http.ResponseWriter, r *http.Request) {
// ctx := appengine.NewContext(r)
// c := colly.NewCollector()
// c.Appengine(ctx)
// ...
// c.Visit("https://google.ca")
// }
func (c *Collector) Appengine(ctx context.Context) {
client := urlfetch.Client(ctx)
client.Jar = c.backend.Client.Jar
@@ -972,6 +987,11 @@ func (c *Collector) SetClient(client *http.Client) {
c.backend.Client = client
}

// SetCacheOnly enables or disables cache-only mode on the Collector's backend
func (c *Collector) SetCacheOnly(val bool) {
c.backend.CacheOnly = val
}

// WithTransport allows you to set a custom http.RoundTripper (transport)
func (c *Collector) WithTransport(transport http.RoundTripper) {
c.backend.Client.Transport = transport
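A quick usage sketch of the new mode as this patch defines it. This is a sketch under assumptions: the v2 module path, and that ErrCacheNotFound surfaces from Visit (the backend returns it, and fetch propagates backend errors):

package main

import (
	"errors"
	"fmt"

	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector(
		colly.CacheDir("./colly_cache"), // existing option: on-disk response cache
		colly.CacheOnly(),               // new option from this PR: serve from cache, never hit the network
	)

	c.OnResponse(func(r *colly.Response) {
		fmt.Println("served from cache:", r.Request.URL)
	})

	// An uncached URL fails instead of triggering a network request.
	if err := c.Visit("https://example.com/"); errors.Is(err, colly.ErrCacheNotFound) {
		fmt.Println("no cache entry yet")
	}
}

The same switch is also reachable via the CACHE_ONLY environment variable and the SetCacheOnly method added above.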
http_backend.go: 12 changes (8 additions, 4 deletions)

@@ -38,6 +38,7 @@ type httpBackend struct {
LimitRules []*LimitRule
Client *http.Client
lock *sync.RWMutex
CacheOnly bool
}

type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header) bool
@@ -46,8 +47,8 @@ type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header
// Both DomainRegexp and DomainGlob can be used to specify
// the included domains patterns, but at least one is required.
// There can be two kinds of limitations:
// - Parallelism: Set limit for the number of concurrent requests to matching domains
// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
type LimitRule struct {
// DomainRegexp is a regular expression to match against domains
DomainRegexp string
@@ -130,12 +131,12 @@ func (h *httpBackend) GetMatchingRule(domain string) *LimitRule {
}

func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc, cacheDir string) (*Response, error) {
if cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" {
if (cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache") && !h.CacheOnly {
return h.Do(request, bodySize, checkHeadersFunc)
}
sum := sha1.Sum([]byte(request.URL.String()))
hash := hex.EncodeToString(sum[:])
dir := path.Join(cacheDir, hash[:2])
dir := path.Join(cacheDir, hash[39:40], hash[37:39])
filename := path.Join(dir, hash)
if file, err := os.Open(filename); err == nil {
resp := new(Response)
@@ -146,6 +147,9 @@ func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFun
return resp, err
}
}
if h.CacheOnly {
return nil, ErrCacheNotFound
}
resp, err := h.Do(request, bodySize, checkHeadersFunc)
if err != nil || resp.StatusCode >= 500 {
return resp, err
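The second half of the change reshards the on-disk cache index. The old layout bucketed files into at most 256 directories using the first two hex characters of the SHA-1 (hash[:2]); the new layout nests by the last hex character and the two before it (hash[39:40]/hash[37:39]), giving 16 * 256 = 4096 buckets and far fewer entries per directory for very large caches. A self-contained sketch of the new path computation follows; the cachePath helper is illustrative, not part of the patch:

package main

import (
	"crypto/sha1"
	"encoding/hex"
	"fmt"
	"path"
)

// cachePath mirrors the patched layout: the 40-character hex SHA-1 of the URL
// is stored under cacheDir/<hash[39]>/<hash[37:39]>/<full hash>.
func cachePath(cacheDir, url string) string {
	sum := sha1.Sum([]byte(url))
	hash := hex.EncodeToString(sum[:])
	dir := path.Join(cacheDir, hash[39:40], hash[37:39])
	return path.Join(dir, hash)
}

func main() {
	fmt.Println(cachePath("cache", "https://example.com/"))
}

One caveat worth flagging in review: entries written under the old hash[:2] layout will not be found by the new lookup, so existing cache directories are effectively invalidated by this change.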