Skip to content

Commit

Permalink
Fix AlreadyVisitedError loop when retrying a request that redirects to an already visited URL
Browse files Browse the repository at this point in the history
  • Loading branch information
Shinku-Chen committed Apr 8, 2024
1 parent 5224b97 commit 93b2a30
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 2 deletions.
10 changes: 8 additions & 2 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,10 @@ var collectorCounter uint32
type key int

// ProxyURLKey is the context key for the request proxy address.
const ProxyURLKey key = iota
const (
// ProxyURLKey is the context key for the request proxy address.
ProxyURLKey key = iota
// CheckRevisitKey is the context key for the request's checkRevisit flag.
// The redirect check reads it as a bool: a redirect to an already visited
// URL is reported as AlreadyVisitedError when the value is missing or true,
// and is allowed through when the value is false.
CheckRevisitKey
)

var (
// ErrForbiddenDomain is the error thrown if visiting
Expand Down Expand Up @@ -650,6 +653,7 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c
}
// note: once 1.13 is minimum supported Go version,
// replace this with http.NewRequestWithContext
c.Context = context.WithValue(c.Context, CheckRevisitKey, checkRevisit)
req = req.WithContext(c.Context)
if err := c.requestCheck(parsedURL, method, req.GetBody, depth, checkRevisit); err != nil {
return err
Expand Down Expand Up @@ -1382,7 +1386,9 @@ func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Requ
return err
}
if visited {
return &AlreadyVisitedError{req.URL}
if checkRevisit, ok := req.Context().Value(CheckRevisitKey).(bool); !ok || checkRevisit {
return &AlreadyVisitedError{req.URL}
}
}
err = c.store.Visited(uHash)
if err != nil {
Expand Down
22 changes: 22 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1814,3 +1814,25 @@ func TestCollectorPostRetryUnseekable(t *testing.T) {
t.Error("OnResponse Retry was called but BodyUnseekable")
}
}

// TestRedirectErrorRetry verifies that retrying a request whose redirect
// target has already been visited does not fail with AlreadyVisitedError a
// second time.
//
// The redirect target is visited first, so the following visit to the
// redirecting URL errors on its first attempt. The OnError callback retries
// that request exactly once; any AlreadyVisitedError reported after the retry
// means the retry looped back into the same error.
func TestRedirectErrorRetry(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector()

	// retried records that the OnError callback actually ran, so the test
	// cannot pass vacuously when the redirect error is never raised.
	retried := false
	c.OnError(func(r *Response, err error) {
		// First failure for this request context: mark it and retry once.
		if r.Ctx.Get("notFirst") == "" {
			r.Ctx.Put("notFirst", "first")
			retried = true
			if retryErr := r.Request.Retry(); retryErr != nil {
				t.Errorf("retry failed: %v", retryErr)
			}
			return
		}
		// Any later AlreadyVisitedError comes from the retried request.
		var visitedErr *AlreadyVisitedError
		if errors.As(err, &visitedErr) {
			t.Error("loop AlreadyVisitedError")
		}
	})

	// Mark the redirect target as visited before requesting the redirect.
	if err := c.Visit(ts.URL + "/redirected/"); err != nil {
		t.Fatalf("visiting redirect target: %v", err)
	}

	// The return value is intentionally ignored: the first attempt is
	// expected to fail on the redirect, and the outcome is asserted through
	// the OnError callback above.
	_ = c.Visit(ts.URL + "/redirect")

	if !retried {
		t.Error("OnError was never called, so the retry path was not exercised")
	}
}

0 comments on commit 93b2a30

Please sign in to comment.