Skip to content

Commit

Permalink
feat(crawler): add canceled context to Visit func (#27)
Browse files Browse the repository at this point in the history
  • Loading branch information
DmitriyLewen authored Feb 26, 2024
1 parent faf1565 commit be4b443
Showing 1 changed file with 36 additions and 14 deletions.
50 changes: 36 additions & 14 deletions pkg/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ func NewCrawler(opt Option) Crawler {

func (c *Crawler) Crawl(ctx context.Context) error {
log.Println("Crawl maven repository and save indexes")
ctx, cancel := context.WithCancel(ctx)
defer cancel()

errCh := make(chan error)
defer close(errCh)

Expand Down Expand Up @@ -94,7 +97,7 @@ func (c *Crawler) Crawl(ctx context.Context) error {
go func(url string) {
defer c.limit.Release(1)
defer c.wg.Done()
if err := c.Visit(url); err != nil {
if err := c.Visit(ctx, url); err != nil {
errCh <- xerrors.Errorf("visit error: %w", err)
}
}(url)
Expand All @@ -108,6 +111,7 @@ loop:
case <-crawlDone:
break loop
case err := <-errCh:
cancel() // Stop all running Visit functions to avoid writing to closed c.urlCh.
close(c.urlCh)
return err

Expand All @@ -117,8 +121,12 @@ loop:
return nil
}

func (c *Crawler) Visit(url string) error {
resp, err := c.http.Get(url)
func (c *Crawler) Visit(ctx context.Context, url string) error {
req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return xerrors.Errorf("unable to new HTTP request: %w", err)
}
resp, err := c.http.Do(req)
if err != nil {
return xerrors.Errorf("http get error (%s): %w", url, err)
}
Expand All @@ -145,12 +153,12 @@ func (c *Crawler) Visit(url string) error {
})

if foundMetadata {
meta, err := c.parseMetadata(url + "maven-metadata.xml")
meta, err := c.parseMetadata(ctx, url+"maven-metadata.xml")
if err != nil {
return xerrors.Errorf("metadata parse error: %w", err)
}
if meta != nil {
if err = c.crawlSHA1(url, meta); err != nil {
if err = c.crawlSHA1(ctx, url, meta); err != nil {
return err
}
// Return here since there is no need to crawl dirs anymore.
Expand All @@ -162,22 +170,28 @@ func (c *Crawler) Visit(url string) error {

go func() {
for _, child := range children {
c.urlCh <- url + child
select {
// Context can be canceled if we receive an error from another Visit function.
case <-ctx.Done():
return
default:
c.urlCh <- url + child
}
}
}()

return nil
}

func (c *Crawler) crawlSHA1(baseURL string, meta *Metadata) error {
func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata) error {
var versions []Version
for _, version := range meta.Versioning.Versions {
// some metadata may contain characters that require escaping
// for example <version>1.0.7?</version>:
// https://repo.maven.apache.org/maven2/io/github/visal-99/b24paysdk/maven-metadata.xml
version = url.QueryEscape(version)
sha1FileName := fmt.Sprintf("/%s-%s.jar.sha1", url.QueryEscape(meta.ArtifactID), version)
sha1, err := c.fetchSHA1(baseURL + version + sha1FileName)
sha1, err := c.fetchSHA1(ctx, baseURL+version+sha1FileName)
if err != nil {
return err
}
Expand Down Expand Up @@ -207,10 +221,14 @@ func (c *Crawler) crawlSHA1(baseURL string, meta *Metadata) error {
return nil
}

func (c *Crawler) parseMetadata(url string) (*Metadata, error) {
resp, err := c.http.Get(url)
func (c *Crawler) parseMetadata(ctx context.Context, url string) (*Metadata, error) {
req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, xerrors.Errorf("can't get url: %w", err)
return nil, xerrors.Errorf("unable to new HTTP request: %w", err)
}
resp, err := c.http.Do(req)
if err != nil {
return nil, xerrors.Errorf("http get error (%s): %w", url, err)
}
defer resp.Body.Close()

Expand All @@ -231,10 +249,14 @@ func (c *Crawler) parseMetadata(url string) (*Metadata, error) {
return &meta, nil
}

func (c *Crawler) fetchSHA1(url string) ([]byte, error) {
resp, err := c.http.Get(url)
func (c *Crawler) fetchSHA1(ctx context.Context, url string) ([]byte, error) {
req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, xerrors.Errorf("unable to new HTTP request: %w", err)
}
resp, err := c.http.Do(req)
if err != nil {
return nil, xerrors.Errorf("can't get sha1 from %s: %w", url, err)
return nil, xerrors.Errorf("http get error (%s): %w", url, err)
}
defer resp.Body.Close()

Expand Down

0 comments on commit be4b443

Please sign in to comment.