-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(crawler): find sha1 by filename instead of using versions from metadata #29
Merged
knqyf263
merged 24 commits into
aquasecurity:main
from
DmitriyLewen:fix/crawl-by-jar.sha1-files
Apr 24, 2024
Merged
Changes from 21 commits
Commits
Show all changes
24 commits
Select commit
Hold shift + click to select a range
6125e70
refactor(crawler): find sha1 as file `*.jar.sha1` instead of using ve…
DmitriyLewen f1ab72e
update test
DmitriyLewen 103569c
fix test
DmitriyLewen edab71e
revert: return check that page sha1 was not found
DmitriyLewen c10d096
refactor: remove error for wrong sha1. Add list of wrong sha1 to log
DmitriyLewen 4b4ea82
refactor: skip `sources`, `test`, `javadocs` jars
DmitriyLewen 3ed9e32
fix: use `title` to check sha1 file names
DmitriyLewen 37e4c01
refactor: use `href` instead of `title`
DmitriyLewen dd71b0a
ci(cron): add `ASzc/change-string-case-action@v6`
DmitriyLewen 1c42070
refactor: update parseMetadata function
DmitriyLewen 2d37d03
fix: check artifactID prefix when getting version from fileName
DmitriyLewen 4273026
fix: remove extra log
DmitriyLewen 004a75f
fix: use `href` to children dirs
DmitriyLewen 3e0b5c9
fix: exclude scaladoc files
DmitriyLewen b141985
fix: exclude `tests.jar.sha1` files
DmitriyLewen b317f38
fix typo
DmitriyLewen c42ce41
refactor: skip version metadata.xml files
DmitriyLewen ece2e3c
refactor: use text for maven-metadata.xml
DmitriyLewen 4a33686
fix tests
DmitriyLewen d0c50cb
refactor: use `href` to find children/metadata.xml
DmitriyLewen b6c2436
refactor + added logic to avoid overwriting dir version
DmitriyLewen 1d2eefc
fix: doesn't save dirVersion if sha1 doesn't exist
DmitriyLewen 2dddbfb
refactor(cron.yml): use bash to lowercase repo name
DmitriyLewen b358571
refactor(cron.yml): move lowercase repo name to upload assets step
DmitriyLewen File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,24 +1,26 @@ | ||
package crawler | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"encoding/hex" | ||
"encoding/xml" | ||
"fmt" | ||
"github.com/aquasecurity/trivy-java-db/pkg/fileutil" | ||
"github.com/aquasecurity/trivy-java-db/pkg/types" | ||
"io" | ||
"log" | ||
"net/http" | ||
"net/url" | ||
"path/filepath" | ||
"strings" | ||
"sync" | ||
|
||
"github.com/PuerkitoBio/goquery" | ||
"github.com/hashicorp/go-retryablehttp" | ||
"github.com/samber/lo" | ||
"golang.org/x/sync/semaphore" | ||
"golang.org/x/xerrors" | ||
|
||
"github.com/aquasecurity/trivy-java-db/pkg/fileutil" | ||
"github.com/aquasecurity/trivy-java-db/pkg/types" | ||
) | ||
|
||
const mavenRepoURL = "https://repo.maven.apache.org/maven2/" | ||
|
@@ -27,10 +29,11 @@ type Crawler struct { | |
dir string | ||
http *retryablehttp.Client | ||
|
||
rootUrl string | ||
wg sync.WaitGroup | ||
urlCh chan string | ||
limit *semaphore.Weighted | ||
rootUrl string | ||
wg sync.WaitGroup | ||
urlCh chan string | ||
limit *semaphore.Weighted | ||
wrongSHA1Values []string | ||
} | ||
|
||
type Option struct { | ||
|
@@ -118,6 +121,12 @@ loop: | |
} | ||
} | ||
log.Println("Crawl completed") | ||
if len(c.wrongSHA1Values) > 0 { | ||
log.Println("Wrong sha1 files:") | ||
for _, wrongSHA1 := range c.wrongSHA1Values { | ||
log.Println(wrongSHA1) | ||
} | ||
} | ||
Comment on lines
+124
to
+129
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. An idea to better track wrong files: |
||
return nil | ||
} | ||
|
||
|
@@ -140,15 +149,14 @@ func (c *Crawler) Visit(ctx context.Context, url string) error { | |
var children []string | ||
var foundMetadata bool | ||
d.Find("a").Each(func(i int, selection *goquery.Selection) { | ||
link := selection.Text() | ||
link := linkFromSelection(selection) | ||
if link == "maven-metadata.xml" { | ||
foundMetadata = true | ||
return | ||
} else if link == "../" || !strings.HasSuffix(link, "/") { | ||
// only `../` and dirs have `/` suffix. We don't need to check other files. | ||
return | ||
} | ||
|
||
children = append(children, link) | ||
}) | ||
|
||
|
@@ -158,7 +166,7 @@ func (c *Crawler) Visit(ctx context.Context, url string) error { | |
return xerrors.Errorf("metadata parse error: %w", err) | ||
} | ||
if meta != nil { | ||
if err = c.crawlSHA1(ctx, url, meta); err != nil { | ||
if err = c.crawlSHA1(ctx, url, meta, children); err != nil { | ||
return err | ||
} | ||
// Return here since there is no need to crawl dirs anymore. | ||
|
@@ -183,34 +191,61 @@ func (c *Crawler) Visit(ctx context.Context, url string) error { | |
return nil | ||
} | ||
|
||
func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata) error { | ||
var versions []Version | ||
for _, version := range meta.Versioning.Versions { | ||
// some metadata may contain characters that require escaping | ||
// for example <version>1.0.7?</version>: | ||
// https://repo.maven.apache.org/maven2/io/github/visal-99/b24paysdk/maven-metadata.xml | ||
version = url.QueryEscape(version) | ||
sha1FileName := fmt.Sprintf("/%s-%s.jar.sha1", url.QueryEscape(meta.ArtifactID), version) | ||
sha1, err := c.fetchSHA1(ctx, baseURL+version+sha1FileName) | ||
func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata, dirs []string) error { | ||
var foundVersions []Version | ||
// Check each version dir to find links to `*.jar.sha1` files. | ||
for _, dir := range dirs { | ||
dirURL := baseURL + dir | ||
sha1Urls, err := c.sha1Urls(ctx, dirURL) | ||
if err != nil { | ||
return err | ||
return xerrors.Errorf("unable to get list of sha1 files from %q: %s", dirURL, err) | ||
} | ||
if len(sha1) != 0 { | ||
v := Version{ | ||
Version: version, | ||
SHA1: sha1, | ||
|
||
// Remove the `/` suffix to correctly compare file versions with version from directory name. | ||
dirVersion := strings.TrimSuffix(dir, "/") | ||
var dirVersionSha1 []byte | ||
var versions []Version | ||
for _, sha1Url := range sha1Urls { | ||
sha1, err := c.fetchSHA1(ctx, sha1Url) | ||
if err != nil { | ||
return xerrors.Errorf("unable to fetch sha1: %s", err) | ||
} | ||
if ver := versionFromSha1URL(meta.ArtifactID, sha1Url); ver != "" && len(sha1) != 0 { | ||
// Save sha1 for the file where the version is equal to the version from the directory name in order to remove duplicates later | ||
// Avoid overwriting dirVersion when inserting versions into the database (sha1 is a unique blob) | ||
// e.g. `cudf-0.14-cuda10-1.jar.sha1` should not overwrite `cudf-0.14.jar.sha1` | ||
// https://repo.maven.apache.org/maven2/ai/rapids/cudf/0.14/ | ||
if ver == dirVersion { | ||
dirVersionSha1 = sha1 | ||
} else { | ||
versions = append(versions, Version{ | ||
Version: ver, | ||
SHA1: sha1, | ||
}) | ||
} | ||
} | ||
versions = append(versions, v) | ||
} | ||
// Remove duplicates of dirVersionSha1 | ||
versions = lo.Filter(versions, func(v Version, _ int) bool { | ||
return !bytes.Equal(v.SHA1, dirVersionSha1) | ||
}) | ||
|
||
versions = append(versions, Version{ | ||
Version: dirVersion, | ||
SHA1: dirVersionSha1, | ||
}) | ||
|
||
foundVersions = append(foundVersions, versions...) | ||
} | ||
if len(versions) == 0 { | ||
|
||
if len(foundVersions) == 0 { | ||
return nil | ||
} | ||
|
||
index := &Index{ | ||
GroupID: meta.GroupID, | ||
ArtifactID: meta.ArtifactID, | ||
Versions: versions, | ||
Versions: foundVersions, | ||
ArchiveType: types.JarType, | ||
} | ||
fileName := fmt.Sprintf("%s.json", index.ArtifactID) | ||
|
@@ -221,7 +256,45 @@ func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata) | |
return nil | ||
} | ||
|
||
func (c *Crawler) sha1Urls(ctx context.Context, url string) ([]string, error) { | ||
req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil) | ||
if err != nil { | ||
return nil, xerrors.Errorf("unable to new HTTP request: %w", err) | ||
} | ||
resp, err := c.http.Do(req) | ||
if err != nil { | ||
return nil, xerrors.Errorf("http get error (%s): %w", url, err) | ||
} | ||
defer func() { _ = resp.Body.Close() }() | ||
|
||
d, err := goquery.NewDocumentFromReader(resp.Body) | ||
if err != nil { | ||
return nil, xerrors.Errorf("can't create new goquery doc: %w", err) | ||
} | ||
|
||
// Version dir may contain multiple `*jar.sha1` files. | ||
// e.g. https://repo1.maven.org/maven2/org/jasypt/jasypt/1.9.3/ | ||
// We need to take all links. | ||
var sha1URLs []string | ||
d.Find("a").Each(func(i int, selection *goquery.Selection) { | ||
link := linkFromSelection(selection) | ||
// Don't include sources, test, javadocs, scaladoc files | ||
if strings.HasSuffix(link, ".jar.sha1") && !strings.HasSuffix(link, "sources.jar.sha1") && | ||
!strings.HasSuffix(link, "test.jar.sha1") && !strings.HasSuffix(link, "tests.jar.sha1") && | ||
!strings.HasSuffix(link, "javadoc.jar.sha1") && !strings.HasSuffix(link, "scaladoc.jar.sha1") { | ||
sha1URLs = append(sha1URLs, url+link) | ||
} | ||
}) | ||
return sha1URLs, nil | ||
} | ||
|
||
func (c *Crawler) parseMetadata(ctx context.Context, url string) (*Metadata, error) { | ||
// We need to skip metadata.xml files from groupID folder | ||
// e.g. https://repo.maven.apache.org/maven2/args4j/maven-metadata.xml | ||
if len(strings.Split(url, "/")) < 7 { | ||
return nil, nil | ||
} | ||
|
||
req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil) | ||
if err != nil { | ||
return nil, xerrors.Errorf("unable to new HTTP request: %w", err) | ||
|
@@ -236,16 +309,17 @@ func (c *Crawler) parseMetadata(ctx context.Context, url string) (*Metadata, err | |
if err = xml.NewDecoder(resp.Body).Decode(&meta); err != nil { | ||
return nil, xerrors.Errorf("%s decode error: %w", url, err) | ||
} | ||
// Skip metadata without `GroupID` and `ArtifactID` fields | ||
// e.g. https://repo.maven.apache.org/maven2/at/molindo/maven-metadata.xml | ||
if meta.ArtifactID == "" || meta.GroupID == "" { | ||
return nil, nil | ||
} | ||
|
||
// we don't need metadata.xml files from version folder | ||
// e.g. https://repo.maven.apache.org/maven2/HTTPClient/HTTPClient/0.3-3/maven-metadata.xml | ||
if len(meta.Versioning.Versions) == 0 { | ||
return nil, nil | ||
} | ||
// also we need to skip metadata.xml files from groupID folder | ||
// e.g. https://repo.maven.apache.org/maven2/args4j/maven-metadata.xml | ||
if len(strings.Split(url, "/")) < 7 { | ||
return nil, nil | ||
} | ||
return &meta, nil | ||
} | ||
|
||
|
@@ -258,9 +332,11 @@ func (c *Crawler) fetchSHA1(ctx context.Context, url string) ([]byte, error) { | |
if err != nil { | ||
return nil, xerrors.Errorf("http get error (%s): %w", url, err) | ||
} | ||
defer resp.Body.Close() | ||
defer func() { _ = resp.Body.Close() }() | ||
|
||
// some projects don't have xxx.jar and xxx.jar.sha1 files | ||
// These are cases when version dir contains link to sha1 file | ||
// But file doesn't exist | ||
// e.g. https://repo.maven.apache.org/maven2/com/adobe/aem/uber-jar/6.4.8.2/uber-jar-6.4.8.2-sources.jar.sha1 | ||
if resp.StatusCode == http.StatusNotFound { | ||
return nil, nil // TODO add special error for this | ||
} | ||
|
@@ -287,7 +363,33 @@ func (c *Crawler) fetchSHA1(ctx context.Context, url string) ([]byte, error) { | |
} | ||
} | ||
if len(sha1b) == 0 { | ||
return nil, xerrors.Errorf("failed to decode sha1 %s: %w", url, err) | ||
c.wrongSHA1Values = append(c.wrongSHA1Values, fmt.Sprintf("%s (%s)", url, err)) | ||
return nil, nil | ||
} | ||
return sha1b, nil | ||
} | ||
|
||
// versionFromSha1URL extracts the version part from the file name of a sha1 URL.
//
// The file name (everything after the last `/` in sha1URL) is expected to look
// like `<artifactId>-<version>.jar.sha1`; the text between the `<artifactId>-`
// prefix and the `.jar.sha1` suffix is returned,
// e.g. `.../cudf-0.14-cuda10-1.jar.sha1` with artifactId `cudf` gives `0.14-cuda10-1`.
//
// An empty string is returned when the file name does not start with
// `<artifactId>-`. The dash is part of the check on purpose: comparing against
// the bare artifactId would accept files of other artifacts that merely share
// the prefix (`foobar-1.0.jar.sha1` for artifactId `foo`) and return their whole
// name as a version.
func versionFromSha1URL(artifactId, sha1URL string) string {
	ss := strings.Split(sha1URL, "/")
	fileName := ss[len(ss)-1]

	// Check the same prefix that is trimmed below, so a match always removes it.
	prefix := artifactId + "-"
	if !strings.HasPrefix(fileName, prefix) {
		return ""
	}
	return strings.TrimSuffix(strings.TrimPrefix(fileName, prefix), ".jar.sha1")
}
|
||
// linkFromSelection returns the link from goquery.Selection. | ||
// There are times when maven breaks `text` - it removes part of the `text` and adds the suffix `...` (`.../` for dirs). | ||
// e.g. `<a href="v1.1.0-226-g847ecff2d8e26f249422247d7665fe15f07b1744/">v1.1.0-226-g847ecff2d8e26f249422247d7665fe15.../</a>` | ||
// In this case we should take `href`. | ||
// But we don't need to get `href` if the text isn't broken. | ||
// To avoid checking unnecessary links. | ||
// e.g. `<pre id="contents"><a href="https://repo.maven.apache.org/maven2/abbot/">../</a>` | ||
func linkFromSelection(selection *goquery.Selection) string { | ||
link := selection.Text() | ||
// maven uses `.../` suffix for dirs and `...` suffix for files. | ||
if href, ok := selection.Attr("href"); ok && (strings.HasSuffix(link, ".../") || (strings.HasSuffix(link, "..."))) { | ||
link = href | ||
} | ||
return link | ||
} |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not a required change. But it will help me (and other contributors with capital letters in their names) work with the changes from the fork.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about using bash instead? To reduce the risk of supply chain attacks, I want to avoid introducing third-party actions unless it's truly necessary.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no problem. changed in 2dddbfb
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now, we don't need this step and can merge it into the uploading step.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right! Thanks!
Changed b358571
Test run - https://github.com/DmitriyLewen/trivy-java-db/actions/runs/8794956548