Skip to content

Commit

Permalink
fix(crawler): find sha1 by filename instead of using versions from me…
Browse files Browse the repository at this point in the history
…tadata (#29)
  • Loading branch information
DmitriyLewen authored Apr 24, 2024
1 parent be4b443 commit ab39d06
Show file tree
Hide file tree
Showing 14 changed files with 232 additions and 120 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ jobs:

- name: Upload assets to GHCR
run: |
lowercase_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')
oras version
oras push --artifact-type application/vnd.aquasec.trivy.config.v1+json \
ghcr.io/${{ github.repository }}:${DB_VERSION} \
ghcr.io/${lowercase_repo}:${DB_VERSION} \
javadb.tar.gz:application/vnd.aquasec.trivy.javadb.layer.v1.tar+gzip
10 changes: 6 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ require (
github.com/PuerkitoBio/goquery v1.5.1
github.com/cheggaaa/pb/v3 v3.1.0
github.com/hashicorp/go-retryablehttp v0.7.2
github.com/samber/lo v1.39.0
github.com/spf13/cobra v1.6.1
github.com/stretchr/testify v1.8.1
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2
k8s.io/utils v0.0.0-20230115233650-391b47cb4029
modernc.org/sqlite v1.20.3
Expand All @@ -31,10 +32,11 @@ require (
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
golang.org/x/mod v0.3.0 // indirect
golang.org/x/net v0.0.0-20201021035429-f5854403a974 // indirect
golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17 // indirect
golang.org/x/mod v0.6.0-dev.0.20211013180041-c96bc1413d57 // indirect
golang.org/x/net v0.0.0-20211015210444-4f30a5c0130f // indirect
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab // indirect
golang.org/x/tools v0.0.0-20201124115921-2c860bdd6e78 // indirect
golang.org/x/tools v0.1.8-0.20211029000441-d6a9af8af023 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
lukechampine.com/uint128 v1.2.0 // indirect
modernc.org/cc/v3 v3.40.0 // indirect
Expand Down
34 changes: 12 additions & 22 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJ
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/samber/lo v1.39.0 h1:4gTz1wUhNYLhFSKl6O+8peW0v2F4BCY034GRpU9WnuA=
github.com/samber/lo v1.39.0/go.mod h1:+m/ZKRl6ClXCE2Lgf3MsQlWfh4bn1bz6CXEOxnEXnEA=
github.com/spf13/cobra v1.6.1 h1:o94oiPyS4KD1mPy2fmcYYHHfCxLqYjJOhGsCHFZtEzA=
github.com/spf13/cobra v1.6.1/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
Expand All @@ -57,38 +59,26 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17 h1:3MTrJm4PyNL9NBqvYDSj3DHl46qQakyfqfWo4jgfaEM=
golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17/go.mod h1:lgLbSvA5ygNOMpwM/9anMpWVlVJ7Z+cHWq/eFuinpGE=
golang.org/x/mod v0.6.0-dev.0.20211013180041-c96bc1413d57 h1:LQmS1nU0twXLA96Kt7U9qtHJEbBk3z6Q0V4UXjZkpr4=
golang.org/x/mod v0.6.0-dev.0.20211013180041-c96bc1413d57/go.mod h1:3p9vT2HGsQu2K1YbXdKPJLVgG5VJdoTa1poYQBtP1AY=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974 h1:IX6qOQeG5uLjB/hjjwjedwfjND0hgjPMMyO1RoIXQNI=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9 h1:SQFwaSi55rU7vdNs9Yr0Z324VNlrF+0wMqRXT4St8ck=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/net v0.0.0-20211015210444-4f30a5c0130f h1:OfiFi4JbukWwe3lzw+xunroH1mnC1e2Gy5cxNJApiSY=
golang.org/x/net v0.0.0-20211015210444-4f30a5c0130f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c h1:5KslGYwFpkhGh+Q16bwMP3cOontH8FOep7tGV86Y7SQ=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab h1:2QkjZIsXupsJbJIdSjjUOgWK3aEtzyuh2mPt3l/CkeU=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20201124115921-2c860bdd6e78 h1:M8tBwCtWD/cZV9DZpFYRUgaymAYAr+aIUTWzDaM3uPs=
golang.org/x/tools v0.0.0-20201124115921-2c860bdd6e78/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/tools v0.1.8-0.20211029000441-d6a9af8af023 h1:0c3L82FDQ5rt1bjTBlchS8t6RQ6299/+5bWMnRLh+uI=
golang.org/x/tools v0.1.8-0.20211029000441-d6a9af8af023/go.mod h1:nABZi5QlRsZVlzPpHl034qft6wpY4eDcsTt5AaioBiU=
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk=
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
Expand Down
174 changes: 139 additions & 35 deletions pkg/crawler/crawler.go
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
package crawler

import (
"bytes"
"context"
"encoding/hex"
"encoding/xml"
"fmt"
"github.com/aquasecurity/trivy-java-db/pkg/fileutil"
"github.com/aquasecurity/trivy-java-db/pkg/types"
"io"
"log"
"net/http"
"net/url"
"path/filepath"
"strings"
"sync"

"github.com/PuerkitoBio/goquery"
"github.com/hashicorp/go-retryablehttp"
"github.com/samber/lo"
"golang.org/x/sync/semaphore"
"golang.org/x/xerrors"

"github.com/aquasecurity/trivy-java-db/pkg/fileutil"
"github.com/aquasecurity/trivy-java-db/pkg/types"
)

const mavenRepoURL = "https://repo.maven.apache.org/maven2/"
Expand All @@ -27,10 +29,11 @@ type Crawler struct {
dir string
http *retryablehttp.Client

rootUrl string
wg sync.WaitGroup
urlCh chan string
limit *semaphore.Weighted
rootUrl string
wg sync.WaitGroup
urlCh chan string
limit *semaphore.Weighted
wrongSHA1Values []string
}

type Option struct {
Expand Down Expand Up @@ -118,6 +121,12 @@ loop:
}
}
log.Println("Crawl completed")
if len(c.wrongSHA1Values) > 0 {
log.Println("Wrong sha1 files:")
for _, wrongSHA1 := range c.wrongSHA1Values {
log.Println(wrongSHA1)
}
}
return nil
}

Expand All @@ -140,15 +149,14 @@ func (c *Crawler) Visit(ctx context.Context, url string) error {
var children []string
var foundMetadata bool
d.Find("a").Each(func(i int, selection *goquery.Selection) {
link := selection.Text()
link := linkFromSelection(selection)
if link == "maven-metadata.xml" {
foundMetadata = true
return
} else if link == "../" || !strings.HasSuffix(link, "/") {
// only `../` and dirs have `/` suffix. We don't need to check other files.
return
}

children = append(children, link)
})

Expand All @@ -158,7 +166,7 @@ func (c *Crawler) Visit(ctx context.Context, url string) error {
return xerrors.Errorf("metadata parse error: %w", err)
}
if meta != nil {
if err = c.crawlSHA1(ctx, url, meta); err != nil {
if err = c.crawlSHA1(ctx, url, meta, children); err != nil {
return err
}
// Return here since there is no need to crawl dirs anymore.
Expand All @@ -183,34 +191,63 @@ func (c *Crawler) Visit(ctx context.Context, url string) error {
return nil
}

func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata) error {
var versions []Version
for _, version := range meta.Versioning.Versions {
// some metadata may contain characters that require escaping
// for example <version>1.0.7?</version>:
// https://repo.maven.apache.org/maven2/io/github/visal-99/b24paysdk/maven-metadata.xml
version = url.QueryEscape(version)
sha1FileName := fmt.Sprintf("/%s-%s.jar.sha1", url.QueryEscape(meta.ArtifactID), version)
sha1, err := c.fetchSHA1(ctx, baseURL+version+sha1FileName)
func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata, dirs []string) error {
var foundVersions []Version
// Check each version dir to find links to `*.jar.sha1` files.
for _, dir := range dirs {
dirURL := baseURL + dir
sha1Urls, err := c.sha1Urls(ctx, dirURL)
if err != nil {
return err
return xerrors.Errorf("unable to get list of sha1 files from %q: %s", dirURL, err)
}
if len(sha1) != 0 {
v := Version{
Version: version,
SHA1: sha1,

// Remove the `/` suffix to correctly compare file versions with version from directory name.
dirVersion := strings.TrimSuffix(dir, "/")
var dirVersionSha1 []byte
var versions []Version
for _, sha1Url := range sha1Urls {
sha1, err := c.fetchSHA1(ctx, sha1Url)
if err != nil {
return xerrors.Errorf("unable to fetch sha1: %s", err)
}
versions = append(versions, v)
if ver := versionFromSha1URL(meta.ArtifactID, sha1Url); ver != "" && len(sha1) != 0 {
// Save sha1 for the file where the version is equal to the version from the directory name in order to remove duplicates later
// Avoid overwriting dirVersion when inserting versions into the database (sha1 is a unique blob)
// e.g. `cudf-0.14-cuda10-1.jar.sha1` should not overwrite `cudf-0.14.jar.sha1`
// https://repo.maven.apache.org/maven2/ai/rapids/cudf/0.14/
if ver == dirVersion {
dirVersionSha1 = sha1
} else {
versions = append(versions, Version{
Version: ver,
SHA1: sha1,
})
}
}
}
// Remove duplicates of dirVersionSha1
versions = lo.Filter(versions, func(v Version, _ int) bool {
return !bytes.Equal(v.SHA1, dirVersionSha1)
})

if dirVersionSha1 != nil {
versions = append(versions, Version{
Version: dirVersion,
SHA1: dirVersionSha1,
})
}

foundVersions = append(foundVersions, versions...)
}
if len(versions) == 0 {

if len(foundVersions) == 0 {
return nil
}

index := &Index{
GroupID: meta.GroupID,
ArtifactID: meta.ArtifactID,
Versions: versions,
Versions: foundVersions,
ArchiveType: types.JarType,
}
fileName := fmt.Sprintf("%s.json", index.ArtifactID)
Expand All @@ -221,7 +258,45 @@ func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata)
return nil
}

// sha1Urls downloads the HTML listing at url and collects links to `*.jar.sha1` files.
// Each returned entry is url concatenated with the link taken from an `<a>` element.
// Links for sources, test(s), javadoc and scaladoc jars are left out.
// An error is returned when the request cannot be built or sent, or when the
// response body cannot be parsed as an HTML document.
func (c *Crawler) sha1Urls(ctx context.Context, url string) ([]string, error) {
	request, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, xerrors.Errorf("unable to new HTTP request: %w", err)
	}

	response, err := c.http.Do(request)
	if err != nil {
		return nil, xerrors.Errorf("http get error (%s): %w", url, err)
	}
	defer func() { _ = response.Body.Close() }()

	doc, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		return nil, xerrors.Errorf("can't create new goquery doc: %w", err)
	}

	// Suffixes of auxiliary artifacts that must not be treated as the main jar.
	excludedSuffixes := []string{
		"sources.jar.sha1",
		"test.jar.sha1",
		"tests.jar.sha1",
		"javadoc.jar.sha1",
		"scaladoc.jar.sha1",
	}

	// Version dir may contain multiple `*jar.sha1` files.
	// e.g. https://repo1.maven.org/maven2/org/jasypt/jasypt/1.9.3/
	// so every matching link is kept, not just the first one.
	var found []string
	doc.Find("a").Each(func(_ int, anchor *goquery.Selection) {
		name := linkFromSelection(anchor)
		if !strings.HasSuffix(name, ".jar.sha1") {
			return
		}
		for _, suffix := range excludedSuffixes {
			if strings.HasSuffix(name, suffix) {
				return
			}
		}
		found = append(found, url+name)
	})
	return found, nil
}

func (c *Crawler) parseMetadata(ctx context.Context, url string) (*Metadata, error) {
// We need to skip metadata.xml files from groupID folder
// e.g. https://repo.maven.apache.org/maven2/args4j/maven-metadata.xml
if len(strings.Split(url, "/")) < 7 {
return nil, nil
}

req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, xerrors.Errorf("unable to new HTTP request: %w", err)
Expand All @@ -236,16 +311,17 @@ func (c *Crawler) parseMetadata(ctx context.Context, url string) (*Metadata, err
if err = xml.NewDecoder(resp.Body).Decode(&meta); err != nil {
return nil, xerrors.Errorf("%s decode error: %w", url, err)
}
// Skip metadata without `GroupID` and `ArtifactID` fields
// e.g. https://repo.maven.apache.org/maven2/at/molindo/maven-metadata.xml
if meta.ArtifactID == "" || meta.GroupID == "" {
return nil, nil
}

// we don't need metadata.xml files from version folder
// e.g. https://repo.maven.apache.org/maven2/HTTPClient/HTTPClient/0.3-3/maven-metadata.xml
if len(meta.Versioning.Versions) == 0 {
return nil, nil
}
// also we need to skip metadata.xml files from groupID folder
// e.g. https://repo.maven.apache.org/maven2/args4j/maven-metadata.xml
if len(strings.Split(url, "/")) < 7 {
return nil, nil
}
return &meta, nil
}

Expand All @@ -258,9 +334,11 @@ func (c *Crawler) fetchSHA1(ctx context.Context, url string) ([]byte, error) {
if err != nil {
return nil, xerrors.Errorf("http get error (%s): %w", url, err)
}
defer resp.Body.Close()
defer func() { _ = resp.Body.Close() }()

// some projects don't have xxx.jar and xxx.jar.sha1 files
// These are cases when version dir contains link to sha1 file
// But file doesn't exist
// e.g. https://repo.maven.apache.org/maven2/com/adobe/aem/uber-jar/6.4.8.2/uber-jar-6.4.8.2-sources.jar.sha1
if resp.StatusCode == http.StatusNotFound {
return nil, nil // TODO add special error for this
}
Expand All @@ -287,7 +365,33 @@ func (c *Crawler) fetchSHA1(ctx context.Context, url string) ([]byte, error) {
}
}
if len(sha1b) == 0 {
return nil, xerrors.Errorf("failed to decode sha1 %s: %w", url, err)
c.wrongSHA1Values = append(c.wrongSHA1Values, fmt.Sprintf("%s (%s)", url, err))
return nil, nil
}
return sha1b, nil
}

// versionFromSha1URL extracts the version part from the file name of a `*.jar.sha1` URL.
// The file name is the text after the last `/` in sha1URL (the whole string if there is no `/`).
// It must start with `<artifactId>-`; that prefix and the `.jar.sha1` suffix are removed
// and the remainder is returned, e.g. `cudf-0.14-cuda10-1.jar.sha1` -> `0.14-cuda10-1`.
// An empty string is returned when the file name does not start with `<artifactId>-`,
// so a file of another artifact sharing the same leading characters
// (e.g. `foobar-1.0.jar.sha1` for artifact `foo`) is not reported as a version.
func versionFromSha1URL(artifactId, sha1URL string) string {
	// LastIndex yields -1 when there is no `/`, which makes the slice start at 0.
	fileName := sha1URL[strings.LastIndex(sha1URL, "/")+1:]

	// Check exactly the prefix that is trimmed below, including the `-` separator.
	prefix := artifactId + "-"
	if !strings.HasPrefix(fileName, prefix) {
		return ""
	}
	return strings.TrimSuffix(strings.TrimPrefix(fileName, prefix), ".jar.sha1")
}

// linkFromSelection picks the link name for an anchor element of a listing page.
// Normally this is the anchor text. Maven sometimes shortens long text and marks
// the cut with a `...` suffix (`.../` for dirs), e.g.
// `<a href="v1.1.0-226-g847ecff2d8e26f249422247d7665fe15f07b1744/">v1.1.0-226-g847ecff2d8e26f249422247d7665fe15.../</a>`
// Only in that case the `href` attribute is returned instead (when it is present).
// The `href` is deliberately not used for intact text, to avoid following unnecessary links, e.g.
// `<pre id="contents"><a href="https://repo.maven.apache.org/maven2/abbot/">../</a>`
func linkFromSelection(selection *goquery.Selection) string {
	text := selection.Text()

	// maven uses `.../` suffix for dirs and `...` suffix for files.
	truncated := strings.HasSuffix(text, ".../") || strings.HasSuffix(text, "...")
	if !truncated {
		return text
	}

	if href, ok := selection.Attr("href"); ok {
		return href
	}
	return text
}
Loading

0 comments on commit ab39d06

Please sign in to comment.