Skip to content

Commit

Permalink
Introduce indexed embedded CPE dictionary (anchore#1897)
Browse files Browse the repository at this point in the history
* Introduce indexed embedded CPE dictionary

Signed-off-by: Dan Luhring <[email protected]>

* Don't generate cpe-index on make snapshot

Signed-off-by: Dan Luhring <[email protected]>

* Add unit tests for individual addEntry funcs

Signed-off-by: Dan Luhring <[email protected]>

* migrate CPE index build to go generate and add periodic workflow

Signed-off-by: Alex Goodman <[email protected]>

* add test to ensure generated cpe index is wired up to function that uses it

Signed-off-by: Alex Goodman <[email protected]>

---------

Signed-off-by: Dan Luhring <[email protected]>
Signed-off-by: Alex Goodman <[email protected]>
Co-authored-by: Alex Goodman <[email protected]>
  • Loading branch information
luhring and wagoodman authored Jul 21, 2023
1 parent a72e3d3 commit abca89c
Show file tree
Hide file tree
Showing 16 changed files with 26,855 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/update-bootstrap-tools.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
workflow_dispatch:

env:
GO_VERSION: "1.19.x"
GO_VERSION: "1.20.x"
GO_STABLE_VERSION: true

jobs:
Expand Down
43 changes: 43 additions & 0 deletions .github/workflows/update-cpe-dictionary-index.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Regenerates the CPE dictionary index (via the Makefile target below) on a
# weekly schedule or on manual dispatch, and opens a pull request with the result.
name: PR to update CPE dictionary index
on:
schedule:
- cron: "0 1 * * 1" # every monday at 1 AM (GitHub evaluates schedules in UTC)

workflow_dispatch:

# Go toolchain settings consumed by the setup-go step.
env:
GO_VERSION: "1.20.x"
GO_STABLE_VERSION: true

jobs:
upgrade-cpe-dictionary-index:
runs-on: ubuntu-latest
if: github.repository == 'anchore/syft' # only run for main repo
steps:
- uses: actions/checkout@v3

- uses: actions/setup-go@v4
with:
go-version: ${{ env.GO_VERSION }}
stable: ${{ env.GO_STABLE_VERSION }}

- run: |
make generate-cpe-dictionary-index
- uses: tibdex/github-app-token@v1
id: generate-token
with:
app_id: ${{ secrets.TOKEN_APP_ID }}
private_key: ${{ secrets.TOKEN_APP_PRIVATE_KEY }}

# Open the pull request from a fixed branch, authenticating with the token
# produced by the generate-token step.
- uses: peter-evans/create-pull-request@v5
with:
signoff: true
delete-branch: true
branch: auto/latest-cpe-dictionary-index
labels: dependencies
commit-message: "chore(deps): update CPE dictionary index"
title: "chore(deps): update CPE dictionary index"
body: |
Update CPE dictionary index based on the latest available CPE dictionary
token: ${{ steps.generate-token.outputs.token }}
2 changes: 1 addition & 1 deletion .github/workflows/update-stereoscope-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
workflow_dispatch:

env:
GO_VERSION: "1.19.x"
GO_VERSION: "1.20.x"
GO_STABLE_VERSION: true

jobs:
Expand Down
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ compare-test-rpm-package-install: $(TEMP_DIR) $(SNAPSHOT_DIR)
$(TEMP_DIR)


## Code generation targets #################################
## Code and data generation targets #################################

.PHONY: generate-json-schema
generate-json-schema: ## Generate a new json schema
Expand All @@ -309,6 +309,11 @@ generate-license-list: ## Generate an updated spdx license list
go generate ./internal/spdxlicense/...
gofmt -s -w ./internal/spdxlicense

.PHONY: generate-cpe-dictionary-index
generate-cpe-dictionary-index: ## Build the CPE index based off of the latest available CPE dictionary
$(call title,Building CPE index)
go generate ./syft/pkg/cataloger/common/cpe/dictionary


## Build-related targets #################################

Expand Down
9 changes: 8 additions & 1 deletion syft/pkg/cataloger/catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,14 @@ func runCataloger(cataloger pkg.Cataloger, resolver file.Resolver) (catalogerRes
for _, p := range packages {
// generate CPEs (note: this is excluded from package ID, so is safe to mutate)
// we might have binary classified CPE already with the package so we want to append here
p.CPEs = append(p.CPEs, cpe.Generate(p)...)

dictionaryCPE, ok := cpe.DictionaryFind(p)
if ok {
log.Debugf("used CPE dictionary to find CPE for %s package %q: %s", p.Type, p.Name, dictionaryCPE.BindToFmtString())
p.CPEs = append(p.CPEs, dictionaryCPE)
} else {
p.CPEs = append(p.CPEs, cpe.Generate(p)...)
}

// if we were not able to identify the language we have an opportunity
// to try and get this value from the PURL. Worst case we assert that
Expand Down
1,296 changes: 1,296 additions & 0 deletions syft/pkg/cataloger/common/cpe/dictionary/data/cpe-index.json

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions syft/pkg/cataloger/common/cpe/dictionary/generate_index.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package dictionary

//go:generate go run ./index-generator/ -o data/cpe-index.json
230 changes: 230 additions & 0 deletions syft/pkg/cataloger/common/cpe/dictionary/index-generator/generate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
package main

import (
"compress/gzip"
"encoding/json"
"encoding/xml"
"fmt"
"io"
"log"
"strings"

"github.com/facebookincubator/nvdtools/wfn"
"golang.org/x/exp/slices"

"github.com/anchore/syft/syft/pkg/cataloger/common/cpe/dictionary"
)

// generateIndexedDictionaryJSON turns a gzip-compressed CPE dictionary (XML)
// into the indented JSON encoding of the indexed dictionary.
//
// The input is fully decompressed and unmarshalled into a CpeList, filtered
// with filterCpeList, indexed with indexCPEList, and finally marshalled to
// JSON. Any failure along the way is returned wrapped with a short description
// of the step that failed.
func generateIndexedDictionaryJSON(rawGzipData io.Reader) ([]byte, error) {
	decompressed, err := gzip.NewReader(rawGzipData)
	if err != nil {
		return nil, fmt.Errorf("unable to decompress CPE dictionary: %w", err)
	}
	defer decompressed.Close()

	// Pull the whole XML document into memory before decoding it.
	xmlBytes, err := io.ReadAll(decompressed)
	if err != nil {
		return nil, fmt.Errorf("unable to read CPE dictionary: %w", err)
	}

	var parsed CpeList
	if err = xml.Unmarshal(xmlBytes, &parsed); err != nil {
		return nil, fmt.Errorf("unable to unmarshal CPE dictionary XML: %w", err)
	}

	// Drop entries that are not applicable, then build the lookup index.
	index := indexCPEList(filterCpeList(parsed))

	encoded, err := json.MarshalIndent(index, "", " ")
	if err != nil {
		return nil, fmt.Errorf("unable to marshal CPE dictionary to JSON: %w", err)
	}

	return encoded, nil
}

// filterCpeList removes CPE items that are not applicable to software packages.
//
// An item is dropped when:
//   - it has no references,
//   - its CPE URI or its CPE 2.3 name cannot be parsed (the failure is logged),
//   - its part is "h" (hardware) or "o" (operating system), or
//   - an item with the same normalized CPE URI has already been kept.
//
// Items that survive have both names rewritten to their normalized form (see
// normalizeCPE). The input list is not modified; a new list is returned.
func filterCpeList(cpeList CpeList) CpeList {
	var processedCpeList CpeList

	seen := make(map[string]struct{})

	for _, cpeItem := range cpeList.CpeItems {
		// Skip CPE items that don't have any references.
		if len(cpeItem.References) == 0 {
			continue
		}

		// Skip CPE items where the CPE URI doesn't meet our criteria.
		parsedName, err := wfn.Parse(cpeItem.Name)
		if err != nil {
			// wfn.Parse yields no usable attributes on failure, so the item
			// must be skipped rather than dereferenced below.
			log.Printf("unable to parse CPE URI %q: %s", cpeItem.Name, err)
			continue
		}

		if slices.Contains([]string{"h", "o"}, parsedName.Part) {
			continue
		}

		normalizedName := normalizeCPE(parsedName).BindToURI()
		if _, ok := seen[normalizedName]; ok {
			continue
		}

		parsedCPE, err := wfn.Parse(cpeItem.Cpe23Item.Name)
		if err != nil {
			log.Printf("unable to parse CPE value %q: %s", cpeItem.Cpe23Item.Name, err)
			continue
		}

		// Only mark the name as seen once the item is known to be usable, so a
		// malformed entry does not shadow a later well-formed duplicate.
		seen[normalizedName] = struct{}{}

		cpeItem.Name = normalizedName
		cpeItem.Cpe23Item.Name = normalizeCPE(parsedCPE).BindToFmtString()

		processedCpeList.CpeItems = append(processedCpeList.CpeItems, cpeItem)
	}

	return processedCpeList
}

// normalizeCPE returns a copy of the given CPE attributes with the version and
// update fields blanked out. The attributes passed in are left untouched.
func normalizeCPE(cpe *wfn.Attributes) *wfn.Attributes {
	normalized := new(wfn.Attributes)
	*normalized = *cpe

	normalized.Version, normalized.Update = "", ""

	return normalized
}

// URL prefixes used to recognize which package ecosystem a CPE item's
// reference link points at. indexCPEList matches every reference URL against
// these prefixes, and the addEntryFor* functions strip them to get at the
// package name.
const (
	prefixForNPMPackages = "https://www.npmjs.com/package/"
	// RubyGems links are accepted with either the https or the http scheme.
	prefixForRubyGems     = "https://rubygems.org/gems/"
	prefixForRubyGemsHTTP = "http://rubygems.org/gems/"
	// Repositories under github.com/ruby are indexed in the RubyGems ecosystem.
	prefixForNativeRubyGems = "https://github.com/ruby/"
	prefixForPyPIPackages   = "https://pypi.org/project/"
	// Only a candidate match: addEntryForJenkinsPlugin additionally requires the
	// repository name to end in "-plugin".
	prefixForJenkinsPlugins = "https://github.com/jenkinsci/"
	prefixForRustCrates     = "https://crates.io/crates/"
)

// indexCPEList creates an index of CPEs by ecosystem.
//
// Every reference URL of every CPE item is compared against the known
// ecosystem URL prefixes. The first prefix that matches decides which
// addEntryFor* function records the item's Cpe23Item name in the index;
// references that match no prefix are ignored.
func indexCPEList(list CpeList) *dictionary.Indexed {
	type route struct {
		prefix string
		add    func(indexed *dictionary.Indexed, ref string, cpeItemName string)
	}

	// Checked in order; the first matching prefix wins.
	routes := []route{
		{prefix: prefixForNPMPackages, add: addEntryForNPMPackage},
		{prefix: prefixForRubyGems, add: addEntryForRubyGem},
		{prefix: prefixForRubyGemsHTTP, add: addEntryForRubyGem},
		{prefix: prefixForNativeRubyGems, add: addEntryForNativeRubyGem},
		{prefix: prefixForPyPIPackages, add: addEntryForPyPIPackage},
		// It _might_ be a jenkins plugin! The handler makes the final decision.
		{prefix: prefixForJenkinsPlugins, add: addEntryForJenkinsPlugin},
		{prefix: prefixForRustCrates, add: addEntryForRustCrate},
	}

	result := &dictionary.Indexed{
		EcosystemPackages: make(map[string]dictionary.Packages),
	}

	for _, item := range list.CpeItems {
		name := item.Cpe23Item.Name

		for _, reference := range item.References {
			href := reference.Reference.Href

			for _, r := range routes {
				if strings.HasPrefix(href, r.prefix) {
					r.add(result, href, name)
					break
				}
			}
		}
	}

	return result
}

// addEntryForRustCrate stores cpeItemName in the Rust crates section of the
// index. The key is the part of ref that follows the crates.io prefix, up to
// (not including) the next "/". The section is created on first use and an
// existing entry for the same key is overwritten.
func addEntryForRustCrate(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// Keep only the crate name from the URL.
	crate, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForRustCrates), "/")

	crates, ok := indexed.EcosystemPackages[dictionary.EcosystemRustCrates]
	if !ok {
		crates = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemRustCrates] = crates
	}

	crates[crate] = cpeItemName
}

// addEntryForJenkinsPlugin stores cpeItemName in the Jenkins plugins section
// of the index, but only when ref looks like a plugin repository: the path
// segment following the jenkinsci prefix must end in "-plugin". The key is
// that segment with the "-plugin" suffix removed. Other references are
// ignored. The section is created on first use and an existing entry for the
// same key is overwritten.
func addEntryForJenkinsPlugin(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// Keep only the repository name from the URL.
	repo, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForJenkinsPlugins), "/")

	plugin := strings.TrimSuffix(repo, "-plugin")
	if plugin == repo {
		// Nothing was trimmed, so it's not a jenkins plugin!
		return
	}

	plugins, ok := indexed.EcosystemPackages[dictionary.EcosystemJenkinsPlugins]
	if !ok {
		plugins = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemJenkinsPlugins] = plugins
	}

	plugins[plugin] = cpeItemName
}

// addEntryForPyPIPackage stores cpeItemName in the PyPI section of the index.
// The key is the part of ref that follows the pypi.org project prefix, up to
// (not including) the next "/". The section is created on first use and an
// existing entry for the same key is overwritten.
func addEntryForPyPIPackage(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// Keep only the project name from the URL.
	project, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForPyPIPackages), "/")

	projects, ok := indexed.EcosystemPackages[dictionary.EcosystemPyPI]
	if !ok {
		projects = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemPyPI] = projects
	}

	projects[project] = cpeItemName
}

// addEntryForNativeRubyGem stores cpeItemName in the RubyGems section of the
// index for a reference that points at a repository under github.com/ruby.
// The key is the part of ref that follows that prefix, up to (not including)
// the next "/". The section is created on first use and an existing entry for
// the same key is overwritten.
func addEntryForNativeRubyGem(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// Keep only the repository name from the URL.
	gem, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForNativeRubyGems), "/")

	gems, ok := indexed.EcosystemPackages[dictionary.EcosystemRubyGems]
	if !ok {
		gems = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemRubyGems] = gems
	}

	gems[gem] = cpeItemName
}

// addEntryForRubyGem stores cpeItemName in the RubyGems section of the index
// for a rubygems.org reference (https or http). The key is the part of ref
// that follows the rubygems.org gems prefix, up to (not including) the next
// "/". The section is created on first use and an existing entry for the same
// key is overwritten.
func addEntryForRubyGem(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// Strip whichever scheme variant of the prefix is present, https first.
	trimmed := strings.TrimPrefix(ref, prefixForRubyGems)
	trimmed = strings.TrimPrefix(trimmed, prefixForRubyGemsHTTP)

	// Keep only the gem name from what is left of the URL.
	gem, _, _ := strings.Cut(trimmed, "/")

	gems, ok := indexed.EcosystemPackages[dictionary.EcosystemRubyGems]
	if !ok {
		gems = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemRubyGems] = gems
	}

	gems[gem] = cpeItemName
}

// addEntryForNPMPackage stores cpeItemName in the NPM section of the index.
// The key is derived from ref by discarding everything from the first "/v/"
// onward, then everything from the first "?" onward, and finally removing the
// npmjs.com package prefix. Any other "/" in the remainder is kept as part of
// the key. The section is created on first use and an existing entry for the
// same key is overwritten.
func addEntryForNPMPackage(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// Cut off the non-package-name tail of the URL, then the leading prefix.
	name, _, _ := strings.Cut(ref, "/v/")
	name, _, _ = strings.Cut(name, "?")
	name = strings.TrimPrefix(name, prefixForNPMPackages)

	packages, ok := indexed.EcosystemPackages[dictionary.EcosystemNPM]
	if !ok {
		packages = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemNPM] = packages
	}

	packages[name] = cpeItemName
}
Loading

0 comments on commit abca89c

Please sign in to comment.