Skip to content

Commit

Permalink
Introduce indexed embedded CPE dictionary
Browse files Browse the repository at this point in the history
Signed-off-by: Dan Luhring <[email protected]>
  • Loading branch information
luhring committed Jun 27, 2023
1 parent 0d4f190 commit 8fae3cb
Show file tree
Hide file tree
Showing 12 changed files with 25,335 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,4 @@ bin/
# attestation
cosign.key
cosign.pub

15 changes: 12 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -312,10 +312,15 @@ generate-license-list: ## Generate an updated spdx license list

## Build-related targets #################################

.PHONY: cpe-index
cpe-index: ## Build the CPE index
$(call title,Building CPE index)
go run ./syft/pkg/cataloger/common/cpe/dictionary/index-generator -o ./syft/pkg/cataloger/common/cpe/cpe-index.json

.PHONY: build
build: $(SNAPSHOT_DIR) ## Build release snapshot binaries and packages

$(SNAPSHOT_DIR): ## Build snapshot release binaries and packages
$(SNAPSHOT_DIR): cpe-index ## Build snapshot release binaries and packages
$(call title,Building snapshot artifacts)

# create a config with the dist dir overridden
Expand All @@ -338,7 +343,7 @@ release:
@.github/scripts/trigger-release.sh

.PHONY: ci-release
ci-release: ci-check clean-dist $(CHANGELOG)
ci-release: ci-check clean-dist $(CHANGELOG) cpe-index
$(call title,Publishing release artifacts)

# create a config with the dist dir overridden
Expand All @@ -361,9 +366,13 @@ ci-check:
## Cleanup targets #################################

.PHONY: clean
clean: clean-dist clean-snapshot clean-test-image-cache ## Remove previous builds, result reports, and test cache
clean: clean-dist clean-snapshot clean-test-image-cache clean-cpe-index ## Remove previous builds, result reports, and test cache
$(call safe_rm_rf_children,$(TEMP_DIR))

.PHONY: clean-cpe-index
clean-cpe-index: ## Remove the CPE index
$(call safe_rm_rf,./syft/pkg/cataloger/common/cpe/cpe-index.json)

.PHONY: clean-snapshot
clean-snapshot:
$(call safe_rm_rf,$(SNAPSHOT_DIR))
Expand Down
9 changes: 8 additions & 1 deletion syft/pkg/cataloger/catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,14 @@ func runCataloger(cataloger pkg.Cataloger, resolver file.Resolver) (catalogerRes
for _, p := range packages {
// generate CPEs (note: this is excluded from package ID, so is safe to mutate)
// we might have binary classified CPE already with the package so we want to append here
p.CPEs = append(p.CPEs, cpe.Generate(p)...)

dictionaryCPE, ok := cpe.DictionaryFind(p)
if ok {
log.Debugf("used CPE dictionary to find CPE for %s package %q: %s", p.Type, p.Name, dictionaryCPE.BindToFmtString())
p.CPEs = append(p.CPEs, dictionaryCPE)
} else {
p.CPEs = append(p.CPEs, cpe.Generate(p)...)
}

// if we were not able to identify the language we have an opportunity
// to try and get this value from the PURL. Worst case we assert that
Expand Down
1 change: 1 addition & 0 deletions syft/pkg/cataloger/common/cpe/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cpe-index.json
227 changes: 227 additions & 0 deletions syft/pkg/cataloger/common/cpe/dictionary/index-generator/generate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
package main

import (
"compress/gzip"
"encoding/json"
"encoding/xml"
"fmt"
"io"
"log"
"strings"

"github.com/anchore/syft/syft/pkg/cataloger/common/cpe/dictionary"
"github.com/facebookincubator/nvdtools/wfn"
"golang.org/x/exp/slices"
)

// generateIndexedDictionaryJSON reads a gzip-compressed CPE dictionary (XML)
// from rawGzipData, filters and indexes its entries, and returns the resulting
// index serialized as JSON.
//
// An error is returned if the stream cannot be decompressed or read, if the
// XML cannot be unmarshalled, or if the index cannot be marshalled to JSON.
func generateIndexedDictionaryJSON(rawGzipData io.Reader) ([]byte, error) {
	decompressed, err := gzip.NewReader(rawGzipData)
	if err != nil {
		return nil, fmt.Errorf("unable to decompress CPE dictionary: %w", err)
	}
	defer decompressed.Close()

	// Pull the entire decompressed XML document into memory before parsing.
	xmlBytes, err := io.ReadAll(decompressed)
	if err != nil {
		return nil, fmt.Errorf("unable to read CPE dictionary: %w", err)
	}

	var parsed CpeList
	err = xml.Unmarshal(xmlBytes, &parsed)
	if err != nil {
		return nil, fmt.Errorf("unable to unmarshal CPE dictionary XML: %w", err)
	}

	// Drop entries that are not applicable, then build the lookup index from
	// what remains.
	index := indexCPEList(filterCpeList(parsed))

	encoded, err := json.Marshal(index)
	if err != nil {
		return nil, fmt.Errorf("unable to marshal CPE dictionary to JSON: %w", err)
	}

	return encoded, nil
}

// filterCpeList returns a new CpeList holding only the CPE items that are
// applicable to software packages.
//
// An item is dropped when:
//   - it has no references,
//   - its CPE URI name or its CPE 2.3 name cannot be parsed (the failure is
//     logged),
//   - its part is "h" or "o" (hardware and operating system in CPE naming), or
//   - an item with the same normalized CPE URI has already been kept.
//
// Items that are kept have both names rewritten to their normalized form (see
// normalizeCPE), i.e. without version and update.
func filterCpeList(cpeList CpeList) CpeList {
	var processedCpeList CpeList

	// CPE parts that do not describe software packages. Built once rather
	// than on every loop iteration.
	excludedParts := []string{"h", "o"}

	seen := make(map[string]struct{})

	for _, cpeItem := range cpeList.CpeItems {
		// Skip CPE items that don't have any references.
		if len(cpeItem.References) == 0 {
			continue
		}

		// Skip CPE items where the CPE URI doesn't meet our criteria.
		parsedName, err := wfn.Parse(cpeItem.Name)
		if err != nil {
			// No usable attributes are available on failure; skip the item
			// instead of dereferencing a nil result below.
			log.Printf("unable to parse CPE URI %q: %s", cpeItem.Name, err)
			continue
		}

		if slices.Contains(excludedParts, parsedName.Part) {
			continue
		}

		normalizedName := normalizeCPE(parsedName).BindToURI()
		if _, ok := seen[normalizedName]; ok {
			continue
		}

		parsedCPE, err := wfn.Parse(cpeItem.Cpe23Item.Name)
		if err != nil {
			log.Printf("unable to parse CPE value %q: %s", cpeItem.Cpe23Item.Name, err)
			continue
		}

		// Only mark the name as seen once the item is known to be usable, so
		// that a later well-formed item with the same name is not discarded.
		seen[normalizedName] = struct{}{}

		cpeItem.Name = normalizedName
		cpeItem.Cpe23Item.Name = normalizeCPE(parsedCPE).BindToFmtString()

		processedCpeList.CpeItems = append(processedCpeList.CpeItems, cpeItem)
	}

	return processedCpeList
}

// normalizeCPE returns a copy of cpe whose Version and Update fields have been
// cleared. The attributes passed in are left untouched.
func normalizeCPE(cpe *wfn.Attributes) *wfn.Attributes {
	normalized := new(wfn.Attributes)
	*normalized = *cpe

	normalized.Version, normalized.Update = "", ""

	return normalized
}

// URL prefixes that indexCPEList matches against the reference links of each
// CPE item to decide which ecosystem handler should index the item.
const (
prefixForNPMPackages = "https://www.npmjs.com/package/"
prefixForRubyGems = "https://rubygems.org/gems/"
// Plain-HTTP variant of prefixForRubyGems; both are handled by the same case.
prefixForRubyGemsHTTP = "http://rubygems.org/gems/"
// Matches are indexed under the RubyGems ecosystem as well.
prefixForNativeRubyGems = "https://github.com/ruby/"
prefixForPyPIPackages = "https://pypi.org/project/"
// A match is only a candidate: the repository name must also end in
// "-plugin" to be indexed as a Jenkins plugin.
prefixForJenkinsPlugins = "https://github.com/jenkinsci/"
prefixForRustCrates = "https://crates.io/crates/"
)

// indexCPEList builds the per-ecosystem package index for the given CPE list.
//
// Each reference link of each CPE item is compared against the known
// ecosystem URL prefixes; the handler of the first matching ecosystem records
// the entry. References that match no known prefix are ignored.
func indexCPEList(list CpeList) *dictionary.Indexed {
	type ecosystemRoute struct {
		prefixes []string
		addEntry func(*dictionary.Indexed, string, CpeItem)
	}

	// Listed in precedence order: the first route with a matching prefix wins.
	routes := []ecosystemRoute{
		{prefixes: []string{prefixForNPMPackages}, addEntry: addEntryForNPMPackage},
		{prefixes: []string{prefixForRubyGems, prefixForRubyGemsHTTP}, addEntry: addEntryForRubyGem},
		{prefixes: []string{prefixForNativeRubyGems}, addEntry: addEntryForNativeRubyGem},
		{prefixes: []string{prefixForPyPIPackages}, addEntry: addEntryForPyPIPackage},
		// A match here only means it _might_ be a jenkins plugin!
		{prefixes: []string{prefixForJenkinsPlugins}, addEntry: addEntryForJenkinsPlugin},
		{prefixes: []string{prefixForRustCrates}, addEntry: addEntryForRustCrate},
	}

	result := &dictionary.Indexed{
		EcosystemPackages: make(map[string]dictionary.Packages),
	}

	for _, item := range list.CpeItems {
	nextReference:
		for _, itemRef := range item.References {
			href := itemRef.Reference.Href

			for _, route := range routes {
				for _, prefix := range route.prefixes {
					if strings.HasPrefix(href, prefix) {
						route.addEntry(result, href, item)
						continue nextReference
					}
				}
			}
		}
	}

	return result
}

// addEntryForRustCrate records cpeItem's Cpe23Item name in the Rust crates
// ecosystem of indexed. The crate name is the first path segment of ref that
// follows prefixForRustCrates. An existing entry for the same crate is
// overwritten.
func addEntryForRustCrate(indexed *dictionary.Indexed, ref string, cpeItem CpeItem) {
	// Keep only the package-name portion of the URL.
	crate, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForRustCrates), "/")

	crates, found := indexed.EcosystemPackages[dictionary.EcosystemRustCrates]
	if !found {
		crates = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemRustCrates] = crates
	}

	crates[crate] = cpeItem.Cpe23Item.Name
}

// addEntryForJenkinsPlugin records cpeItem's Cpe23Item name in the Jenkins
// plugins ecosystem of indexed, provided that the first path segment of ref
// after prefixForJenkinsPlugins ends in "-plugin". The key is that segment
// with the "-plugin" suffix removed. Other references are ignored.
func addEntryForJenkinsPlugin(indexed *dictionary.Indexed, ref string, cpeItem CpeItem) {
	const pluginSuffix = "-plugin"

	// Keep only the repository-name portion of the URL.
	repo, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForJenkinsPlugins), "/")

	if strings.HasSuffix(repo, pluginSuffix) {
		plugin := strings.TrimSuffix(repo, pluginSuffix)

		plugins, found := indexed.EcosystemPackages[dictionary.EcosystemJenkinsPlugins]
		if !found {
			plugins = make(dictionary.Packages)
			indexed.EcosystemPackages[dictionary.EcosystemJenkinsPlugins] = plugins
		}

		plugins[plugin] = cpeItem.Cpe23Item.Name
	}
	// Otherwise it's not a jenkins plugin, and there is nothing to record.
}

// addEntryForPyPIPackage records cpeItem's Cpe23Item name in the PyPI
// ecosystem of indexed. The package name is the first path segment of ref
// that follows prefixForPyPIPackages. An existing entry for the same package
// is overwritten.
func addEntryForPyPIPackage(indexed *dictionary.Indexed, ref string, cpeItem CpeItem) {
	// Keep only the package-name portion of the URL.
	project, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForPyPIPackages), "/")

	projects, found := indexed.EcosystemPackages[dictionary.EcosystemPyPI]
	if !found {
		projects = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemPyPI] = projects
	}

	projects[project] = cpeItem.Cpe23Item.Name
}

// addEntryForNativeRubyGem records cpeItem's Cpe23Item name in the RubyGems
// ecosystem of indexed. The gem name is the first path segment of ref that
// follows prefixForNativeRubyGems. An existing entry for the same gem is
// overwritten.
func addEntryForNativeRubyGem(indexed *dictionary.Indexed, ref string, cpeItem CpeItem) {
	// Keep only the repository-name portion of the URL.
	gem, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForNativeRubyGems), "/")

	gems, found := indexed.EcosystemPackages[dictionary.EcosystemRubyGems]
	if !found {
		gems = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemRubyGems] = gems
	}

	gems[gem] = cpeItem.Cpe23Item.Name
}

// addEntryForRubyGem records cpeItem's Cpe23Item name in the RubyGems
// ecosystem of indexed. The gem name is the first path segment of ref after
// the HTTPS or HTTP rubygems.org prefix has been removed. An existing entry
// for the same gem is overwritten.
func addEntryForRubyGem(indexed *dictionary.Indexed, ref string, cpeItem CpeItem) {
	// Strip whichever scheme variant of the prefix is present, then keep only
	// the package-name portion of what is left.
	trimmed := strings.TrimPrefix(ref, prefixForRubyGems)
	trimmed = strings.TrimPrefix(trimmed, prefixForRubyGemsHTTP)
	gem, _, _ := strings.Cut(trimmed, "/")

	gems, found := indexed.EcosystemPackages[dictionary.EcosystemRubyGems]
	if !found {
		gems = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemRubyGems] = gems
	}

	gems[gem] = cpeItem.Cpe23Item.Name
}

// addEntryForNPMPackage records cpeItem's Cpe23Item name in the NPM ecosystem
// of indexed. The package name is what remains of ref after cutting
// everything from the first "/v/" onwards, then everything from the first "?"
// onwards, and finally removing prefixForNPMPackages. An existing entry for
// the same package is overwritten.
func addEntryForNPMPackage(indexed *dictionary.Indexed, ref string, cpeItem CpeItem) {
	// Keep only the package-name portion of the URL.
	name, _, _ := strings.Cut(ref, "/v/")
	name, _, _ = strings.Cut(name, "?")
	name = strings.TrimPrefix(name, prefixForNPMPackages)

	packages, found := indexed.EcosystemPackages[dictionary.EcosystemNPM]
	if !found {
		packages = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemNPM] = packages
	}

	packages[name] = cpeItem.Cpe23Item.Name
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package main

import (
"bytes"
"compress/gzip"
"io"
"os"
"testing"

"github.com/google/go-cmp/cmp"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// Test_generateIndexedDictionaryJSON gzips the XML dictionary fixture in
// memory, feeds it to generateIndexedDictionaryJSON, and compares the produced
// JSON against the expected index fixture.
func Test_generateIndexedDictionaryJSON(t *testing.T) {
	f, err := os.Open("testdata/official-cpe-dictionary_v2.3.xml")
	require.NoError(t, err)
	// Release the fixture's file handle once the test is done.
	defer f.Close()

	// Create a buffer to store the gzipped data in memory
	buf := new(bytes.Buffer)

	w := gzip.NewWriter(buf)
	_, err = io.Copy(w, f)
	require.NoError(t, err)

	// (finalize the gzip stream)
	err = w.Close()
	require.NoError(t, err)

	dictionaryJSON, err := generateIndexedDictionaryJSON(buf)
	if !assert.NoError(t, err) {
		// There is no output to compare; diffing against the expected file
		// would only bury the real failure under a full-file diff.
		return
	}

	expected, err := os.ReadFile("./testdata/expected-cpe-index.json")
	require.NoError(t, err)

	expectedDictionaryJSONString := string(expected)
	dictionaryJSONString := string(dictionaryJSON)

	if diff := cmp.Diff(expectedDictionaryJSONString, dictionaryJSONString); diff != "" {
		t.Errorf("generateIndexedDictionaryJSON() mismatch (-want +got):\n%s", diff)
	}
}
55 changes: 55 additions & 0 deletions syft/pkg/cataloger/common/cpe/dictionary/index-generator/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// This program downloads the latest CPE dictionary from NIST and processes it into a JSON file that can be embedded into Syft for more accurate CPE results.
package main

import (
"errors"
"flag"
"fmt"
"log"
"net/http"
"os"
)

// mainE implements the command: it parses the -o flag, downloads the CPE
// dictionary from cpeDictionaryURL, converts it into the indexed JSON form,
// and writes the result to the requested output file.
//
// It returns an error if -o is missing, if the download fails or answers with
// a status other than 200 OK, if the dictionary cannot be processed, or if
// the output file cannot be written.
func mainE() error {
	var outputFilename string
	flag.StringVar(&outputFilename, "o", "", "file location to save CPE index")
	flag.Parse()

	if outputFilename == "" {
		return errors.New("-o is required")
	}

	// Download and decompress file
	resp, err := http.Get(cpeDictionaryURL)
	if err != nil {
		return fmt.Errorf("unable to get CPE dictionary: %w", err)
	}
	defer resp.Body.Close()

	// A non-200 response carries an error page rather than the gzipped
	// dictionary; report it directly instead of letting it surface later as a
	// confusing decompression failure.
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unable to get CPE dictionary from %s: unexpected HTTP status %s", cpeDictionaryURL, resp.Status)
	}

	dictionaryJSON, err := generateIndexedDictionaryJSON(resp.Body)
	if err != nil {
		return err
	}

	// Write CPE index (JSON data) to disk
	err = os.WriteFile(outputFilename, dictionaryJSON, 0600)
	if err != nil {
		return fmt.Errorf("unable to write processed CPE dictionary to file: %w", err)
	}

	fmt.Println("Done!")

	return nil
}

// errExit logs err as a command failure and terminates the process with exit
// code 1.
func errExit(err error) {
	// log.Fatalf writes the message and then calls os.Exit(1).
	log.Fatalf("command failed: %s", err)
}

// main runs the command and exits with a non-zero status if it fails.
func main() {
	err := mainE()
	if err == nil {
		return
	}

	errExit(err)
}
Loading

0 comments on commit 8fae3cb

Please sign in to comment.