Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rpm: add function to determine if paths are RPM #1473

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions file.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ type FileKind string

const (
FileKindWhiteout = FileKind("whiteout")
FileKindRPM = FileKind("rpm")
)

// File represents interesting files that are found in the layer.
Expand Down
16 changes: 15 additions & 1 deletion gobin/gobin.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (

"github.com/quay/claircore"
"github.com/quay/claircore/indexer"
"github.com/quay/claircore/rpm"
)

// Detector detects go binaries and reports the packages used to build them.
Expand Down Expand Up @@ -86,7 +87,8 @@ func (Detector) Scan(ctx context.Context, l *claircore.Layer) ([]*claircore.Pack
// Only create a single spool file per call, re-use for every binary.
var spool spoolfile
walk := func(p string, d fs.DirEntry, err error) error {
ctx := zlog.ContextWithValues(ctx, "path", d.Name())
ctx := zlog.ContextWithValues(ctx, "filename", d.Name())

switch {
case err != nil:
return err
Expand All @@ -107,6 +109,18 @@ func (Detector) Scan(ctx context.Context, l *claircore.Layer) ([]*claircore.Pack
// Not executable
return nil
}

isRPM, err := rpm.FileInstalledByRPM(ctx, l, p)
if err != nil {
return err
}
if isRPM {
zlog.Debug(ctx).
Str("path", p).
Msg("file path determined to be of RPM origin")
return nil
}

f, err := sys.Open(p)
if err != nil {
// TODO(crozzy): Remove log line once controller is in a
Expand Down
11 changes: 11 additions & 0 deletions gobin/gobin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ func TestEmptyFile(t *testing.T) {
if err := l.Init(ctx, &test.AnyDescription, f); err != nil {
t.Error(err)
}
t.Cleanup(func() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

surprised these didn't panic before?

if err := l.Close(); err != nil {
t.Error(err)
}
})

var s Detector
_, err = s.Scan(ctx, &l)
if err != nil {
Expand Down Expand Up @@ -138,6 +144,11 @@ func TestScanner(t *testing.T) {
if err := l.Init(ctx, &test.AnyDescription, f); err != nil {
t.Error(err)
}
t.Cleanup(func() {
if err := l.Close(); err != nil {
t.Error(err)
}
})

// Run the scanner on the fake layer.
var s Detector
Expand Down
12 changes: 12 additions & 0 deletions nodejs/packagescanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/Masterminds/semver"
"github.com/quay/claircore"
"github.com/quay/claircore/indexer"
"github.com/quay/claircore/rpm"
)

const repository = "npm"
Expand Down Expand Up @@ -92,6 +93,17 @@ func (s *Scanner) Scan(ctx context.Context, layer *claircore.Layer) ([]*claircor
ret := make([]*claircore.Package, 0, len(pkgs))
var invalidPkgs []string
for _, p := range pkgs {
isRPM, err := rpm.FileInstalledByRPM(ctx, layer, p)
if err != nil {
return nil, err
}
if isRPM {
zlog.Debug(ctx).
Str("path", p).
Msg("file path determined to be of RPM origin")
continue
}

f, err := sys.Open(p)
if err != nil {
return nil, fmt.Errorf("nodejs: unable to open file %q: %w", p, err)
Expand Down
19 changes: 15 additions & 4 deletions python/packagescanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/quay/claircore"
"github.com/quay/claircore/indexer"
"github.com/quay/claircore/pkg/pep440"
"github.com/quay/claircore/rpm"
)

var (
Expand Down Expand Up @@ -79,6 +80,16 @@ func (ps *Scanner) Scan(ctx context.Context, layer *claircore.Layer) ([]*clairco
}
var ret []*claircore.Package
for _, n := range ms {
isRPM, err := rpm.FileInstalledByRPM(ctx, layer, n)
if err != nil {
return nil, err
}
if isRPM {
zlog.Debug(ctx).
Str("path", n).
Msg("file path determined to be of RPM origin")
continue
}
b, err := fs.ReadFile(sys, n)
if err != nil {
return nil, fmt.Errorf("python: unable to read file: %w", err)
Expand Down Expand Up @@ -143,14 +154,14 @@ func findDeliciousEgg(ctx context.Context, sys fs.FS) (out []string, err error)
// Is this layer an rpm layer?
//
// If so, files in the distro-managed directory can be skipped.
var rpm bool
var isRPM bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this still needed since there is already another check for rpm origin?

for _, p := range []string{
"var/lib/rpm/Packages",
"var/lib/rpm/rpmdb.sqlite",
"var/lib/rpm/Packages.db",
} {
if fi, err := fs.Stat(sys, p); err == nil && fi.Mode().IsRegular() {
rpm = true
isRPM = true
break
}
}
Expand All @@ -172,12 +183,12 @@ func findDeliciousEgg(ctx context.Context, sys fs.FS) (out []string, err error)
switch {
case err != nil:
return err
case (rpm || dpkg) && d.Type().IsDir():
case (isRPM || dpkg) && d.Type().IsDir():
// Skip one level up from the "packages" directory so the walk also
// skips the standard library.
var pat string
switch {
case rpm:
case isRPM:
pat = `usr/lib*/python[23].*`
ev = ev.Bool("rpm_dir", true)
case dpkg:
Expand Down
103 changes: 103 additions & 0 deletions rpm/files.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package rpm

import (
"context"
"fmt"
"io/fs"
"sync"

"github.com/quay/claircore"
"github.com/quay/zlog"
)

// filesCache memoizes, per layer hash, the set of file paths recorded in
// that layer's RPM database(s). The inner value is a set (map to empty
// struct) so membership checks are O(1).
//
// A single mutex guards both the outer and inner maps; a lookup and the
// (potentially slow) population of a layer's entry happen under the same
// lock so concurrent callers for the same layer do the work only once.
type filesCache struct {
	// mu is a value, not a pointer: the zero value of sync.Mutex is ready
	// to use, and filesCache is only ever handled via *filesCache so the
	// mutex is never copied.
	mu sync.Mutex
	c  map[string]map[string]struct{}
}

// fc is the package-level cache shared by every FileInstalledByRPM call.
// A composite literal replaces the previous init() func; there is no
// ordering requirement that needs init.
var fc = &filesCache{
	c: make(map[string]map[string]struct{}),
}
// gc deletes the layer's entry from the map if the ctx is done, this ties the lifecycle of
// the cached information to the request lifecycle to avoid excessive memory consumption.
func (fc *filesCache) gc(ctx context.Context, key string) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it assumed each language scanner's context is the same? If not, is it possible we cache a layer, delete it, then have to re-read it because it was just deleted?

<-ctx.Done()
fc.mu.Lock()
defer fc.mu.Unlock()
delete(fc.c, key)
}

// getFiles looks up RPM files that exist in the RPM database using the filesFromDB
// function and memorizes the result to avoid repeated work for the same claircore.Layer.
func (fc *filesCache) getFiles(ctx context.Context, layer *claircore.Layer) (map[string]struct{}, error) {
if fc == nil {
panic("programmer error: filesCache nil")
}
fc.mu.Lock()
defer fc.mu.Unlock()
if files, ok := fc.c[layer.Hash.String()]; ok {
return files, nil
}

sys, err := layer.FS()
if err != nil {
return nil, fmt.Errorf("rpm: unable to open layer: %w", err)
}

files := map[string]struct{}{}
defer func() {
// Defer setting the cache so any early-outs don't have to worry.
fc.c[layer.Hash.String()] = files
}()
Comment on lines +56 to +59
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this seems to assume any encountered errors are permanent and not just temporary. Is that the intention?

go func(ctx context.Context) {
fc.gc(ctx, layer.Hash.String())
}(ctx)

found := make([]foundDB, 0)
if err := fs.WalkDir(sys, ".", findDBs(ctx, &found, sys)); err != nil {
return nil, fmt.Errorf("rpm: error walking fs: %w", err)
}
if len(found) == 0 {
return nil, nil
}

done := map[string]struct{}{}
zlog.Debug(ctx).Int("count", len(found)).Msg("found possible databases")
for _, db := range found {
ctx := zlog.ContextWithValues(ctx, "db", db.String())
zlog.Debug(ctx).Msg("examining database")
if _, ok := done[db.Path]; ok {
zlog.Debug(ctx).Msg("already seen, skipping")
continue
}
done[db.Path] = struct{}{}
fs, err := getDBObjects(ctx, sys, db, filesFromDB)
if err != nil {
return nil, fmt.Errorf("rpm: error getting native DBs: %w", err)
}
for _, f := range fs {
files[f.Path] = struct{}{}
}
}

return files, nil
}

// FileInstalledByRPM reports whether the file at filepath inside the given
// layer is recorded in the layer's RPM database, i.e. whether it was put
// there by an RPM package.
func FileInstalledByRPM(ctx context.Context, layer *claircore.Layer, filepath string) (bool, error) {
	fileSet, err := fc.getFiles(ctx, layer)
	if err != nil {
		return false, err
	}
	_, ok := fileSet[filepath]
	return ok, nil
}
78 changes: 78 additions & 0 deletions rpm/files_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package rpm

import (
"context"
"testing"

"github.com/quay/claircore"
"github.com/quay/claircore/test"
"github.com/quay/zlog"
)

// rpmFilesTestcases drives TestIsRPMFile: each case realizes the referenced
// container layer and checks whether filePath is reported as RPM-installed.
var rpmFilesTestcases = []struct {
	name     string        // subtest name
	isRPM    bool          // expected result of FileInstalledByRPM
	filePath string        // layer-relative path (no leading slash) to query
	layer    test.LayerRef // remote layer to realize for the test
	lenFiles int           // NOTE(review): never asserted in TestIsRPMFile — confirm intent or remove
}{
	{
		// Path owned by an RPM package in the ubi9/nodejs-18 layer.
		name:     "Found",
		isRPM:    true,
		filePath: "usr/lib/node_modules/npm/node_modules/safe-buffer/package.json",
		layer: test.LayerRef{
			Registry: "registry.access.redhat.com",
			Name:     "ubi9/nodejs-18",
			Digest:   `sha256:1ae06b64755052cef4c32979aded82a18f664c66fa7b50a6d2924afac2849c6e`,
		},
		lenFiles: 100,
	},
	{
		// Same layer, but the (deliberately misspelled) path is absent
		// from the RPM database.
		name:     "Not found",
		isRPM:    false,
		filePath: "usr/lib/node_modules/npm/node_modules/safe-buffer/package.jsonx",
		layer: test.LayerRef{
			Registry: "registry.access.redhat.com",
			Name:     "ubi9/nodejs-18",
			Digest:   `sha256:1ae06b64755052cef4c32979aded82a18f664c66fa7b50a6d2924afac2849c6e`,
		},
		lenFiles: 100,
	},
}

// TestIsRPMFile exercises FileInstalledByRPM against a real UBI layer that
// is fetched from the registry and cached between runs.
func TestIsRPMFile(t *testing.T) {
	ctx, cancel := context.WithCancel(zlog.Test(context.Background(), t))
	// Defer the cancel so the files cache's gc goroutine is released on
	// every exit path; the previous trailing cancel() call would be
	// skipped if the function returned early.
	defer cancel()
	a := test.NewCachedArena(t)

	for _, tt := range rpmFilesTestcases {
		t.Run(tt.name, func(t *testing.T) {
			a.LoadLayerFromRegistry(ctx, t, tt.layer)
			r := a.Realizer(ctx).(*test.CachedRealizer)
			t.Cleanup(func() {
				if err := r.Close(); err != nil {
					t.Error(err)
				}
			})

			realizedLayers, err := r.RealizeDescriptions(ctx, []claircore.LayerDescription{
				{
					Digest:    tt.layer.Digest,
					URI:       "http://example.com",
					MediaType: test.MediaType,
					Headers:   make(map[string][]string),
				},
			})
			if err != nil {
				t.Fatal(err)
			}
			isRPM, err := FileInstalledByRPM(ctx, &realizedLayers[0], tt.filePath)
			if err != nil {
				t.Fatal(err)
			}
			if tt.isRPM != isRPM {
				t.Errorf("expected isRPM: %t, got isRPM: %t", tt.isRPM, isRPM)
			}
		})
	}
}
Loading