From ca0cc52d47b103642c4b8516cf7a09a7a2671656 Mon Sep 17 00:00:00 2001 From: Keith Zantow Date: Wed, 12 Jun 2024 19:12:35 -0400 Subject: [PATCH] fix: separate golang license caches from mod dir (#2852) Signed-off-by: Keith Zantow --- cmd/syft/internal/commands/attest.go | 2 + cmd/syft/internal/commands/scan.go | 2 + cmd/syft/internal/options/cache.go | 122 ++++++++ cmd/syft/internal/options/cache_test.go | 184 +++++++++++ go.mod | 6 +- internal/cache/README.md | 51 +++ internal/cache/bypass.go | 24 ++ internal/cache/bypass_test.go | 18 ++ internal/cache/cache.go | 48 +++ internal/cache/cache_test.go | 21 ++ internal/cache/error_resolver.go | 40 +++ internal/cache/error_resolver_test.go | 47 +++ internal/cache/filesystem.go | 120 +++++++ internal/cache/filesystem_test.go | 94 ++++++ internal/cache/hash_type.go | 71 +++++ internal/cache/hash_type_test.go | 123 ++++++++ internal/cache/memory.go | 16 + internal/cache/memory_test.go | 37 +++ internal/cache/resolver.go | 83 +++++ internal/cache/resolver_test.go | 92 ++++++ syft/pkg/cataloger/golang/cataloger.go | 6 +- syft/pkg/cataloger/golang/config.go | 28 +- syft/pkg/cataloger/golang/config_test.go | 3 +- syft/pkg/cataloger/golang/licenses.go | 293 ++++++++++++------ syft/pkg/cataloger/golang/licenses_test.go | 32 +- syft/pkg/cataloger/golang/package.go | 2 +- syft/pkg/cataloger/golang/parse_go_binary.go | 4 +- syft/pkg/cataloger/golang/parse_go_mod.go | 12 +- .../pkg/cataloger/golang/parse_go_mod_test.go | 4 +- syft/source/directorysource/cache_excludes.go | 33 ++ .../directorysource/directory_source.go | 3 + 31 files changed, 1484 insertions(+), 137 deletions(-) create mode 100644 cmd/syft/internal/options/cache.go create mode 100644 cmd/syft/internal/options/cache_test.go create mode 100644 internal/cache/README.md create mode 100644 internal/cache/bypass.go create mode 100644 internal/cache/bypass_test.go create mode 100644 internal/cache/cache.go create mode 100644 internal/cache/cache_test.go create mode 100644 
internal/cache/error_resolver.go create mode 100644 internal/cache/error_resolver_test.go create mode 100644 internal/cache/filesystem.go create mode 100644 internal/cache/filesystem_test.go create mode 100644 internal/cache/hash_type.go create mode 100644 internal/cache/hash_type_test.go create mode 100644 internal/cache/memory.go create mode 100644 internal/cache/memory_test.go create mode 100644 internal/cache/resolver.go create mode 100644 internal/cache/resolver_test.go create mode 100644 syft/source/directorysource/cache_excludes.go diff --git a/cmd/syft/internal/commands/attest.go b/cmd/syft/internal/commands/attest.go index 8e01cbc6823..374945eff3f 100644 --- a/cmd/syft/internal/commands/attest.go +++ b/cmd/syft/internal/commands/attest.go @@ -43,6 +43,7 @@ type attestOptions struct { options.UpdateCheck `yaml:",inline" mapstructure:",squash"` options.Catalog `yaml:",inline" mapstructure:",squash"` Attest options.Attest `yaml:"attest" mapstructure:"attest"` + Cache options.Cache `json:"-" yaml:"cache" mapstructure:"cache"` } func Attest(app clio.Application) *cobra.Command { @@ -77,6 +78,7 @@ func defaultAttestOptions() attestOptions { Output: defaultAttestOutputOptions(), UpdateCheck: options.DefaultUpdateCheck(), Catalog: options.DefaultCatalog(), + Cache: options.DefaultCache(), } } diff --git a/cmd/syft/internal/commands/scan.go b/cmd/syft/internal/commands/scan.go index 31f3277efb2..ff5b4254f65 100644 --- a/cmd/syft/internal/commands/scan.go +++ b/cmd/syft/internal/commands/scan.go @@ -68,6 +68,7 @@ type scanOptions struct { options.Output `yaml:",inline" mapstructure:",squash"` options.UpdateCheck `yaml:",inline" mapstructure:",squash"` options.Catalog `yaml:",inline" mapstructure:",squash"` + Cache options.Cache `json:"-" yaml:"cache" mapstructure:"cache"` } func defaultScanOptions() *scanOptions { @@ -75,6 +76,7 @@ func defaultScanOptions() *scanOptions { Output: options.DefaultOutput(), UpdateCheck: options.DefaultUpdateCheck(), Catalog: 
options.DefaultCatalog(), + Cache: options.DefaultCache(), } } diff --git a/cmd/syft/internal/options/cache.go b/cmd/syft/internal/options/cache.go new file mode 100644 index 00000000000..d8e4130a263 --- /dev/null +++ b/cmd/syft/internal/options/cache.go @@ -0,0 +1,122 @@ +package options + +import ( + "fmt" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "time" + + "github.com/adrg/xdg" + "github.com/mitchellh/go-homedir" + + "github.com/anchore/clio" + "github.com/anchore/syft/internal/cache" + "github.com/anchore/syft/internal/log" +) + +// Cache provides configuration for the Syft caching behavior +type Cache struct { + Dir string `yaml:"dir" mapstructure:"dir"` + TTL string `yaml:"ttl" mapstructure:"ttl"` +} + +func (c *Cache) DescribeFields(descriptions clio.FieldDescriptionSet) { + descriptions.Add(&c.Dir, "root directory to cache any downloaded content") + descriptions.Add(&c.TTL, "time to live for cached data") +} + +func (c *Cache) PostLoad() error { + if c.Dir != "" { + ttl, err := parseDuration(c.TTL) + if err != nil { + log.Warnf("unable to parse duration '%v', using default (%s) due to: %v", c.TTL, durationToString(defaultTTL()), err) + ttl = defaultTTL() + } + dir, err := homedir.Expand(c.Dir) + if err != nil { + log.Warnf("unable to expand cache directory %s: %v", c.Dir, err) + cache.SetManager(cache.NewInMemory(ttl)) + } else { + m, err := cache.NewFromDir(dir, ttl) + if err != nil { + log.Warnf("unable to get filesystem cache at %s: %v", c.Dir, err) + cache.SetManager(cache.NewInMemory(ttl)) + } else { + cache.SetManager(m) + } + } + } + return nil +} + +var _ interface { + clio.PostLoader + clio.FieldDescriber +} = (*Cache)(nil) + +func DefaultCache() Cache { + return Cache{ + Dir: defaultDir(), + TTL: durationToString(defaultTTL()), + } +} + +func defaultTTL() time.Duration { + return 7 * 24 * time.Hour +} + +func defaultDir() string { + var err error + cacheRoot := xdg.CacheHome + if cacheRoot == "" { + cacheRoot, err = 
// durationToString renders a duration in the extended form accepted by
// parseDuration: whole days are emitted as "<n>d", followed by the standard
// time.Duration string for any sub-day remainder (e.g. 7*24h+1h -> "7d1h0m0s").
func durationToString(duration time.Duration) string {
	days := int64(duration / (24 * time.Hour))
	remain := duration % (24 * time.Hour)
	out := ""
	if days > 0 {
		out = fmt.Sprintf("%vd", days)
	}
	if remain != 0 {
		out += remain.String()
	}
	if out == "" {
		return "0"
	}
	return out
}

// whitespace matches runs of whitespace, stripped before parsing so "7d 1h" works
var whitespace = regexp.MustCompile(`\s+`)

// parseDuration parses a time.Duration string that may additionally carry a
// leading whole-day component, e.g. "7d", "7d1h1m1s", or "2h 5m"; whitespace
// and letter case are ignored.
func parseDuration(duration string) (time.Duration, error) {
	duration = strings.ToLower(whitespace.ReplaceAllString(duration, ""))
	dayPart, rest, hasDays := strings.Cut(duration, "d")
	var days time.Duration
	if hasDays {
		numDays, err := strconv.Atoi(dayPart)
		if err != nil {
			return 0, err
		}
		days = time.Duration(numDays) * 24 * time.Hour
		if rest == "" {
			// bug fix: a bare "<n>d" (including the default "7d") has no sub-day
			// part; time.ParseDuration("") would return an error, causing valid
			// configured values to be rejected with a spurious warning
			return days, nil
		}
	} else {
		rest = duration
	}
	remain, err := time.ParseDuration(rest)
	return days + remain, err
}
"xdg-cache", + env: map[string]string{ + "XDG_CACHE_HOME": xdgCacheDir, + }, + expected: xdgCacheDir, + }, + } + + // capture all the initial environment variables to reset them before we reset library caches + env := map[string]string{ + "HOME": "", + "XDG_DATA_HOME": "", + "XDG_DATA_DIRS": "", + "XDG_CONFIG_HOME": "", + "XDG_CONFIG_DIRS": "", + "XDG_STATE_HOME": "", + "XDG_CACHE_HOME": "", + "XDG_RUNTIME_DIR": "", + } + for k := range env { + env[k] = os.Getenv(k) + } + + unsetEnv := func(t *testing.T) { + for k := range env { + t.Setenv(k, "") + } + } + + resetEnv := func() { + for k, v := range env { + if v == "" { + _ = os.Unsetenv(k) + } else { + _ = os.Setenv(k, v) + } + } + homedir.Reset() + xdg.Reload() + } + + t.Cleanup(resetEnv) + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + defer resetEnv() + + unsetEnv(t) + for k, v := range test.env { + t.Setenv(k, v) + } + homedir.Reset() + xdg.Reload() + + got := defaultDir() + + require.True(t, strings.HasPrefix(got, test.expected)) + }) + } +} + +func Test_parseDuration(t *testing.T) { + tests := []struct { + duration string + expect time.Duration + err require.ErrorAssertionFunc + }{ + { + duration: "1d", + expect: 24 * time.Hour, + }, + { + duration: "7d", + expect: 7 * 24 * time.Hour, + }, + { + duration: "365D", + expect: 365 * 24 * time.Hour, + }, + { + duration: "7d1h1m1s", + expect: 7*24*time.Hour + time.Hour + time.Minute + time.Second, + }, + { + duration: "7d 1h 1m 1s", + expect: 7*24*time.Hour + time.Hour + time.Minute + time.Second, + }, + { + duration: "2h", + expect: 2 * time.Hour, + }, + { + duration: "2h5m", + expect: 2*time.Hour + 5*time.Minute, + }, + { + duration: "2h 5m", + expect: 2*time.Hour + 5*time.Minute, + }, + { + duration: "d24h", + err: require.Error, + }, + } + + for _, test := range tests { + t.Run(test.duration, func(t *testing.T) { + got, err := parseDuration(test.duration) + if test.err != nil { + test.err(t, err) + return + } + require.Equal(t, 
test.expect, got) + }) + } +} + +func Test_durationToString(t *testing.T) { + tests := []struct { + duration time.Duration + expect string + err require.ErrorAssertionFunc + }{ + { + expect: "1d", + duration: 24 * time.Hour, + }, + { + expect: "7d", + duration: 7 * 24 * time.Hour, + }, + { + expect: "7d1h1m1s", + duration: 7*24*time.Hour + time.Hour + time.Minute + time.Second, + }, + { + expect: "2h0m0s", + duration: 2 * time.Hour, + }, + { + expect: "2h5m0s", + duration: 2*time.Hour + 5*time.Minute, + }, + } + + for _, test := range tests { + t.Run(test.expect, func(t *testing.T) { + got := durationToString(test.duration) + require.Equal(t, test.expect, got) + }) + } +} diff --git a/go.mod b/go.mod index 4aacbc6287f..3c345cef00c 100644 --- a/go.mod +++ b/go.mod @@ -86,7 +86,10 @@ require ( require google.golang.org/genproto v0.0.0-20231106174013-bbf56f31fb17 // indirect -require github.com/magiconair/properties v1.8.7 +require ( + github.com/adrg/xdg v0.4.0 + github.com/magiconair/properties v1.8.7 +) require ( dario.cat/mergo v1.0.0 // indirect @@ -98,7 +101,6 @@ require ( github.com/Microsoft/go-winio v0.6.1 // indirect github.com/Microsoft/hcsshim v0.11.4 // indirect github.com/ProtonMail/go-crypto v1.0.0 // indirect - github.com/adrg/xdg v0.4.0 // indirect github.com/anchore/go-struct-converter v0.0.0-20221118182256-c68fdcfa2092 // indirect github.com/andybalholm/brotli v1.0.4 // indirect github.com/aquasecurity/go-version v0.0.0-20210121072130-637058cfe492 // indirect diff --git a/internal/cache/README.md b/internal/cache/README.md new file mode 100644 index 00000000000..174e03b9e2c --- /dev/null +++ b/internal/cache/README.md @@ -0,0 +1,51 @@ +# Caching + +All caches are created from a global `manager`. By defaut this is a `bypassedCache`, which performs no caching. +One benefit of this is that tests don't need to worry about caching causing issues unless they explicitly need +to test the cache and can opt-in using the `cache.TestCache(t)` helper. 
+ +Syft sets a `filesystemCache` when the [cache options](../../cmd/syft/internal/options/cache.go) are loaded. + +When using the `filesystemCache` all items are stored on disk under a root directory, generally in the form of: +``` +///path/to/data +``` + +# Using the cache + +The easiest and preferred method to use the cache is a `cache.Resolver`, which automatically creates a `` +based on the _structure_ of the provided type. +If the structure changes in any way it will end up with a new version key and all will re populate this new key, +ignoring cached values from older, different versions. +The resolver will store items using the `json` package to serialize/deserialize values, so to save space +it is encouraged to use `omitempty`. For example: + +```go +type myCacheItem struct { + Name string `json:"name",omitempty` +} +``` + +It is possible to use core types such as `pkg.Package` as long as they support the standard `json` serialization, +but this is discouraged in order to decouple changes to them from affecting the information stored in the cache. + +To get a cache for this type: +```go +resolver := cache.GetResolver[myCacheItem]("myCacheName", "v1") +``` + +Using the `resolver` is a single call, which manages checking for items in the cache, expiry times, +and if not found invoking the callback to populate the cache and return a value: +```go +data := resolver.Resolve("some/cache/key", func() (myCacheItem, error) { + // do things to return a myCacheItem or error +}) +``` + +If it is common that checking for an item will result in errors, and you do not want to re-run the resolve function +when errors are encountered, instead of using `GetResolver`, you can use `GetResolverCachingErrors`, which is useful +for things such as resolving artifacts over a network, where a number of them will not be resolved, and you do not want +to continue to have the expense of running the network resolution. 
This should be used when it is acceptable a network +outage and cached errors is an acceptable risk. + +An example can be seen in the [golang cataloger](../../syft/pkg/cataloger/golang/licenses.go) fetching remote licenses. diff --git a/internal/cache/bypass.go b/internal/cache/bypass.go new file mode 100644 index 00000000000..4d32f1a613d --- /dev/null +++ b/internal/cache/bypass.go @@ -0,0 +1,24 @@ +package cache + +import "io" + +type bypassedCache struct{} + +func (b *bypassedCache) Read(_ string) (ReaderAtCloser, error) { + return nil, errNotFound +} + +func (b *bypassedCache) Write(_ string, contents io.Reader) error { + if closer, ok := contents.(io.Closer); ok { + _ = closer.Close() + } + return nil +} + +func (b *bypassedCache) GetCache(_, _ string) Cache { + return b +} + +func (b *bypassedCache) RootDirs() []string { + return nil +} diff --git a/internal/cache/bypass_test.go b/internal/cache/bypass_test.go new file mode 100644 index 00000000000..073c0ef36a7 --- /dev/null +++ b/internal/cache/bypass_test.go @@ -0,0 +1,18 @@ +package cache + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func Test_bypassedCache(t *testing.T) { + m := bypassedCache{} + cache := m.GetCache("name", "version") + err := cache.Write("test", strings.NewReader("value")) + require.NoError(t, err) + rdr, err := cache.Read("test") + require.Nil(t, rdr) + require.ErrorIs(t, err, errNotFound) +} diff --git a/internal/cache/cache.go b/internal/cache/cache.go new file mode 100644 index 00000000000..02208504e97 --- /dev/null +++ b/internal/cache/cache.go @@ -0,0 +1,48 @@ +package cache + +import ( + "io" +) + +// Manager is responsible for managing cache data and instantiating all caches +type Manager interface { + // GetCache returns a cache scoped to the given named, versioned data + GetCache(name, version string) Cache + + // RootDirs returns any root directories this cache manager uses + RootDirs() []string +} + +// ReaderAtCloser is an amalgamation 
of: io.Reader, io.ReaderAt, and io.Closer +type ReaderAtCloser interface { + io.Reader + io.ReaderAt + io.Closer +} + +// Cache is what the application interacts with to get and set cached data +type Cache interface { + // Read returns a reader for the cache value, if found and not expired + // or errors when unable to find / expired + Read(key string) (ReaderAtCloser, error) + + // Write writes the contents of the reader to the cache + // and closes it, if the reader implements io.Closer + Write(key string, contents io.Reader) error +} + +// GetManager returns the global cache manager, which is used to instantiate all caches +func GetManager() Manager { + return manager +} + +// SetManager sets the global cache manager, which is used to instantiate all caches +func SetManager(m Manager) { + if m == nil { + manager = &bypassedCache{} + } else { + manager = m + } +} + +var manager Manager = &bypassedCache{} diff --git a/internal/cache/cache_test.go b/internal/cache/cache_test.go new file mode 100644 index 00000000000..548a4b03b7c --- /dev/null +++ b/internal/cache/cache_test.go @@ -0,0 +1,21 @@ +package cache + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func Test_SetManager(t *testing.T) { + original := GetManager() + defer SetManager(original) + + SetManager(NewInMemory(0)) + + require.NotNil(t, GetManager()) + require.IsType(t, &filesystemCache{}, GetManager()) + + SetManager(nil) + require.NotNil(t, GetManager()) + require.IsType(t, &bypassedCache{}, GetManager()) +} diff --git a/internal/cache/error_resolver.go b/internal/cache/error_resolver.go new file mode 100644 index 00000000000..614d707f1fa --- /dev/null +++ b/internal/cache/error_resolver.go @@ -0,0 +1,40 @@ +package cache + +import "fmt" + +// GetResolverCachingErrors returns a Resolver that caches errors and will return them +// instead of continuing to call the provided resolve functions +func GetResolverCachingErrors[T any](name, version string) Resolver[T] { + return 
&errorResolver[T]{ + resolver: GetResolver[errResponse[T]](name, version), + } +} + +type errResponse[T any] struct { + Error string `json:"err,omitempty"` + Value T `json:"val,omitempty"` +} + +type errorResolver[T any] struct { + resolver Resolver[errResponse[T]] +} + +func (r *errorResolver[T]) Resolve(key string, resolver resolverFunc[T]) (T, error) { + v, err := r.resolver.Resolve(key, func() (errResponse[T], error) { + v, err := resolver() + out := errResponse[T]{ + Value: v, + } + if err != nil { + out.Error = err.Error() + } + return out, nil + }) + if err != nil { + return v.Value, err + } + if v.Error != "" { + return v.Value, fmt.Errorf(v.Error) + } + return v.Value, nil +} diff --git a/internal/cache/error_resolver_test.go b/internal/cache/error_resolver_test.go new file mode 100644 index 00000000000..b7216a536b8 --- /dev/null +++ b/internal/cache/error_resolver_test.go @@ -0,0 +1,47 @@ +package cache + +import ( + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func Test_errorResolver(t *testing.T) { + original := GetManager() + defer SetManager(original) + SetManager(NewInMemory(time.Hour)) + + resolver := GetResolverCachingErrors[string]("theCache", "theVersion") + + resolveCount := 0 + resolveFunc := func() (string, error) { + resolveCount++ + return "theValue", nil + } + + val, err := resolver.Resolve("theKey", resolveFunc) + require.NoError(t, err) + require.Equal(t, 1, resolveCount) + require.Equal(t, "theValue", val) + + val, err = resolver.Resolve("theKey", resolveFunc) + require.NoError(t, err) + require.Equal(t, 1, resolveCount) + require.Equal(t, "theValue", val) + + errorCount := 0 + errorFunc := func() (string, error) { + errorCount++ + return "", fmt.Errorf("an error") + } + + _, err = resolver.Resolve("errorValue", errorFunc) + require.ErrorContains(t, err, "an error") + require.Equal(t, 1, errorCount) + + _, err = resolver.Resolve("errorValue", errorFunc) + require.ErrorContains(t, err, "an error") + 
require.Equal(t, 1, errorCount) +} diff --git a/internal/cache/filesystem.go b/internal/cache/filesystem.go new file mode 100644 index 00000000000..8b628cc50d2 --- /dev/null +++ b/internal/cache/filesystem.go @@ -0,0 +1,120 @@ +package cache + +import ( + "errors" + "fmt" + "io" + "net/url" + "os" + "path/filepath" + "regexp" + "strings" + "time" + + "github.com/spf13/afero" + + "github.com/anchore/syft/internal/log" +) + +// NewFromDir creates a new cache manager which returns caches stored on disk, rooted at the given directory +func NewFromDir(dir string, ttl time.Duration) (Manager, error) { + dir = filepath.Clean(dir) + fsys, err := subFs(afero.NewOsFs(), dir) + if err != nil { + return nil, err + } + return &filesystemCache{ + dir: dir, + fs: fsys, + ttl: ttl, + }, nil +} + +const filePermissions = 0700 +const dirPermissions = os.ModeDir | filePermissions + +type filesystemCache struct { + dir string + fs afero.Fs + ttl time.Duration +} + +func (d *filesystemCache) GetCache(name, version string) Cache { + fsys, err := subFs(d.fs, name, version) + if err != nil { + log.Warnf("error getting cache for: %s/%s: %v", name, version, err) + return &bypassedCache{} + } + return &filesystemCache{ + dir: filepath.Join(d.dir, name, version), + fs: fsys, + ttl: d.ttl, + } +} + +func (d *filesystemCache) RootDirs() []string { + if d.dir == "" { + return nil + } + return []string{d.dir} +} + +func (d *filesystemCache) Read(key string) (ReaderAtCloser, error) { + path := makeDiskKey(key) + f, err := d.fs.Open(path) + if err != nil { + log.Tracef("no cache entry for %s %s: %v", d.dir, key, err) + return nil, errNotFound + } else if stat, err := f.Stat(); err != nil || stat == nil || time.Since(stat.ModTime()) > d.ttl { + log.Tracef("cache entry is too old for %s %s", d.dir, key) + return nil, errExpired + } + log.Tracef("using cache for %s %s", d.dir, key) + return f, nil +} + +func (d *filesystemCache) Write(key string, contents io.Reader) error { + path := makeDiskKey(key) 
// keyReplacer matches any character that is not safe to use directly in a
// path segment; forward slashes are intentionally excluded so that keys can
// partition into subdirectories on disk.
var keyReplacer = regexp.MustCompile("[^-._/a-zA-Z0-9]")

// makeDiskKey converts a cache key into a safe relative path: disallowed
// characters are percent-encoded, forward slashes are preserved for logical
// partitioning, and the "." and ".." path elements are encoded so a key can
// never escape the cache root.
func makeDiskKey(key string) string {
	if key == "." {
		// a lone dot would resolve to the cache directory itself
		return "%2E"
	}
	escaped := keyReplacer.ReplaceAllStringFunc(key, url.QueryEscape)
	// "." is allowed within names, but ".." must never survive as a path element
	return strings.ReplaceAll(escaped, "..", "%2E%2E")
}

var errNotFound = fmt.Errorf("not found")
var errExpired = fmt.Errorf("expired")
tests { + t.Run(test.in, func(t *testing.T) { + got := makeDiskKey(test.in) + // validate appropriate escaping + require.Equal(t, test.expected, got) + // also validate that unescaped string matches original + unescaped, err := url.QueryUnescape(got) + require.NoError(t, err) + require.Equal(t, test.in, unescaped) + }) + } +} diff --git a/internal/cache/hash_type.go b/internal/cache/hash_type.go new file mode 100644 index 00000000000..c7b3c4a4233 --- /dev/null +++ b/internal/cache/hash_type.go @@ -0,0 +1,71 @@ +package cache + +import ( + "fmt" + "reflect" + + "github.com/mitchellh/hashstructure/v2" +) + +// hashType returns a stable hash based on the structure of the type +func hashType[T any]() string { + // get the base type and hash an empty instance + var t T + empty := emptyValue(reflect.TypeOf(t)).Interface() + hash, err := hashstructure.Hash(empty, hashstructure.FormatV2, &hashstructure.HashOptions{ + ZeroNil: false, + IgnoreZeroValue: false, + SlicesAsSets: false, + UseStringer: false, + }) + if err != nil { + panic(fmt.Errorf("unable to use type as cache key: %w", err)) + } + return fmt.Sprintf("%x", hash) +} + +func emptyValue(t reflect.Type) reflect.Value { + switch t.Kind() { + case reflect.Pointer: + e := t.Elem() + v := emptyValue(e) + if v.CanAddr() { + return v.Addr() + } + ptrv := reflect.New(e) + ptrv.Elem().Set(v) + return ptrv + case reflect.Slice: + v := emptyValue(t.Elem()) + s := reflect.MakeSlice(t, 1, 1) + s.Index(0).Set(v) + return s + case reflect.Struct: + v := reflect.New(t).Elem() + // get all empty field values, too + for i := 0; i < v.NumField(); i++ { + f := t.Field(i) + if isIgnored(f) { + continue + } + fv := v.Field(i) + if fv.CanSet() { + fv.Set(emptyValue(f.Type)) + } + } + return v + default: + return reflect.New(t).Elem() + } +} + +func isIgnored(f reflect.StructField) bool { + if !f.IsExported() { + return true + } + tag := f.Tag.Get("hash") + if tag == "-" || tag == "ignore" { + return true + } + return false +} diff --git 
a/internal/cache/hash_type_test.go b/internal/cache/hash_type_test.go new file mode 100644 index 00000000000..3fe24cbd3ee --- /dev/null +++ b/internal/cache/hash_type_test.go @@ -0,0 +1,123 @@ +package cache + +import ( + "fmt" + "testing" + + "github.com/mitchellh/hashstructure/v2" + "github.com/stretchr/testify/require" +) + +func Test_hashType(t *testing.T) { + type t1 struct { + Name string + } + type t2 struct { + Name string + } + type generic[T any] struct { + Val T + } + tests := []struct { + name string + hash func() string + expected string + }{ + { + name: "struct 1", + hash: func() string { return hashType[t1]() }, + expected: "d106c3ffbf98a0b1", + }, + { + name: "slice of struct 1", + hash: func() string { return hashType[[]t1]() }, + expected: "8122ace4ee1af0b4", + }, + { + name: "slice of struct 2", + hash: func() string { return hashType[[]t2]() }, + expected: "8cc04b5808be5bf9", + }, + { + name: "ptr 1", + hash: func() string { return hashType[*t1]() }, + expected: "d106c3ffbf98a0b1", // same hash as t1, which is ok since the structs are the same + }, + { + name: "slice of ptr 1", + hash: func() string { return hashType[[]*t1]() }, + expected: "8122ace4ee1af0b4", // same hash as []t1, again underlying serialization is the same + }, + { + name: "slice of ptr 2", + hash: func() string { return hashType[[]*t2]() }, + expected: "8cc04b5808be5bf9", // same hash as []t2, underlying serialization is the same + }, + { + name: "slice of ptr of slice of ptr", + hash: func() string { return hashType[[]*[]*t1]() }, + expected: "500d9f5b3a5977ce", + }, + { + name: "generic 1", + hash: func() string { return hashType[generic[t1]]() }, + expected: "b5fbb30e24400e81", + }, + { + name: "generic 2", + hash: func() string { return hashType[generic[t2]]() }, + expected: "becdb767c6b22bfa", + }, + { + name: "generic with ptr 1", + hash: func() string { return hashType[generic[*t1]]() }, + expected: "30c8855bf290fd83", + }, + { + name: "generic with ptr 2", + hash: 
func() string { return hashType[generic[*t2]]() }, + expected: "b66366b6ce9e6361", + }, + { + name: "generic with slice 1", + hash: func() string { return hashType[generic[[]t1]]() }, + expected: "d2ed158942fa6c29", + }, + { + name: "generic with slice 2", + hash: func() string { return hashType[generic[[]t2]]() }, + expected: "7a7bec575871c179", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + require.Equal(t, test.expected, test.hash()) + }) + } +} + +func Test_hashIgnores(t *testing.T) { + hash := func(v any) string { + v, err := hashstructure.Hash(v, hashstructure.FormatV2, &hashstructure.HashOptions{}) + require.NoError(t, err) + return fmt.Sprintf("%x", v) + } + type t1 struct { + Name string + notExported string + } + require.Equal(t, hash(t1{notExported: "a value"}), hashType[t1]()) + + type t2 struct { + Name string + Exported string `hash:"ignore"` + } + require.Equal(t, hash(t2{Exported: "another value"}), hashType[t2]()) + + type t3 struct { + Name string + Exported string `hash:"-"` + } + require.Equal(t, hash(t3{Exported: "still valued"}), hashType[t3]()) +} diff --git a/internal/cache/memory.go b/internal/cache/memory.go new file mode 100644 index 00000000000..247e67e0d4b --- /dev/null +++ b/internal/cache/memory.go @@ -0,0 +1,16 @@ +package cache + +import ( + "time" + + "github.com/spf13/afero" +) + +// NewInMemory returns an in-memory only cache manager +func NewInMemory(ttl time.Duration) Manager { + return &filesystemCache{ + dir: "", + fs: afero.NewMemMapFs(), + ttl: ttl, + } +} diff --git a/internal/cache/memory_test.go b/internal/cache/memory_test.go new file mode 100644 index 00000000000..e8bcbef7fa2 --- /dev/null +++ b/internal/cache/memory_test.go @@ -0,0 +1,37 @@ +package cache + +import ( + "io" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/anchore/syft/internal" +) + +func Test_NewInMemory(t *testing.T) { + man := NewInMemory(time.Hour) + + cacheName := 
"test" + cacheVersion := "v1" + cache := man.GetCache(cacheName, cacheVersion) + + cacheKey := "test-key" + contentsValue := "some contents to cache" + + err := cache.Write(cacheKey, strings.NewReader(contentsValue)) + require.NoError(t, err) + + rdr, err := cache.Read(cacheKey) + require.NoError(t, err) + defer internal.CloseAndLogError(rdr, cacheKey) + + contents, err := io.ReadAll(rdr) + require.NoError(t, err) + require.Equal(t, contentsValue, string(contents)) + + _, err = cache.Read("otherKey") + require.ErrorIs(t, err, errNotFound) +} diff --git a/internal/cache/resolver.go b/internal/cache/resolver.go new file mode 100644 index 00000000000..68af21a2be5 --- /dev/null +++ b/internal/cache/resolver.go @@ -0,0 +1,83 @@ +package cache + +import ( + "bytes" + "encoding/json" + "fmt" + "path" + + "github.com/anchore/syft/internal" + "github.com/anchore/syft/internal/log" +) + +// Resolver interface provides a single Resolve method, which will return from cache +// or call the provided resolve function to get the value if not available in cache +type Resolver[T any] interface { + // Resolve attempts to resolve the given key from cache and convert it to the type of the cache, + // or calls the resolver function if unable to resolve a cached value + Resolve(key string, resolver resolverFunc[T]) (T, error) +} + +// GetResolver returns a cache resolver for persistent cached data across Syft runs, stored in a unique +// location based on the provided name and versioned by the type +func GetResolver[T any](name, version string) Resolver[T] { + typeHash := hashType[T]() + versionKey := path.Join(version, typeHash) + return &cacheResolver[T]{ + name: fmt.Sprintf("%s/%s", name, versionKey), + cache: manager.GetCache(name, versionKey), + } +} + +const resolverKeySuffix = ".json" + +type resolverFunc[T any] func() (T, error) + +type cacheResolver[T any] struct { + name string + cache Cache +} + +var _ interface { + Resolver[int] +} = (*cacheResolver[int])(nil) + +func (r 
*cacheResolver[T]) Resolve(key string, resolver resolverFunc[T]) (T, error) { + key += resolverKeySuffix + + rdr, err := r.cache.Read(key) + if rdr == nil || err != nil { + return r.resolveAndCache(key, resolver) + } + defer internal.CloseAndLogError(rdr, key) + + dec := json.NewDecoder(rdr) + if dec == nil { + log.Tracef("error getting cache json decoder for %s %v: %v", r.name, key, err) + return r.resolveAndCache(key, resolver) + } + var t T + err = dec.Decode(&t) + if err != nil { + log.Tracef("error decoding cached entry for %s %v: %v", r.name, key, err) + return r.resolveAndCache(key, resolver) + } + // no error, able to resolve from cache + return t, nil +} + +func (r *cacheResolver[T]) resolveAndCache(key string, resolver func() (T, error)) (T, error) { + t, err := resolver() + if err != nil { + return t, err + } + var data bytes.Buffer + enc := json.NewEncoder(&data) + enc.SetEscapeHTML(false) + err = enc.Encode(t) + if err != nil { + return t, err + } + err = r.cache.Write(key, &data) + return t, err +} diff --git a/internal/cache/resolver_test.go b/internal/cache/resolver_test.go new file mode 100644 index 00000000000..4a74d052616 --- /dev/null +++ b/internal/cache/resolver_test.go @@ -0,0 +1,92 @@ +package cache + +import ( + "encoding/json" + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func Test_Resolver(t *testing.T) { + original := GetManager() + defer SetManager(original) + SetManager(NewInMemory(time.Hour)) + + type sub struct { + Name string + Value bool + } + + type thing struct { + Value string + Values []int + Subs []*sub + } + + versionHash := hashType[thing]() + cache := GetManager().GetCache("test", "v7/"+versionHash) + + resolver := GetResolver[thing]("test", "v7") + require.NotNil(t, resolver) + + require.IsType(t, &cacheResolver[thing]{}, resolver) + cr := resolver.(*cacheResolver[thing]) + + require.IsType(t, cache, cr.cache) + + resolveErrCount := 0 + resolveThingErr := func() (thing, error) { + 
resolveErrCount++ + return thing{}, fmt.Errorf("an error") + } + + _, err := resolver.Resolve("err", resolveThingErr) + require.ErrorContains(t, err, "an error") + require.Equal(t, 1, resolveErrCount) + + _, err = resolver.Resolve("err", resolveThingErr) + require.ErrorContains(t, err, "an error") + require.Equal(t, 2, resolveErrCount) + + aThing := thing{ + Value: "a value", + Values: []int{7, 8, 9}, + Subs: []*sub{ + { + Name: "sub1", + Value: true, + }, + { + Name: "sub2", + Value: false, + }, + }, + } + + resolveThingCount := 0 + resolveThing := func() (thing, error) { + resolveThingCount++ + return aThing, nil + } + + val, err := resolver.Resolve("thing", resolveThing) + require.NoError(t, err) + require.Equal(t, 1, resolveThingCount) + require.Equal(t, aThing, val) + + val, err = resolver.Resolve("thing", resolveThing) + require.NoError(t, err) + require.Equal(t, 1, resolveThingCount) + require.Equal(t, aThing, val) + + rdr, err := cache.Read("thing" + resolverKeySuffix) + require.NoError(t, err) + decoder := json.NewDecoder(rdr) + + var val2 thing + err = decoder.Decode(&val2) + require.NoError(t, err) + require.Equal(t, aThing, val2) +} diff --git a/syft/pkg/cataloger/golang/cataloger.go b/syft/pkg/cataloger/golang/cataloger.go index 6798b15a213..c7d483faa09 100644 --- a/syft/pkg/cataloger/golang/cataloger.go +++ b/syft/pkg/cataloger/golang/cataloger.go @@ -20,12 +20,8 @@ const ( // NewGoModuleFileCataloger returns a new cataloger object that searches within go.mod files. func NewGoModuleFileCataloger(opts CatalogerConfig) pkg.Cataloger { - c := goModCataloger{ - licenses: newGoLicenses(modFileCatalogerName, opts), - } - return generic.NewCataloger(modFileCatalogerName). - WithParserByGlobs(c.parseGoModFile, "**/go.mod") + WithParserByGlobs(newGoModCataloger(opts).parseGoModFile, "**/go.mod") } // NewGoModuleBinaryCataloger returns a new cataloger object that searches within binaries built by the go compiler. 
diff --git a/syft/pkg/cataloger/golang/config.go b/syft/pkg/cataloger/golang/config.go index e55ed5d0bda..aa7bc7708a1 100644 --- a/syft/pkg/cataloger/golang/config.go +++ b/syft/pkg/cataloger/golang/config.go @@ -41,6 +41,7 @@ type MainModuleVersionConfig struct { func DefaultCatalogerConfig() CatalogerConfig { g := CatalogerConfig{ MainModuleVersion: DefaultMainModuleVersionConfig(), + LocalModCacheDir: defaultGoModDir(), } // first process the proxy settings @@ -67,22 +68,23 @@ func DefaultCatalogerConfig() CatalogerConfig { } } - if g.LocalModCacheDir == "" { - goPath := os.Getenv("GOPATH") + return g +} - if goPath == "" { - homeDir, err := homedir.Dir() - if err != nil { - log.Debug("unable to determine user home dir: %v", err) - } else { - goPath = filepath.Join(homeDir, "go") - } - } - if goPath != "" { - g.LocalModCacheDir = filepath.Join(goPath, "pkg", "mod") +// defaultGoModDir returns $GOPATH/pkg/mod or $HOME/go/pkg/mod based on environment variables available +func defaultGoModDir() string { + goPath := os.Getenv("GOPATH") + + if goPath == "" { + homeDir, err := homedir.Dir() + if err != nil { + log.Warnf("unable to determine GOPATH or user home dir: %w", err) + return "" } + goPath = filepath.Join(homeDir, "go") } - return g + + return filepath.Join(goPath, "pkg", "mod") } func DefaultMainModuleVersionConfig() MainModuleVersionConfig { diff --git a/syft/pkg/cataloger/golang/config_test.go b/syft/pkg/cataloger/golang/config_test.go index 5719b312fae..2720dc1b33c 100644 --- a/syft/pkg/cataloger/golang/config_test.go +++ b/syft/pkg/cataloger/golang/config_test.go @@ -1,6 +1,7 @@ package golang import ( + "path/filepath" "testing" "github.com/mitchellh/go-homedir" @@ -47,7 +48,7 @@ func Test_Config(t *testing.T) { opts: opts{}, expected: CatalogerConfig{ SearchLocalModCacheLicenses: false, - LocalModCacheDir: "/go/pkg/mod", + LocalModCacheDir: filepath.Join("/go", "pkg", "mod"), SearchRemoteLicenses: false, Proxies: []string{"https://my.proxy"}, NoProxy: 
[]string{"my.private", "no.proxy"}, diff --git a/syft/pkg/cataloger/golang/licenses.go b/syft/pkg/cataloger/golang/licenses.go index 5d9ba7734b5..911d386cc42 100644 --- a/syft/pkg/cataloger/golang/licenses.go +++ b/syft/pkg/cataloger/golang/licenses.go @@ -21,25 +21,41 @@ import ( "github.com/scylladb/go-set/strset" "github.com/anchore/syft/internal" + "github.com/anchore/syft/internal/cache" "github.com/anchore/syft/internal/licenses" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/file" - "github.com/anchore/syft/syft/internal/fileresolver" + "github.com/anchore/syft/syft/license" "github.com/anchore/syft/syft/pkg" ) -type goLicenses struct { +type goLicense struct { + Value string `json:"val,omitempty"` + SPDXExpression string `json:"spdx,omitempty"` + Type license.Type `json:"type,omitempty"` + URLs []string `json:"urls,omitempty"` + Locations []string `json:"locations,omitempty"` +} + +type goLicenseResolver struct { catalogerName string opts CatalogerConfig - localModCacheResolver file.WritableResolver + localModCacheDir fs.FS + licenseCache cache.Resolver[[]goLicense] lowerLicenseFileNames *strset.Set } -func newGoLicenses(catalogerName string, opts CatalogerConfig) goLicenses { - return goLicenses{ +func newGoLicenseResolver(catalogerName string, opts CatalogerConfig) goLicenseResolver { + var localModCacheDir fs.FS + if opts.SearchLocalModCacheLicenses { + localModCacheDir = os.DirFS(opts.LocalModCacheDir) + } + + return goLicenseResolver{ catalogerName: catalogerName, opts: opts, - localModCacheResolver: modCacheResolver(opts.LocalModCacheDir), + localModCacheDir: localModCacheDir, + licenseCache: cache.GetResolverCachingErrors[[]goLicense]("golang", "v1"), lowerLicenseFileNames: strset.New(lowercaseLicenseFiles()...), } } @@ -63,98 +79,95 @@ func remotesForModule(proxies []string, noProxy []string, module string) []strin return proxies } -func modCacheResolver(modCacheDir string) file.WritableResolver { - var r file.WritableResolver 
- - if modCacheDir == "" { - log.Trace("unable to determine mod cache directory, skipping mod cache resolver") - r = fileresolver.Empty{} - } else { - stat, err := os.Stat(modCacheDir) - - if os.IsNotExist(err) || stat == nil || !stat.IsDir() { - log.Tracef("unable to open mod cache directory: %s, skipping mod cache resolver", modCacheDir) - r = fileresolver.Empty{} - } else { - r = fileresolver.NewFromUnindexedDirectory(modCacheDir) - } - } - - return r -} - -func (c *goLicenses) getLicenses(resolver file.Resolver, moduleName, moduleVersion string) (licenses []pkg.License, err error) { - licenses, err = c.findLicenses(resolver, +func (c *goLicenseResolver) getLicenses(resolver file.Resolver, moduleName, moduleVersion string) ([]pkg.License, error) { + // search the scan target first, ignoring local and remote sources + goLicenses, err := c.findLicensesInSource(resolver, fmt.Sprintf(`**/go/pkg/mod/%s@%s/*`, processCaps(moduleName), moduleVersion), ) - if err != nil || len(licenses) > 0 { - return requireCollection(licenses), err + if err != nil || len(goLicenses) > 0 { + return toPkgLicenses(goLicenses), err } - // look in the local host mod cache... - licenses, err = c.getLicensesFromLocal(moduleName, moduleVersion) - if err != nil || len(licenses) > 0 { - return requireCollection(licenses), err + // look in the local host mod directory... 
+ if c.opts.SearchLocalModCacheLicenses { + goLicenses, err = c.getLicensesFromLocal(moduleName, moduleVersion) + if err != nil || len(goLicenses) > 0 { + return toPkgLicenses(goLicenses), err + } } - // we did not find it yet and remote searching was enabled - licenses, err = c.getLicensesFromRemote(moduleName, moduleVersion) - return requireCollection(licenses), err -} - -func (c *goLicenses) getLicensesFromLocal(moduleName, moduleVersion string) ([]pkg.License, error) { - if !c.opts.SearchLocalModCacheLicenses { - return nil, nil + // download from remote sources + if c.opts.SearchRemoteLicenses { + goLicenses, err = c.getLicensesFromRemote(moduleName, moduleVersion) } - // if we're running against a directory on the filesystem, it may not include the - // user's homedir / GOPATH, so we defer to using the localModCacheResolver - return c.findLicenses(c.localModCacheResolver, moduleSearchGlob(moduleName, moduleVersion)) + return toPkgLicenses(goLicenses), err } -func (c *goLicenses) getLicensesFromRemote(moduleName, moduleVersion string) ([]pkg.License, error) { - if !c.opts.SearchRemoteLicenses { +func (c *goLicenseResolver) getLicensesFromLocal(moduleName, moduleVersion string) ([]goLicense, error) { + if c.localModCacheDir == nil { return nil, nil } - proxies := remotesForModule(c.opts.Proxies, c.opts.NoProxy, moduleName) + subdir := moduleDir(moduleName, moduleVersion) - fsys, err := getModule(proxies, moduleName, moduleVersion) + // get the local subdirectory containing the specific go module + dir, err := fs.Sub(c.localModCacheDir, subdir) if err != nil { return nil, err } - dir := moduleDir(moduleName, moduleVersion) + // if we're running against a directory on the filesystem, it may not include the + // user's homedir / GOPATH, so we defer to using the localModCacheResolver + // we use $GOPATH/pkg/mod to avoid leaking information about the user's system + return c.findLicensesInFS("file://$GOPATH/pkg/mod/"+subdir+"/", dir) +} + +func (c 
*goLicenseResolver) getLicensesFromRemote(moduleName, moduleVersion string) ([]goLicense, error) { + return c.licenseCache.Resolve(fmt.Sprintf("%s/%s", moduleName, moduleVersion), func() ([]goLicense, error) { + proxies := remotesForModule(c.opts.Proxies, c.opts.NoProxy, moduleName) - // populate the mod cache with the results - err = fs.WalkDir(fsys, ".", func(filePath string, d fs.DirEntry, err error) error { + urlPrefix, fsys, err := getModule(proxies, moduleName, moduleVersion) if err != nil { - log.Debug(err) + return nil, err + } + + return c.findLicensesInFS(urlPrefix, fsys) + }) +} + +func (c *goLicenseResolver) findLicensesInFS(urlPrefix string, fsys fs.FS) ([]goLicense, error) { + var out []goLicense + err := fs.WalkDir(fsys, ".", func(filePath string, d fs.DirEntry, _ error) error { + if !c.lowerLicenseFileNames.Has(strings.ToLower(d.Name())) { return nil } - if d.IsDir() { + rdr, err := fsys.Open(filePath) + if err != nil { + log.Debugf("error opening license file %s: %v", filePath, err) return nil } - f, err := fsys.Open(filePath) + defer internal.CloseAndLogError(rdr, filePath) + parsed, err := licenses.Parse(rdr, file.NewLocation(filePath)) if err != nil { - return err + log.Debugf("error parsing license file %s: %v", filePath, err) + return nil + } + // since these licenses are found in an external fs.FS, not in the scanned source, + // get rid of the locations but keep information about the where the license was found + // by prepending the urlPrefix to the internal path for an accurate representation + for _, l := range toGoLicenses(parsed) { + l.URLs = []string{urlPrefix + filePath} + l.Locations = nil + out = append(out, l) } - return c.localModCacheResolver.Write(file.NewLocation(path.Join(dir, filePath)), f) + return nil }) - - if err != nil { - log.Tracef("remote proxy walk failed for: %s", moduleName) - } - - return c.findLicenses(c.localModCacheResolver, moduleSearchGlob(moduleName, moduleVersion)) + return out, err } -func (c *goLicenses) 
findLicenses(resolver file.Resolver, globMatch string) (out []pkg.License, err error) { - out = make([]pkg.License, 0) - if resolver == nil { - return - } - +func (c *goLicenseResolver) findLicensesInSource(resolver file.Resolver, globMatch string) ([]goLicense, error) { + var out []goLicense locations, err := resolver.FilesByGlob(globMatch) if err != nil { return nil, err @@ -168,11 +181,19 @@ func (c *goLicenses) findLicenses(resolver file.Resolver, globMatch string) (out out = append(out, parsed...) } - return + // if we have a directory but simply don't have any found license files, indicate this so we + // don't re-download modules continually + if len(locations) > 0 && len(out) == 0 { + return nil, noLicensesFound{ + glob: globMatch, + } + } + + return out, nil } -func (c *goLicenses) parseLicenseFromLocation(l file.Location, resolver file.Resolver) ([]pkg.License, error) { - var out []pkg.License +func (c *goLicenseResolver) parseLicenseFromLocation(l file.Location, resolver file.Resolver) ([]goLicense, error) { + var out []goLicense fileName := path.Base(l.RealPath) if c.lowerLicenseFileNames.Has(strings.ToLower(fileName)) { contents, err := resolver.FileContentsByLocation(l) @@ -185,7 +206,7 @@ func (c *goLicenses) parseLicenseFromLocation(l file.Location, resolver file.Res return nil, err } - out = append(out, parsed...) + out = append(out, toGoLicenses(parsed)...) 
} return out, nil } @@ -194,13 +215,9 @@ func moduleDir(moduleName, moduleVersion string) string { return fmt.Sprintf("%s@%s", processCaps(moduleName), moduleVersion) } -func moduleSearchGlob(moduleName, moduleVersion string) string { - return fmt.Sprintf("%s/*", moduleDir(moduleName, moduleVersion)) -} - -func requireCollection(licenses []pkg.License) []pkg.License { +func requireCollection[T any](licenses []T) []T { if licenses == nil { - return make([]pkg.License, 0) + return make([]T, 0) } return licenses } @@ -213,18 +230,19 @@ func processCaps(s string) string { }) } -func getModule(proxies []string, moduleName, moduleVersion string) (fsys fs.FS, err error) { +func getModule(proxies []string, moduleName, moduleVersion string) (urlPrefix string, fsys fs.FS, err error) { for _, proxy := range proxies { u, _ := url.Parse(proxy) if proxy == "direct" { - fsys, err = getModuleRepository(moduleName, moduleVersion) + urlPrefix, fsys, err = getModuleRepository(moduleName, moduleVersion) continue } switch u.Scheme { case "https", "http": - fsys, err = getModuleProxy(proxy, moduleName, moduleVersion) + urlPrefix, fsys, err = getModuleProxy(proxy, moduleName, moduleVersion) case "file": p := filepath.Join(u.Path, moduleName, "@v", moduleVersion) + urlPrefix = path.Join("file://", p) + "/" fsys = os.DirFS(p) } if fsys != nil { @@ -234,13 +252,13 @@ func getModule(proxies []string, moduleName, moduleVersion string) (fsys fs.FS, return } -func getModuleProxy(proxy string, moduleName string, moduleVersion string) (out fs.FS, _ error) { +func getModuleProxy(proxy string, moduleName string, moduleVersion string) (moduleURL string, out fs.FS, _ error) { u := fmt.Sprintf("%s/%s/@v/%s.zip", proxy, moduleName, moduleVersion) // get the module zip resp, err := http.Get(u) //nolint:gosec if err != nil { - return nil, err + return "", nil, err } defer func() { _ = resp.Body.Close() }() @@ -250,25 +268,25 @@ func getModuleProxy(proxy string, moduleName string, moduleVersion string) 
(out // try lowercasing it; some packages have mixed casing that really messes up the proxy resp, err = http.Get(u) //nolint:gosec if err != nil { - return nil, err + return "", nil, err } defer func() { _ = resp.Body.Close() }() if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("failed to get module zip: %s", resp.Status) + return "", nil, fmt.Errorf("failed to get module zip: %s", resp.Status) } } // read the zip b, err := io.ReadAll(resp.Body) if err != nil { - return nil, err + return "", nil, err } out, err = zip.NewReader(bytes.NewReader(b), resp.ContentLength) versionPath := findVersionPath(out, ".") out = getSubFS(out, versionPath) - return out, err + return u + "#" + versionPath + "/", out, err } func findVersionPath(f fs.FS, dir string) string { @@ -288,26 +306,111 @@ func findVersionPath(f fs.FS, dir string) string { return "" } -func getModuleRepository(moduleName string, moduleVersion string) (fs.FS, error) { +func getModuleRepository(moduleName string, moduleVersion string) (string, fs.FS, error) { repoName := moduleName parts := strings.Split(moduleName, "/") if len(parts) > 2 { repoName = fmt.Sprintf("%s/%s/%s", parts[0], parts[1], parts[2]) } + // see if there's a hash and use that if so, otherwise use a tag + splitVersion := strings.Split(moduleVersion, "-") + var cloneRefName plumbing.ReferenceName + refPath := "" + if len(splitVersion) < 3 { + tagName := splitVersion[0] + cloneRefName = plumbing.NewTagReferenceName(tagName) + refPath = "/tags/" + tagName + } + f := memfs.New() buf := &bytes.Buffer{} - _, err := git.Clone(memory.NewStorage(), f, &git.CloneOptions{ - URL: fmt.Sprintf("https://%s", repoName), - ReferenceName: plumbing.NewTagReferenceName(moduleVersion), // FIXME version might be a SHA + repoURL := fmt.Sprintf("https://%s", repoName) + r, err := git.Clone(memory.NewStorage(), f, &git.CloneOptions{ + URL: repoURL, + ReferenceName: cloneRefName, SingleBranch: true, Depth: 1, Progress: buf, }) - if err != nil { - return 
nil, fmt.Errorf("%w -- %s", err, buf.String()) + return "", nil, fmt.Errorf("%w -- %s", err, buf.String()) + } + + if len(splitVersion) > 2 { + sha := splitVersion[len(splitVersion)-1] + hash, err := r.ResolveRevision(plumbing.Revision(sha)) + if err != nil || hash == nil { + log.Tracef("unable to resolve hash %s: %v", sha, err) + } else { + w, err := r.Worktree() + if err != nil { + log.Tracef("unable to get worktree, using default: %v", err) + } + err = w.Checkout(&git.CheckoutOptions{ + Hash: *hash, + }) + if err != nil { + log.Tracef("unable to checkout commit, using default: %v", err) + } else { + refPath = "/refs/" + hash.String() + } + } } - return billyFSAdapter{fs: f}, nil + return repoURL + refPath + "/", billyFSAdapter{fs: f}, err +} + +type noLicensesFound struct { + glob string +} + +func (l noLicensesFound) Error() string { + return fmt.Sprintf("unable to find license information matching: %s", l.glob) +} + +var _ error = (*noLicensesFound)(nil) + +func toPkgLicenses(goLicenses []goLicense) []pkg.License { + var out []pkg.License + for _, l := range goLicenses { + out = append(out, pkg.License{ + Value: l.Value, + SPDXExpression: l.SPDXExpression, + Type: l.Type, + URLs: l.URLs, + Locations: toPkgLocations(l.Locations), + }) + } + return requireCollection(out) +} + +func toPkgLocations(goLocations []string) file.LocationSet { + out := file.NewLocationSet() + for _, l := range goLocations { + out.Add(file.NewLocation(l)) + } + return out +} + +func toGoLicenses(pkgLicenses []pkg.License) []goLicense { + var out []goLicense + for _, l := range pkgLicenses { + out = append(out, goLicense{ + Value: l.Value, + SPDXExpression: l.SPDXExpression, + Type: l.Type, + URLs: l.URLs, + Locations: toGoLocations(l.Locations), + }) + } + return out +} + +func toGoLocations(locations file.LocationSet) []string { + var out []string + for _, l := range locations.ToSlice() { + out = append(out, l.RealPath) + } + return out } diff --git 
a/syft/pkg/cataloger/golang/licenses_test.go b/syft/pkg/cataloger/golang/licenses_test.go index a4d1a4a9bb0..d34d59e5a95 100644 --- a/syft/pkg/cataloger/golang/licenses_test.go +++ b/syft/pkg/cataloger/golang/licenses_test.go @@ -8,6 +8,7 @@ import ( "net/http/httptest" "os" "path" + "path/filepath" "strings" "testing" @@ -36,7 +37,8 @@ func Test_LocalLicenseSearch(t *testing.T) { Value: "Apache-2.0", SPDXExpression: "Apache-2.0", Type: license.Concluded, - Locations: file.NewLocationSet(loc1), + URLs: []string{"file://$GOPATH/pkg/mod/" + loc1.RealPath}, + Locations: file.NewLocationSet(), }, }, { @@ -46,7 +48,8 @@ func Test_LocalLicenseSearch(t *testing.T) { Value: "MIT", SPDXExpression: "MIT", Type: license.Concluded, - Locations: file.NewLocationSet(loc2), + URLs: []string{"file://$GOPATH/pkg/mod/" + loc2.RealPath}, + Locations: file.NewLocationSet(), }, }, { @@ -56,7 +59,8 @@ func Test_LocalLicenseSearch(t *testing.T) { Value: "Apache-2.0", SPDXExpression: "Apache-2.0", Type: license.Concluded, - Locations: file.NewLocationSet(loc3), + URLs: []string{"file://$GOPATH/pkg/mod/" + loc3.RealPath}, + Locations: file.NewLocationSet(), }, }, } @@ -66,11 +70,11 @@ func Test_LocalLicenseSearch(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - l := newGoLicenses( + l := newGoLicenseResolver( "", CatalogerConfig{ SearchLocalModCacheLicenses: true, - LocalModCacheDir: path.Join(wd, "test-fixtures", "licenses", "pkg", "mod"), + LocalModCacheDir: filepath.Join(wd, "test-fixtures", "licenses", "pkg", "mod"), }, ) licenses, err := l.getLicenses(fileresolver.Empty{}, test.name, test.version) @@ -97,7 +101,7 @@ func Test_RemoteProxyLicenseSearch(t *testing.T) { wd, err := os.Getwd() require.NoError(t, err) - testDir := path.Join(wd, "test-fixtures", "licenses", "pkg", "mod", processCaps(modPath)+"@"+modVersion) + testDir := filepath.Join(wd, "test-fixtures", "licenses", "pkg", "mod", processCaps(modPath)+"@"+modVersion) archive := 
zip.NewWriter(buf) @@ -106,9 +110,9 @@ func Test_RemoteProxyLicenseSearch(t *testing.T) { for _, f := range entries { // the zip files downloaded contain a path to the repo that somewhat matches where it ends up on disk, // so prefix entries with something similar - writer, err := archive.Create(path.Join("github.com/something/some@version", f.Name())) + writer, err := archive.Create(path.Join(moduleDir(modPath, modVersion), f.Name())) require.NoError(t, err) - contents, err := os.ReadFile(path.Join(testDir, f.Name())) + contents, err := os.ReadFile(filepath.Join(testDir, f.Name())) require.NoError(t, err) _, err = writer.Write(contents) require.NoError(t, err) @@ -136,7 +140,8 @@ func Test_RemoteProxyLicenseSearch(t *testing.T) { Value: "Apache-2.0", SPDXExpression: "Apache-2.0", Type: license.Concluded, - Locations: file.NewLocationSet(loc1), + URLs: []string{server.URL + "/github.com/someorg/somename/@v/v0.3.2.zip#" + loc1.RealPath}, + Locations: file.NewLocationSet(), }, }, { @@ -146,21 +151,20 @@ func Test_RemoteProxyLicenseSearch(t *testing.T) { Value: "MIT", SPDXExpression: "MIT", Type: license.Concluded, - Locations: file.NewLocationSet(loc2), + URLs: []string{server.URL + "/github.com/CapORG/CapProject/@v/v4.111.5.zip#" + loc2.RealPath}, + Locations: file.NewLocationSet(), }, }, } - modDir := path.Join(t.TempDir()) - for _, test := range tests { t.Run(test.name, func(t *testing.T) { - l := newGoLicenses( + + l := newGoLicenseResolver( "", CatalogerConfig{ SearchRemoteLicenses: true, Proxies: []string{server.URL}, - LocalModCacheDir: modDir, }, ) diff --git a/syft/pkg/cataloger/golang/package.go b/syft/pkg/cataloger/golang/package.go index 00c4f790989..8946b5f06f5 100644 --- a/syft/pkg/cataloger/golang/package.go +++ b/syft/pkg/cataloger/golang/package.go @@ -15,7 +15,7 @@ func (c *goBinaryCataloger) newGoBinaryPackage(resolver file.Resolver, dep *debu dep = dep.Replace } - licenses, err := c.licenses.getLicenses(resolver, dep.Path, dep.Version) + licenses, 
err := c.licenseResolver.getLicenses(resolver, dep.Path, dep.Version) if err != nil { log.Tracef("error getting licenses for golang package: %s %v", dep.Path, err) } diff --git a/syft/pkg/cataloger/golang/parse_go_binary.go b/syft/pkg/cataloger/golang/parse_go_binary.go index f3ea9718008..c6906d15699 100644 --- a/syft/pkg/cataloger/golang/parse_go_binary.go +++ b/syft/pkg/cataloger/golang/parse_go_binary.go @@ -46,13 +46,13 @@ var ( const devel = "(devel)" type goBinaryCataloger struct { - licenses goLicenses + licenseResolver goLicenseResolver mainModuleVersion MainModuleVersionConfig } func newGoBinaryCataloger(opts CatalogerConfig) *goBinaryCataloger { return &goBinaryCataloger{ - licenses: newGoLicenses(binaryCatalogerName, opts), + licenseResolver: newGoLicenseResolver(binaryCatalogerName, opts), mainModuleVersion: opts.MainModuleVersion, } } diff --git a/syft/pkg/cataloger/golang/parse_go_mod.go b/syft/pkg/cataloger/golang/parse_go_mod.go index 1c06e873974..8faf4acb509 100644 --- a/syft/pkg/cataloger/golang/parse_go_mod.go +++ b/syft/pkg/cataloger/golang/parse_go_mod.go @@ -19,7 +19,13 @@ import ( ) type goModCataloger struct { - licenses goLicenses + licenseResolver goLicenseResolver +} + +func newGoModCataloger(opts CatalogerConfig) *goModCataloger { + return &goModCataloger{ + licenseResolver: newGoLicenseResolver(modFileCatalogerName, opts), + } } // parseGoModFile takes a go.mod and lists all packages discovered. @@ -44,7 +50,7 @@ func (c *goModCataloger) parseGoModFile(_ context.Context, resolver file.Resolve } for _, m := range f.Require { - licenses, err := c.licenses.getLicenses(resolver, m.Mod.Path, m.Mod.Version) + licenses, err := c.licenseResolver.getLicenses(resolver, m.Mod.Path, m.Mod.Version) if err != nil { log.Tracef("error getting licenses for package: %s %v", m.Mod.Path, err) } @@ -65,7 +71,7 @@ func (c *goModCataloger) parseGoModFile(_ context.Context, resolver file.Resolve // remove any old packages and replace with new ones... 
for _, m := range f.Replace { - licenses, err := c.licenses.getLicenses(resolver, m.New.Path, m.New.Version) + licenses, err := c.licenseResolver.getLicenses(resolver, m.New.Path, m.New.Version) if err != nil { log.Tracef("error getting licenses for package: %s %v", m.New.Path, err) } diff --git a/syft/pkg/cataloger/golang/parse_go_mod_test.go b/syft/pkg/cataloger/golang/parse_go_mod_test.go index 59a7164f1b7..19ae7f33b70 100644 --- a/syft/pkg/cataloger/golang/parse_go_mod_test.go +++ b/syft/pkg/cataloger/golang/parse_go_mod_test.go @@ -4,6 +4,7 @@ import ( "testing" "github.com/anchore/syft/syft/file" + "github.com/anchore/syft/syft/internal/fileresolver" "github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest" ) @@ -91,10 +92,11 @@ func TestParseGoMod(t *testing.T) { for _, test := range tests { t.Run(test.fixture, func(t *testing.T) { - c := goModCataloger{} + c := newGoModCataloger(DefaultCatalogerConfig()) pkgtest.NewCatalogTester(). FromFile(t, test.fixture). Expects(test.expected, nil). + WithResolver(fileresolver.Empty{}). 
TestParser(t, c.parseGoModFile) }) } diff --git a/syft/source/directorysource/cache_excludes.go b/syft/source/directorysource/cache_excludes.go new file mode 100644 index 00000000000..59c7c490ed7 --- /dev/null +++ b/syft/source/directorysource/cache_excludes.go @@ -0,0 +1,33 @@ +package directorysource + +import ( + "os" + "strings" + + "github.com/anchore/syft/internal/cache" + "github.com/anchore/syft/internal/log" + "github.com/anchore/syft/syft/internal/fileresolver" +) + +// we do not want to cache things and then subsequently scan them, if, for example a user runs `syft /` twice +func excludeCachePathVisitors() []fileresolver.PathIndexVisitor { + var out []fileresolver.PathIndexVisitor + for _, dir := range cache.GetManager().RootDirs() { + out = append(out, excludeCacheDirPathVisitor{ + dir: dir, + }.excludeCacheDir) + } + return out +} + +type excludeCacheDirPathVisitor struct { + dir string +} + +func (d excludeCacheDirPathVisitor) excludeCacheDir(_, path string, _ os.FileInfo, _ error) error { + if strings.HasPrefix(path, d.dir) { + log.Tracef("skipping cache path: %s", path) + return fileresolver.ErrSkipPath + } + return nil +} diff --git a/syft/source/directorysource/directory_source.go b/syft/source/directorysource/directory_source.go index 2a4ab3705a1..0ab06980fe5 100644 --- a/syft/source/directorysource/directory_source.go +++ b/syft/source/directorysource/directory_source.go @@ -142,6 +142,9 @@ func (s *directorySource) FileResolver(_ source.Scope) (file.Resolver, error) { return nil, err } + // this should be the only file resolver that might have overlap with where files are cached + exclusionFunctions = append(exclusionFunctions, excludeCachePathVisitors()...) + res, err := fileresolver.NewFromDirectory(s.config.Path, s.config.Base, exclusionFunctions...) if err != nil { return nil, fmt.Errorf("unable to create directory resolver: %w", err)