diff --git a/cmd/sf/sf.go b/cmd/sf/sf.go
index e28a337d2..f171e82c0 100644
--- a/cmd/sf/sf.go
+++ b/cmd/sf/sf.go
@@ -40,31 +40,32 @@ const maxMulti = 1024
 
 // flags
 var (
-	updateShort  = flag.Bool("u", false, "update or install the default signature file")
-	update       = flag.Bool("update", false, "update or install the default signature file")
-	versionShort = flag.Bool("v", false, "display version information")
-	version      = flag.Bool("version", false, "display version information")
-	logf         = flag.String("log", "error", "log errors, warnings, debug or slow output, knowns or unknowns to stderr or stdout e.g. -log error,warn,unknown,stdout")
-	nr           = flag.Bool("nr", false, "prevent automatic directory recursion")
-	yaml         = flag.Bool("yaml", true, "YAML output format")
-	csvo         = flag.Bool("csv", false, "CSV output format")
-	jsono        = flag.Bool("json", false, "JSON output format")
-	droido       = flag.Bool("droid", false, "DROID CSV output format")
-	sig          = flag.String("sig", config.SignatureBase(), "set the signature file")
-	home         = flag.String("home", config.Home(), "override the default home directory")
-	serve        = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
-	multi        = flag.Int("multi", 1, "set number of parallel file ID processes")
-	archive      = flag.Bool("z", false, fmt.Sprintf("scan archive formats: (%s)", config.ListAllArcTypes()))
-	hashf        = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
-	throttlef    = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
-	utcf         = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
-	coe          = flag.Bool("coe", false, "continue on fatal errors during directory walks (this may result in directories being skipped)")
-	replay       = flag.Bool("replay", false, "replay one (or more) results files to change output or logging e.g. sf -replay -csv results.yaml")
-	list         = flag.Bool("f", false, "scan one (or more) lists of filenames e.g. sf -f myfiles.txt")
-	name         = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
-	conff        = flag.String("conf", "", "set the configuration file")
-	setconff     = flag.Bool("setconf", false, "record flags used with this command in configuration file")
-	sourceinline = flag.Bool("sourceinline", false, "display provenance in-line (basis field) when it is available for an identifier, e.g. Wikidata")
+	updateShort    = flag.Bool("u", false, "update or install the default signature file")
+	update         = flag.Bool("update", false, "update or install the default signature file")
+	versionShort   = flag.Bool("v", false, "display version information")
+	version        = flag.Bool("version", false, "display version information")
+	logf           = flag.String("log", "error", "log errors, warnings, debug or slow output, knowns or unknowns to stderr or stdout e.g. -log error,warn,unknown,stdout")
+	nr             = flag.Bool("nr", false, "prevent automatic directory recursion")
+	yaml           = flag.Bool("yaml", true, "YAML output format")
+	csvo           = flag.Bool("csv", false, "CSV output format")
+	jsono          = flag.Bool("json", false, "JSON output format")
+	droido         = flag.Bool("droid", false, "DROID CSV output format")
+	sig            = flag.String("sig", config.SignatureBase(), "set the signature file")
+	home           = flag.String("home", config.Home(), "override the default home directory")
+	serve          = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
+	multi          = flag.Int("multi", 1, "set number of parallel file ID processes")
+	archive        = flag.Bool("z", false, fmt.Sprintf("scan archive formats: (%s)", config.ListAllArcTypes()))
+	selectArchives = flag.String("zs", config.ListAllArcTypes(), "select the archive types to decompress and identify the contents of")
+	hashf          = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
+	throttlef      = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
+	utcf           = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
+	coe            = flag.Bool("coe", false, "continue on fatal errors during directory walks (this may result in directories being skipped)")
+	replay         = flag.Bool("replay", false, "replay one (or more) results files to change output or logging e.g. sf -replay -csv results.yaml")
+	list           = flag.Bool("f", false, "scan one (or more) lists of filenames e.g. sf -f myfiles.txt")
+	name           = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
+	conff          = flag.String("conf", "", "set the configuration file")
+	setconff       = flag.Bool("setconf", false, "record flags used with this command in configuration file")
+	sourceinline   = flag.Bool("sourceinline", false, "display provenance in-line (basis field) when it is available for an identifier, e.g. Wikidata")
 )
 
 var (
@@ -372,6 +373,10 @@ func main() {
 		}
 		return
 	}
+	// handle -zs: limit extraction to the archive types selected with -zs
+	if *selectArchives != "" {
+		config.SetArchiveFilterPermissive(*selectArchives)
+	}
 	// handle -fpr
 	if *fprflag {
 		log.Printf("FPR server started at %s. Use CTRL-C to quit.\n", config.Fpr())
diff --git a/pkg/config/decompress.go b/pkg/config/decompress.go
index 12a06457b..a3173a377 100644
--- a/pkg/config/decompress.go
+++ b/pkg/config/decompress.go
@@ -16,6 +16,7 @@ package config
 
 import (
 	"fmt"
+	"strings"
 )
 
 // Archive is a file format capable of decompression by sf.
@@ -106,8 +107,63 @@ func ListAllArcTypes() string {
 	)
 }
 
-func (a Archive) String() string {
-	switch a {
+// permissiveFilter holds the identifiers of the archive types that may
+// be extracted. It is nil until SetArchiveFilterPermissive is called; a
+// non-nil, empty slice means that no archive type is extracted.
+var permissiveFilter []string
+
+// SetArchiveFilterPermissive enables a filter to be created on the
+// types of archive that we want to extract from. Anything not in this
+// list is not extracted.
+//
+// value is a comma-separated list of archive names. Each name is
+// lower-cased and trimmed of surrounding whitespace before matching;
+// names that match no known archive type are skipped. The expanded
+// list of identifiers becomes the active filter and is also returned.
+func SetArchiveFilterPermissive(value string) []string {
+	permissiveFilter = expandArchiveFilter(value)
+	return permissiveFilter
+}
+
+// expandArchiveFilter converts a comma-separated list of archive names
+// into the identifiers returned by the matching Arc*Types functions.
+// The result is never nil, so that an explicitly empty filter can be
+// told apart from a filter that was never set.
+func expandArchiveFilter(value string) []string {
+	arr := []string{}
+	for _, arc := range strings.Split(value, ",") {
+		switch strings.TrimSpace(strings.ToLower(arc)) {
+		case zipArc:
+			arr = append(arr, ArcZipTypes()...)
+		case tarArc:
+			arr = append(arr, ArcTarTypes()...)
+		case gzipArc:
+			arr = append(arr, ArcGzipTypes()...)
+		case warcArc:
+			arr = append(arr, ArcWarcTypes()...)
+		case arcArc:
+			arr = append(arr, ArcArcTypes()...)
+		}
+	}
+	return arr
+}
+
+// archiveFilterPermissive provides a getter for the configured
+// zip-types we want to extract and identify the contents of with
+// Siegfried.
+//
+// When no filter has been set it returns the filter for
+// ListAllArcTypes(), so that IsArchive keeps recognising archives for
+// callers that never call SetArchiveFilterPermissive.
+func archiveFilterPermissive() []string {
+	if permissiveFilter == nil {
+		return expandArchiveFilter(ListAllArcTypes())
+	}
+	return permissiveFilter
+}
+
+func (archive Archive) String() string {
+	switch archive {
 	case Zip:
 		return "zip"
 	case Gzip:
diff --git a/pkg/config/identifier.go b/pkg/config/identifier.go
index 7dd6df68c..6c01f7d41 100644
--- a/pkg/config/identifier.go
+++ b/pkg/config/identifier.go
@@ -287,6 +287,9 @@ func contains(v string, s []string) bool {
 
 // IsArchive returns an Archive that corresponds to the provided id (or none if no match).
 func IsArchive(id string) Archive {
+	if !contains(id, archiveFilterPermissive()) {
+		return None
+	}
 	switch {
 	case contains(id, ArcZipTypes()):
 		return Zip
diff --git a/pkg/config/identifier_test.go b/pkg/config/identifier_test.go
index a29fa1edf..5a9160054 100644
--- a/pkg/config/identifier_test.go
+++ b/pkg/config/identifier_test.go
@@ -33,6 +33,7 @@ var nonArcUID = "fmt/1000"
 
 // arcTest defines the structure needed for our table driven testing.
 type arcTest struct {
+	filter string  // Comma-separated archive names passed to SetArchiveFilterPermissive(...) before the check.
 	uid    string  // A UID (PUID, FDD) that identifies a zip-type file.
 	result Archive // The anticipated result from our test.
 }
@@ -40,19 +41,34 @@ type arcTest struct {
 // isArcTests provide us a slice of tests and results to loop through.
 var isArcTests = []arcTest{
 	// Positive tests should return valid Archive values.
-	arcTest{proZipUID, Zip},
-	arcTest{mimeTarUID, Tar},
-	arcTest{mimeGzipUID, Gzip},
-	arcTest{mimeWarcUID, WARC},
-	arcTest{locArcUID, ARC},
+	arcTest{ListAllArcTypes(), proZipUID, Zip},
+	arcTest{"TAR", mimeTarUID, Tar},
+	arcTest{"gZip", mimeGzipUID, Gzip},
+	arcTest{"warc,zip,tar", mimeWarcUID, WARC},
+	arcTest{"zip,arc", locArcUID, ARC},
 	// Negative tests should all return None.
-	arcTest{nonArcUID, None},
+	arcTest{"zip,arc", mimeWarcUID, None},
+	arcTest{"zip,arc", mimeGzipUID, None},
+	arcTest{ListAllArcTypes(), nonArcUID, None},
+	arcTest{"", nonArcUID, None},
 }
 
-// TestIsArchive tests cases whether we return the correct result when
-// testing whether something is an Archive.
-func TestIsArchive(t *testing.T) {
+// TestIsArchiveUnsetFilter checks that archives are still recognised
+// when SetArchiveFilterPermissive has never been called.
+func TestIsArchiveUnsetFilter(t *testing.T) {
+	previous := permissiveFilter
+	defer func() { permissiveFilter = previous }()
+	permissiveFilter = nil
+	if arc := IsArchive(proZipUID); arc != Zip {
+		t.Errorf("IsArchive(%q) with no filter set: got %v, want %v", proZipUID, arc, Zip)
+	}
+}
+
+// TestIsArchivePositive tests cases where the filter should return a
+// positive match.
+func TestIsArchivePositive(t *testing.T) {
 	for _, test := range isArcTests {
+		SetArchiveFilterPermissive(test.filter)
 		arc := IsArchive(test.uid)
 		if arc != test.result {
 			t.Errorf(