Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Demonstration of Software output via Siegfried YAML #152

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pkg/config/internal/wikidatasparql/sparql.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ var wikidataLang = "en"
const sparql = `
# Return all file format records from Wikidata.
#
select distinct ?uri ?uriLabel ?puid ?extension ?mimetype ?encodingLabel ?referenceLabel ?date ?relativityLabel ?offset ?sig
select distinct ?uri ?uriLabel ?puid ?extension ?mimetype ?encodingLabel ?referenceLabel ?date ?relativityLabel ?offset ?sig ?software ?softwareLabel
where
{
?uri wdt:P31/wdt:P279* wd:Q235557. # Return records of type File Format.
Expand All @@ -55,6 +55,7 @@ const sparql = `
}
}
}
optional { ?software wdt:P1072 ?uri. }
service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE], <<lang>>". }
}
order by ?uri
Expand Down
2 changes: 1 addition & 1 deletion pkg/config/wikidata.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ var wikidata = struct {
gzip: "Q27824060",
tar: "Q283579",
warc: "Q10287816",
definitions: "wikidata-definitions-1.0.0",
definitions: "wikidata-definitions-2.x.x",
endpoint: "https://query.wikidata.org/sparql",
filemode: 0644,
sourcefield: true,
Expand Down
32 changes: 23 additions & 9 deletions pkg/wikidata/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,15 @@ func (i *Identifier) Recorder() core.Recorder {
// multiple, per file. The identification shown to the user looks something
// like the following:
//
// - ns : 'wikidata'
// id : 'Q1343830'
// format : 'Executable and Linkable Format'
// URI : 'http://www.wikidata.org/entity/Q1343830'
// mime :
// basis : 'byte match at 0, 4 (signature 1/5); byte match at 0, 7 (signature 4/5)'
// source : 'Gary Kessler''s File Signature Table (source date: 2017-08-08) PRONOM (Official (fmt/689))'
// warning :
// - ns : 'wikidata'
// id : 'Q1343830'
// format : 'Executable and Linkable Format'
// URI : 'http://www.wikidata.org/entity/Q1343830'
// mime :
// basis : 'byte match at 0, 4 (signature 1/5); byte match at 0, 7 (signature 4/5)'
// source : 'Gary Kessler''s File Signature Table (source date: 2017-08-08) PRONOM (Official (fmt/689))'
// warning :
// software : 'Windows 8: http://www.wikidata.org/entity/Q5046'
//
type Identification struct {
Namespace string // Namespace of the identifier, e.g. this will be the 'wikidata' namespace.
Expand All @@ -117,6 +118,7 @@ type Identification struct {
Warning string // Warnings generated by Siegfried.
archive config.Archive // Is it an Archive format?
confidence int // Identification confidence for sorting.
Software []string // Software compatible with the format identified.
}

// String creates a human readable representation of an identifier for output
Expand Down Expand Up @@ -159,6 +161,7 @@ func (i *Identifier) Fields() []string {
"basis",
"source",
"warning",
"software",
}
// Result field without source field. This is a little more like
// other identifiers used in Siegfried.
Expand All @@ -170,6 +173,7 @@ func (i *Identifier) Fields() []string {
"mime",
"basis",
"warning",
"software",
}
if config.GetWikidataSourceField() {
return resultsFieldsWithSource
Expand All @@ -194,13 +198,21 @@ func (id Identification) Warn() string {
return id.Warning
}

// Values returns a string slice containing each of the identifier segments.
// Values returns a string slice containing each of the identifier
// segments. The string formatting done in this function is fed directly
// to the Siegfried writer, e.g. to ensure the value in the key-value
// pair is formatted usefully.
func (id Identification) Values() []string {
var basis string
var source string
var software string
if len(id.Basis) > 0 {
basis = strings.Join(id.Basis, "; ")
}
// TODO: Separate software fields with semi-colons.
if len(id.Software) > 0 {
software = strings.Join(id.Software, "; ")
}
if config.GetWikidataSourceField() {
if len(id.Source) > 0 {
if id.Source[0] != "" {
Expand All @@ -218,6 +230,7 @@ func (id Identification) Values() []string {
basis,
source,
id.Warning,
software,
}
}
// Slice must match the order of resultsFieldsWithoutSource.
Expand All @@ -229,5 +242,6 @@ func (id Identification) Values() []string {
id.MIME,
basis,
id.Warning,
software,
}
}
1 change: 1 addition & 0 deletions pkg/wikidata/internal/mappings/wikidata_mapping_structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ type Wikidata struct {
Extension []string // Extension returned by Wikidata.
Mimetype []string // Mimetype as recorded by Wikidata.
Signatures []Signature // Signature associated with a record which we will convert to a new Type.
Software []string // Software that is compatible with the potential identification.
disableSignatures bool // If a bad heuristic was found we can't reliably add signatures to the record.
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/wikidata/load_wikidata.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ const encodingField = "encodingLabel"
const relativityField = "relativityLabel"
const dateField = "date"
const referenceField = "referenceLabel"
const softwareField = "software"
const softwareLabelField = "softwareLabel"

// getID returns the QID from the IRI of the record that we're
// processing.
Expand Down
2 changes: 2 additions & 0 deletions pkg/wikidata/recorder.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ func add(matches matchIDs, id string, wikidataID string, info formatInfo, basis
Warning: "",
archive: config.IsArchive(wikidataID),
confidence: confidence,
Software: info.software,
})
}
return append(
Expand All @@ -133,6 +134,7 @@ func add(matches matchIDs, id string, wikidataID string, info formatInfo, basis
Warning: "",
archive: config.IsArchive(wikidataID),
confidence: confidence,
Software: info.software,
})
}

Expand Down
14 changes: 10 additions & 4 deletions pkg/wikidata/signature_file_operations.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ func (i *Identifier) Save(ls *persist.LoadSaver) {
ls.SaveString(value.uri)
ls.SaveString(value.mime)
ls.SaveStrings(value.sources)
ls.SaveStrings(value.software)
}
i.Base.Save(ls)
}
Expand All @@ -73,6 +74,7 @@ func Load(ls *persist.LoadSaver) core.Identifier {
ls.LoadString(), // URI.
ls.LoadString(), // mime.
ls.LoadStrings(), // sources.
ls.LoadStrings(), // software.
}
}
i.Base = identifier.Load(ls)
Expand All @@ -97,6 +99,9 @@ type formatInfo struct {
// sources describes the source of a signature retrieved from
// Wikidata.
sources []string
// software describes programs that are compatible with any
// potential identification.
software []string
}

// infos turns the generic formatInfo into the structure that will be
Expand Down Expand Up @@ -155,10 +160,11 @@ func (wdd wikidataDefinitions) Infos() parseableFormatInfo {
}
sources := prepareSources(value)
fi := formatInfo{
name: value.Name,
uri: value.URI,
mime: mime,
sources: sources,
name: value.Name,
uri: value.URI,
mime: mime,
sources: sources,
software: value.Software,
}
formatInfoMap[value.ID] = fi
}
Expand Down
23 changes: 23 additions & 0 deletions pkg/wikidata/wikidata_process_records.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
package wikidata

import (
"fmt"

"github.com/richardlehane/siegfried/pkg/wikidata/internal/mappings"

"github.com/ross-spencer/spargo/pkg/spargo"
Expand All @@ -27,6 +29,18 @@ import (
// wikidataRecord provides an alias for the mappings.Wikidata object.
type wikidataRecord = mappings.Wikidata

// makeSoftware builds the string used to describe a single piece of
// software in a record: the software's label followed by its IRI, with
// the two separated by ": ".
//
// TODO: It would be "purer" to keep these apart, and so I might
// investigate that depending on other ideas around this PR.
func makeSoftware(wikidataItem map[string]spargo.Item) string {
	label := wikidataItem[softwareLabelField].Value
	iri := wikidataItem[softwareField].Value
	return fmt.Sprintf("%s: %s", label, iri)
}

// newRecord creates a Wikidata record with the values received from
// Wikidata itself.
func newRecord(wikidataItem map[string]spargo.Item, addSigs bool) wikidataRecord {
Expand Down Expand Up @@ -57,6 +71,9 @@ func newRecord(wikidataItem map[string]spargo.Item, addSigs bool) wikidataRecord
wd.Signatures[0].ByteSequences = append(
wd.Signatures[0].ByteSequences, bs)
}
if wikidataItem[softwareField].Value != "" {
wd.Software = append(wd.Software, makeSoftware(wikidataItem))
}
return wd
}

Expand All @@ -73,6 +90,12 @@ func updateRecord(wikidataItem map[string]spargo.Item, wd wikidataRecord) wikida
if contains(wd.Mimetype, wikidataItem[mimeField].Value) == false {
wd.Mimetype = append(wd.Mimetype, wikidataItem[mimeField].Value)
}
software := makeSoftware(wikidataItem)
if contains(wd.Software, software) == false {
if wikidataItem[softwareField].Value != "" {
wd.Software = append(wd.Software, software)
}
}
if wikidataItem[signatureField].Value != "" {
if !wd.SignaturesDisabled() {
lintingErr := updateSequences(wikidataItem, &wd)
Expand Down
92 changes: 85 additions & 7 deletions pkg/writer/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,12 @@ func (c *csvWriter) File(name string, sz int64, mod string, checksum []byte, err
func (c *csvWriter) Tail() { c.w.Flush() }

type yamlWriter struct {
replacer *strings.Replacer
w *bufio.Writer
hh string
hstrs []string
vals [][]interface{}
replacer *strings.Replacer
w *bufio.Writer
hh string
hstrs []string
vals [][]interface{}
complexData []int
}

func YAML(w io.Writer) Writer {
Expand Down Expand Up @@ -164,6 +165,38 @@ func header(fields []string) string {
return " - " + strings.Join(headings, " : %v\n ") + " : %v\n"
}

// sfw is the name of the result field that the YAML writer looks for
// among the identifier field headers, so that the field can be given
// special treatment when it is written out.
//
// TODO: This is just for demonstration purposes. The solution would be
// better eventually and would be driven by the caller, not through
// specific exceptions within the Writer.
//
const sfw = "software"

// getComplexFields marks the fields we want to output in YAML as more
// than a flat string value, e.g. a list, or set of key-values. At
// present only the "software" field is treated this way.
//
// headers holds one slice of field names per identifier. The index of
// the software field is taken from the first header that contains it
// and stored in y.complexData. When no header contains the field,
// y.complexData is left empty, so that index 0 is never mistaken for
// the software field.
//
// NOTE(review): a single index is recorded for all identifiers, so
// identifiers with differing field layouts share it — confirm this is
// acceptable once the approach is finalised.
//
// TODO: This is just for demonstration purposes. The solution would be
// better eventually and would be driven by the caller, not through
// specific exceptions within the Writer.
func (y *yamlWriter) getComplexFields(headers [][]string) {
	// Start from "nothing is complex" so a missing software field
	// cannot leave a stale or zero-valued index behind.
	y.complexData = nil
	for _, header := range headers {
		for idx, field := range header {
			if field == sfw {
				y.complexData = []int{idx}
				return
			}
		}
	}
}

func (y *yamlWriter) Head(path string, scanned, created time.Time, version [3]int, ids [][2]string, fields [][]string, hh string) {
y.hh = hh
y.hstrs = make([]string, len(fields))
Expand All @@ -181,6 +214,36 @@ func (y *yamlWriter) Head(path string, scanned, created time.Time, version [3]in
for _, id := range ids {
fmt.Fprintf(y.w, " - name : '%v'\n details : '%v'\n", id[0], id[1])
}
y.getComplexFields(fields)
}

// inSlice reports whether needle occurs anywhere in haystack, e.g. to
// test whether a field index is one of a set of field indices. An
// empty or nil haystack never contains the needle.
func inSlice(needle int, haystack []int) bool {
	found := false
	for i := 0; i < len(haystack) && !found; i++ {
		found = haystack[i] == needle
	}
	return found
}

// formatYamlComplexType helps us to create a nested array/dictionary/list
// in the YAML output from a semi-colon separated value.
//
// Each segment of val is stripped of surrounding spaces and placed on
// its own line, preceded by a newline and two units of indentation.
// Trailing newlines and spaces are then removed from the result, which
// means trailing empty segments are dropped and an empty val yields an
// empty string.
//
// The segments are written as they are received, i.e. they are not
// quoted or escaped here.
//
// TODO: We can definitely add some testing around this...
func formatYamlComplexType(val string) string {
	const spacing = " "
	// Every entry starts on a fresh line, indented by two units.
	prefix := fmt.Sprintf("\n%s%s", spacing, spacing)
	entries := strings.Split(val, ";")
	for i, entry := range entries {
		entries[i] = strings.Trim(entry, " ")
	}
	// Join once rather than re-formatting the accumulated string for
	// every entry.
	return strings.TrimRight(prefix+strings.Join(entries, prefix), "\n ")
}

func (y *yamlWriter) File(name string, sz int64, mod string, checksum []byte, err error, ids []core.Identification) {
Expand All @@ -196,7 +259,15 @@ func (y *yamlWriter) File(name string, sz int64, mod string, checksum []byte, er
if checksum != nil {
h = fmt.Sprintf("%-8s : %s\n", y.hh, hex.EncodeToString(checksum))
}
fmt.Fprintf(y.w, "---\nfilename : '%s'\nfilesize : %d\nmodified : %s\nerrors : %s\n%smatches :\n", y.replacer.Replace(name), sz, mod, errStr, h)
fmt.Fprintf(
y.w,
"---\nfilename : '%s'\nfilesize : %d\nmodified : %s\nerrors : %s\n%smatches :\n",
y.replacer.Replace(name),
sz,
mod,
errStr,
h,
)
for _, id := range ids {
values := id.Values()
if values[0] != thisName {
Expand All @@ -208,7 +279,14 @@ func (y *yamlWriter) File(name string, sz int64, mod string, checksum []byte, er
y.vals[idx][i] = ""
continue
}
y.vals[idx][i] = "'" + y.replacer.Replace(v) + "'"
// TODO: As per-above exceptions, e.g. those for complex
// processing should be driven upstream with the correct
// signaling.
if inSlice(i, y.complexData) {
y.vals[idx][i] = formatYamlComplexType(v)
} else {
y.vals[idx][i] = fmt.Sprintf("'%s'", y.replacer.Replace(v))
}
}
fmt.Fprintf(y.w, y.hstrs[idx], y.vals[idx]...)
}
Expand Down