Skip to content

Commit

Permalink
Two enhancements to digests as suggested by the WARC 1.1 community re…
Browse files Browse the repository at this point in the history
…commendation #80

https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1-annotated/
* Accept alternative names for checksum algorithms: sha-1 = sha1, sha-256 = sha256 (#71)
* Follow recommended casing for base16 and base32 encoding of checksums
  • Loading branch information
johnerikhalse authored Jan 19, 2024
1 parent b976c17 commit 93e6bd1
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 70 deletions.
28 changes: 14 additions & 14 deletions block_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import (

func Test_genericBlock_BlockDigest(t *testing.T) {
content := "foo"
digest := "sha1:0BEEC7B5EA3F0FDBC95D0DD47F3C5BC275DA8A33"
digest := "sha1:0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"

tests := []blockDigestTest{
{
Expand Down Expand Up @@ -57,7 +57,7 @@ func Test_genericBlock_BlockDigest(t *testing.T) {

func Test_genericBlock_Cache(t *testing.T) {
content := "foo"
digest := "sha1:0BEEC7B5EA3F0FDBC95D0DD47F3C5BC275DA8A33"
digest := "sha1:0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"

tests := []cacheTest{
{
Expand Down Expand Up @@ -126,7 +126,7 @@ func Test_genericBlock_IsCached(t *testing.T) {

func Test_genericBlock_RawBytes(t *testing.T) {
content := "foo"
digest := "sha1:0BEEC7B5EA3F0FDBC95D0DD47F3C5BC275DA8A33"
digest := "sha1:0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"

tests := []rawBytesTest{
{
Expand Down Expand Up @@ -158,7 +158,7 @@ func Test_genericBlock_RawBytes(t *testing.T) {

func Test_warcfieldsBlock_BlockDigest(t *testing.T) {
content := "foo: bar\r\ncontent-type:bb\r\n"
digest := "sha1:A1D43D400C5985BEE035C4E5A2E08F3D57989596"
digest := "sha1:a1d43d400c5985bee035c4e5a2e08f3d57989596"

tests := []blockDigestTest{
{
Expand Down Expand Up @@ -191,7 +191,7 @@ func Test_warcfieldsBlock_BlockDigest(t *testing.T) {

func Test_warcfieldsBlock_Cache(t *testing.T) {
content := "foo: bar\r\ncontent-type:bb\r\n"
digest := "sha1:A1D43D400C5985BEE035C4E5A2E08F3D57989596"
digest := "sha1:a1d43d400c5985bee035c4e5a2e08f3d57989596"

tests := []cacheTest{
{
Expand Down Expand Up @@ -272,7 +272,7 @@ func Test_warcfieldsBlock_IsCached(t *testing.T) {

func Test_warcfieldsBlock_RawBytes(t *testing.T) {
content := "foo: bar\r\ncontent-type:bb\r\n"
digest := "sha1:A1D43D400C5985BEE035C4E5A2E08F3D57989596"
digest := "sha1:a1d43d400c5985bee035c4e5a2e08f3d57989596"

tests := []rawBytesTest{
{
Expand Down Expand Up @@ -313,8 +313,8 @@ func Test_httpRequestBlock_BlockDigest(t *testing.T) {
"Referer: http://example.com/foo.html\n" +
"Connection: close\n" +
"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36\n\n"
digest := "sha1:A3781FF1FC3FB52318F623E22C85D63D74C12932"
payloadDigest := "sha1:DA39A3EE5E6B4B0D3255BFEF95601890AFD80709"
digest := "sha1:a3781ff1fc3fb52318f623e22c85d63d74c12932"
payloadDigest := "sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709"

tests := []blockDigestTest{
{
Expand Down Expand Up @@ -354,7 +354,7 @@ func Test_httpRequestBlock_Cache(t *testing.T) {
"Referer: http://example.com/foo.html\n" +
"Connection: close\n" +
"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36\n\n"
digest := "sha1:A3781FF1FC3FB52318F623E22C85D63D74C12932"
digest := "sha1:a3781ff1fc3fb52318f623e22c85d63d74c12932"

tests := []cacheTest{
{
Expand Down Expand Up @@ -443,7 +443,7 @@ func Test_httpRequestBlock_RawBytes(t *testing.T) {
"Referer: http://example.com/foo.html\n" +
"Connection: close\n" +
"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36\n\n"
digest := "sha1:A3781FF1FC3FB52318F623E22C85D63D74C12932"
digest := "sha1:a3781ff1fc3fb52318f623e22c85d63d74c12932"

tests := []rawBytesTest{
{
Expand Down Expand Up @@ -482,8 +482,8 @@ func Test_httpResponseBlock_BlockDigest(t *testing.T) {
content := "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content"
digest := "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"
payloadDigest := "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"
digest := "sha1:b285747ad7cc57aa74bce2e30b453c8d1cb71ba4"
payloadDigest := "sha1:c37ffb221569c553a2476c22c7dad429f3492977"

tests := []blockDigestTest{
{
Expand Down Expand Up @@ -520,7 +520,7 @@ func Test_httpResponseBlock_Cache(t *testing.T) {
content := "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content"
digest := "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"
digest := "sha1:b285747ad7cc57aa74bce2e30b453c8d1cb71ba4"

tests := []cacheTest{
{
Expand Down Expand Up @@ -603,7 +603,7 @@ func Test_httpResponseBlock_RawBytes(t *testing.T) {
content := "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content"
digest := "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"
digest := "sha1:b285747ad7cc57aa74bce2e30b453c8d1cb71ba4"

tests := []rawBytesTest{
{
Expand Down
49 changes: 41 additions & 8 deletions digest.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package gowarc

import (
"bytes"
"crypto/md5"
"crypto/sha1"
"crypto/sha256"
Expand All @@ -36,7 +37,7 @@ func (d digestEncoding) encode(digest *digest) string {
dig := digest.Sum(nil)
switch d {
case Base16:
return strings.ToUpper(hex.EncodeToString(dig))
return strings.ToLower(hex.EncodeToString(dig))
case Base32:
return base32.StdEncoding.EncodeToString(dig)
case Base64:
Expand All @@ -46,6 +47,19 @@ func (d digestEncoding) encode(digest *digest) string {
}
}

// decode turns the textual digest s back into its raw bytes, interpreting s
// according to the receiver's encoding.
//
// Base16 input is decoded with hex.DecodeString, Base32 input with
// base32.StdEncoding and Base64 input with base64.StdEncoding; any error
// reported by the underlying decoder is returned unchanged. For every other
// encoding value no decoding is attempted: the bytes of s are returned as-is
// together with a nil error.
func (d digestEncoding) decode(s string) ([]byte, error) {
	if d == Base16 {
		return hex.DecodeString(s)
	}
	if d == Base32 {
		return base32.StdEncoding.DecodeString(s)
	}
	if d == Base64 {
		return base64.StdEncoding.DecodeString(s)
	}
	// Not one of the known encodings: hand back the input bytes untouched.
	return []byte(s), nil
}

const (
unknown digestEncoding = 0
Base16 digestEncoding = 1
Expand Down Expand Up @@ -85,6 +99,22 @@ func detectEncoding(algorithm, digest string, defaultEncoding digestEncoding) di
return defaultEncoding
}

// normalizeAlgorithmName returns the canonical spelling of a digest algorithm
// name as used in WARC digest fields.
//
// The name is lower-cased first. The hyphenated aliases "sha-1", "sha-256" and
// "sha-512" are then rewritten to "sha1", "sha256" and "sha512". Every other
// name is returned lower-cased but otherwise untouched.
func normalizeAlgorithmName(algorithm string) string {
	name := strings.ToLower(algorithm)

	switch name {
	case "sha-1", "sha-256", "sha-512":
		// Drop the hyphen after "sha", e.g. "sha-256" becomes "sha256".
		return "sha" + name[len("sha-"):]
	}
	return name
}

// digest is a utility for parsing, creation and validation of WARC block and payload digests.
//
// Typical usage is to create a digest from a WARC record's WARC-Block-Digest or WARC-Payload-Digest fields.
Expand Down Expand Up @@ -124,7 +154,11 @@ func (d *digest) format() string {
// digest.
func (d *digest) validate() error {
computed := d.encoding.encode(d)
if d.hash != computed {
dig, err := d.encoding.decode(d.hash)
if err != nil {
return err
}
if !bytes.Equal(dig, d.Sum(nil)) {
return fmt.Errorf("wrong digest: expected %s:%s, computed: %s:%s", d.name, d.hash, d.name, computed)
}
return nil
Expand All @@ -144,17 +178,16 @@ func (d *digest) updateDigest() {
func newDigest(digestString string, defaultEncoding digestEncoding) (*digest, error) {
t := strings.SplitN(digestString, ":", 2)
algorithm := t[0]
algorithm = strings.ToLower(algorithm)
if algorithm == "" {
return nil, fmt.Errorf("missing algorithm")
}
algorithm = normalizeAlgorithmName(algorithm)
var hash string
if len(t) > 1 {
hash = t[1]
}
encoding := detectEncoding(algorithm, hash, defaultEncoding)
if encoding < Base64 {
// base16 and base32 encodings are case insensitive.
switch encoding {
case Base16:
hash = strings.ToLower(hash)
case Base32:
hash = strings.ToUpper(hash)
}

Expand Down
41 changes: 25 additions & 16 deletions digest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,25 @@ func Test_newDigest(t *testing.T) {
wantDigest string
wantErr bool
}{
{"md5", "md5", "Some content", Base16, "md5", "md5:B53227DA4280F0E18270F21DD77C91D0", false},
{"md5 with base16 digest", "md5:12345", "Some content", Base16, "md5", "md5:B53227DA4280F0E18270F21DD77C91D0", false},
{"md5", "md5", "Some content", Base16, "md5", "md5:b53227da4280f0e18270f21dd77c91d0", false},
{"md5 with base16 digest", "md5:12345", "Some content", Base16, "md5", "md5:b53227da4280f0e18270f21dd77c91d0", false},
{"md5 with base32 digest", "md5:12345", "Some content", Base32, "md5", "md5:WUZCPWSCQDYODATQ6IO5O7ER2A======", false},
{"md5 with base64 digest", "md5:12345", "Some content", Base64, "md5", "md5:tTIn2kKA8OGCcPId13yR0A==", false},
{"sha1", "sha1", "Some content", Base16, "sha1", "sha1:9F1A6ECF74E9F9B1AE52E8EB581D420E63E8453A", false},
{"sha1 with base16 digest", "sha1:12345", "Some content", Base16, "sha1", "sha1:9F1A6ECF74E9F9B1AE52E8EB581D420E63E8453A", false},
{"sha1", "sha1", "Some content", Base16, "sha1", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", false},
{"sha1 with base16 digest", "sha1:12345", "Some content", Base16, "sha1", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", false},
{"sha-1 with base16 digest", "sha-1:12345", "Some content", Base16, "sha1", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", false},
{"sha1 with base32 digest", "sha1:12345", "Some content", Base32, "sha1", "sha1:T4NG5T3U5H43DLSS5DVVQHKCBZR6QRJ2", false},
{"sha1 with base64 digest", "sha1:12345", "Some content", Base64, "sha1", "sha1:nxpuz3Tp+bGuUujrWB1CDmPoRTo=", false},
{"sha256", "sha256", "Some content", Base16, "sha256", "sha256:9C6609FC5111405EA3F5BB3D1F6B5A5EFD19A0CEC53D85893FD96D265439CD5B", false},
{"sha256 with base16 digest", "sha256:12345", "Some content", Base16, "sha256", "sha256:9C6609FC5111405EA3F5BB3D1F6B5A5EFD19A0CEC53D85893FD96D265439CD5B", false},
{"sha256", "sha256", "Some content", Base16, "sha256", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", false},
{"sha-256", "sha256", "Some content", Base16, "sha256", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", false},
{"sha256 with base16 digest", "sha256:12345", "Some content", Base16, "sha256", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", false},
{"sha256 with base32 digest", "sha256:12345", "Some content", Base32, "sha256", "sha256:TRTAT7CRCFAF5I7VXM6R6222L36RTIGOYU6YLCJ73FWSMVBZZVNQ====", false},
{"sha256 with base64 digest", "sha256:12345", "Some content", Base64, "sha256", "sha256:nGYJ/FERQF6j9bs9H2taXv0ZoM7FPYWJP9ltJlQ5zVs=", false},
{"sha512", "sha512", "Some content", Base16, "sha512", "sha512:B20D977718ED67F2BF7620EE2D982FD850C4883EC8D048440FE7B6A86CF6322FD791C47B0C7469DBEEF3E339032E1ABC4BCEBE5EFC104BC19A117BFEF4478605", false},
{"sha512 with base16 digest", "sha512:12345", "Some content", Base16, "sha512", "sha512:B20D977718ED67F2BF7620EE2D982FD850C4883EC8D048440FE7B6A86CF6322FD791C47B0C7469DBEEF3E339032E1ABC4BCEBE5EFC104BC19A117BFEF4478605", false},
{"sha512", "sha512", "Some content", Base16, "sha512", "sha512:b20d977718ed67f2bf7620ee2d982fd850c4883ec8d048440fe7b6a86cf6322fd791c47b0c7469dbeef3e339032e1abc4bcebe5efc104bc19a117bfef4478605", false},
{"sha512 with base16 digest", "sha512:12345", "Some content", Base16, "sha512", "sha512:b20d977718ed67f2bf7620ee2d982fd850c4883ec8d048440fe7b6a86cf6322fd791c47b0c7469dbeef3e339032e1abc4bcebe5efc104bc19a117bfef4478605", false},
{"sha512 with base32 digest", "sha512:12345", "Some content", Base32, "sha512", "sha512:WIGZO5YY5VT7FP3WEDXC3GBP3BIMJCB6ZDIEQRAP463KQ3HWGIX5PEOEPMGHI2O353Z6GOIDFYNLYS6OXZPPYECLYGNBC6766RDYMBI=", false},
{"sha512 with base64 digest", "sha512:12345", "Some content", Base64, "sha512", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ==", false},
{"sha-512 with base64 digest", "sha512:12345", "Some content", Base64, "sha512", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ==", false},
{"unknown algorithm", "mysecret:12345", "Some content", Base16, "mysecret", "mysecret:123", true},
{"unknown algorithm with digest", "mysecret:12345", "Some content", Base16, "mysecret", "mysecret:123", true},
}
Expand Down Expand Up @@ -81,42 +84,48 @@ func Test_digest_validate(t *testing.T) {
wantValid bool
}{
{"md5", "Some content", "md5", false},
{"md5 with base16 digest", "Some content", "md5:B53227DA4280F0E18270F21DD77C91D0", true},
{"md5 with base16 digest", "Some content", "md5:b53227da4280f0e18270f21dd77c91d0", true},
{"md5 with base32 digest", "Some content", "md5:WUZCPWSCQDYODATQ6IO5O7ER2A======", true},
{"md5 with base64 digest", "Some content", "md5:tTIn2kKA8OGCcPId13yR0A==", true},
{"md5 with wrong digest", "Some content", "md5:123", false},
{"sha1", "Some content", "sha1", false},
{"sha1 with base16 digest", "Some content", "sha1:9F1A6ECF74E9F9B1AE52E8EB581D420E63E8453A", true},
{"sha1 with base16 digest", "Some content", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", true},
{"SHA-1 with base16 digest", "Some content", "SHA-1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", true},
{"sha1 with base32 digest", "Some content", "sha1:T4NG5T3U5H43DLSS5DVVQHKCBZR6QRJ2", true},
{"sha1 with base64 digest", "Some content", "sha1:nxpuz3Tp+bGuUujrWB1CDmPoRTo=", true},
{"sha1 with wrong digest", "Some content", "sha1:123", false},
{"sha256", "Some content", "sha256", false},
{"sha256 with base16 digest", "Some content", "sha256:9C6609FC5111405EA3F5BB3D1F6B5A5EFD19A0CEC53D85893FD96D265439CD5B", true},
{"sha256 with base16 digest", "Some content", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", true},
{"SHA-256 with base16 digest", "Some content", "SHA-256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", true},
{"sha256 with base32 digest", "Some content", "sha256:TRTAT7CRCFAF5I7VXM6R6222L36RTIGOYU6YLCJ73FWSMVBZZVNQ====", true},
{"sha256 with base64 digest", "Some content", "sha256:nGYJ/FERQF6j9bs9H2taXv0ZoM7FPYWJP9ltJlQ5zVs=", true},
{"sha256 with wrong digest", "Some content", "sha256:123", false},
{"sha512", "Some content", "sha512", false},
{"sha512 with base16 digest", "Some content", "sha512:B20D977718ED67F2BF7620EE2D982FD850C4883EC8D048440FE7B6A86CF6322FD791C47B0C7469DBEEF3E339032E1ABC4BCEBE5EFC104BC19A117BFEF4478605", true},
{"sha512 with base16 digest", "Some content", "sha512:b20d977718ed67f2bf7620ee2d982fd850c4883ec8d048440fe7b6a86cf6322fd791c47b0c7469dbeef3e339032e1abc4bcebe5efc104bc19a117bfef4478605", true},
{"sha512 with base32 digest", "Some content", "sha512:WIGZO5YY5VT7FP3WEDXC3GBP3BIMJCB6ZDIEQRAP463KQ3HWGIX5PEOEPMGHI2O353Z6GOIDFYNLYS6OXZPPYECLYGNBC6766RDYMBI=", true},
{"sha512 with base64 digest", "Some content", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ==", true},
{"sha512 with wrong digest", "Some content", "sha512:123", false},
{"lovercase base16 encoding", "Some content", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", true},
{"uppercase base16 encoding", "Some content", "sha1:9F1A6ECF74E9F9B1AE52E8EB581D420E63E8453A", true},
{"lovercase base32 encoding", "Some content", "sha1:t4ng5t3u5h43dlss5dvvqhkcbzr6qrj2", true},
{"lovercase base64 encoding", "Some content", "sha1:nxpuz3tp+bguuujrwb1cdmporto=", false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d, _ := newDigest(tt.digestString, unknown)

assert := assert.New(t)
_, err := d.Write([]byte(tt.input))

d, err := newDigest(tt.digestString, unknown)
assert.NoError(err)
assert.NotNil(d)

_, err = d.Write([]byte(tt.input))
assert.NoError(err)

err = d.validate()
if !tt.wantValid {
assert.Error(err)
} else {
assert.NoError(err)
//assert.Equal(tt.digestString, d.format())
}
})
}
Expand Down
4 changes: 2 additions & 2 deletions example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func ExampleUnmarshaler() {
"WARC-Filename: temp-20170306040353.warc.gz\r\n" +
"WARC-Type: warcinfo\r\n" +
"Content-Type: application/warc-fields\r\n" +
"Warc-Block-Digest: sha1:AF4D582B4FFC017D07A947D841E392A821F754F3\r\n" +
"Warc-Block-Digest: sha1:af4d582b4ffc017d07a947d841e392a821f754f3\r\n" +
"Content-Length: 34\r\n" +
"\r\n" +
"format: WARC File Format 1.1\r\n" +
Expand All @@ -68,7 +68,7 @@ func ExampleUnmarshaler() {
// Output: Offset: 2, WARC record: version: WARC/1.1, type: warcinfo, id: urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008
// gowarc: Validation errors:
// 1: gowarc: record was found 2 bytes after expected offset
// 2: block: wrong digest: expected sha1:AF4D582B4FFC017D07A947D841E392A821F754F3, computed: sha1:8A936F9FD60D664CF95B1FFB40F1C4093E65BB40
// 2: block: wrong digest: expected sha1:af4d582b4ffc017d07a947d841e392a821f754f3, computed: sha1:8a936f9fd60d664cf95b1ffb40f1c4093e65bb40
}

func ExampleNewWarcFileWriter() {
Expand Down
Loading

0 comments on commit 93e6bd1

Please sign in to comment.