Skip to content

Commit

Permalink
Merge pull request #160 from nlnwa/refactor/everything
Browse files Browse the repository at this point in the history
refactor!
  • Loading branch information
maeb authored Sep 3, 2024
2 parents e864afd + a30cac0 commit ba0e02b
Show file tree
Hide file tree
Showing 131 changed files with 5,451 additions and 3,484 deletions.
8 changes: 4 additions & 4 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
*.warc.gz filter=lfs diff=lfs merge=lfs -text
testdata/nedlib/nb-image/b863a630196bce1a15ca86b40f34a2d5 filter=lfs diff=lfs merge=lfs -text
testdata/nedlib/nb-image/e4a2d28bdf4c38b8f6f291f7c8c958d5 filter=lfs diff=lfs merge=lfs -text
*.meta filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.warc filter=lfs diff=lfs merge=lfs -text
*.arc filter=lfs diff=lfs merge=lfs -text
*.meta filter=lfs diff=lfs merge=lfs -text
/testdata/** filter=lfs diff=lfs merge=lfs -text
8 changes: 8 additions & 0 deletions .github/workflows/gh-pages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@ jobs:
fetch-depth: 0 # Fetch all history for .GitInfo and .Lastmod
lfs: true

- name: Setup go
uses: actions/setup-go@v5
with:
go-version: stable

- name: Generate documentation
run: go generate ./docs

- name: Setup Hugo
uses: peaceiris/actions-hugo@v3
with:
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Default binary
warchaeology

# Binaries for programs and plugins
*.exe
*.exe~
Expand Down
1 change: 0 additions & 1 deletion .golangci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ linters:
- containedctx
- cyclop
- depguard
- errname
- errorlint
- err113
- exhaustive
Expand Down
15 changes: 6 additions & 9 deletions .goreleaser.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
# This is an example .goreleaser.yml file with some sensible defaults.
# Make sure to check the documentation at https://goreleaser.com
# yaml-language-server: $schema=https://goreleaser.com/static/schema.json
version: 2

before:
hooks:
# You may remove this if you don't use go modules.
- go mod tidy
# you may remove this if you don't need go generate
- go generate ./...
- ./script/completions.sh

- mkdir -p completions
- sh -c "for sh in bash zsh fish; do go run . completion "$sh" > "completions/warc.$sh"; done"

builds:
- env:
Expand Down Expand Up @@ -76,7 +73,7 @@ checksum:
name_template: 'checksums.txt'

snapshot:
name_template: "{{ incpatch .Version }}-next"
version_template: "{{ incpatch .Version }}-next"

changelog:
use: github-native
Expand Down
6 changes: 4 additions & 2 deletions arcreader/arcreader.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,14 @@ func (wf *ArcFileReader) Next() (gowarc.WarcRecord, int64, *gowarc.Validation, e
}
}
wf.offset = wf.initialOffset + wf.countingReader.N() - int64(wf.bufferedReader.Buffered())
fs, _ := wf.file.Stat()
fs, err := wf.file.Stat()
if err != nil {
return nil, wf.offset, validation, err
}
if fs.Size() <= wf.offset {
wf.offset = 0
}

var err error
var recordOffset int64
wf.currentRecord, recordOffset, validation, err = wf.warcReader.Unmarshal(wf.bufferedReader)
return wf.currentRecord, wf.offset + recordOffset, validation, err
Expand Down
53 changes: 53 additions & 0 deletions arcreader/arcreader_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package arcreader

import (
"context"
"path/filepath"
"testing"

"github.com/nlnwa/gowarc"
"github.com/nlnwa/warchaeology/internal/warc"
"github.com/spf13/afero"
)

// Shared fixture locations for the tests in this package.
var (
	// testDataDir is the test data directory, expressed relative to this package.
	testDataDir = filepath.Join("..", "testdata")

	// testFiles maps a test case name to the path of its sample file.
	testFiles = map[string]string{
		// from https://archive.org/details/SAMPLE_ARC_WHITEHOUSE
		"sample_arc_whitehouse": filepath.Join(testDataDir, "arc", "ARC-SAMPLE-20060928223931-00000-gojoblack.arc.gz"),
	}
)

// TestArcReader opens each named sample file with NewArcFileReader, using
// strict validation and a per-test temporary buffer directory, then walks
// every record produced by the iterator and reports any record whose Err
// field is set.
func TestArcReader(t *testing.T) {
	// Each name is both the subtest name and the key into testFiles.
	names := []string{"sample_arc_whitehouse"}

	for _, name := range names {
		t.Run(name, func(t *testing.T) {
			// resolve test file path
			absPath, absErr := filepath.Abs(testFiles[name])
			if absErr != nil {
				t.Fatal(absErr)
			}

			// The sample file is only read, so the OS filesystem is wrapped read-only.
			roFs := afero.NewReadOnlyFs(afero.NewOsFs())
			reader, openErr := NewArcFileReader(roFs, absPath, 0,
				gowarc.WithBufferTmpDir(t.TempDir()),
				gowarc.WithStrictValidation(),
			)
			if openErr != nil {
				t.Fatalf("unexpected error: %v", openErr)
			}

			// Errorf (not Fatalf) so that every failing record is reported.
			records := warc.NewIterator(context.Background(), reader, nil, 0, 0)
			for rec := range records {
				if rec.Err == nil {
					continue
				}
				t.Errorf("unexpected error: %v", rec.Err)
			}
		})
	}
}
66 changes: 34 additions & 32 deletions arcreader/unmarshaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package arcreader
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"regexp"
Expand All @@ -12,7 +13,7 @@ import (

"github.com/klauspost/compress/gzip"
"github.com/nlnwa/gowarc"
"github.com/nlnwa/warchaeology/internal"
mytime "github.com/nlnwa/warchaeology/internal/time"
)

type unmarshaler struct {
Expand All @@ -35,44 +36,45 @@ func (u *unmarshaler) Unmarshal(b *bufio.Reader) (gowarc.WarcRecord, int64, *gow
isGzip, r, offset, err := u.searchNextRecord(b)

defer func() {
if r != nil {
// Discarding 1 byte which makes up the end of record marker (\n)
var lf byte = '\n'
bb, e := r.Peek(4)
if len(bb) == 0 {
err = fmt.Errorf("wrong peek: %d, %v", len(bb), e)
} else {
if len(bb) != 1 || bb[0] != lf || (e != nil && e != io.EOF) {
err = fmt.Errorf("wrong peek: %d, %q, %v", len(bb), bb[0], e)
}
_, _ = r.Discard(1)
if r == nil {
return
}
// Discarding 1 byte which makes up the end of record marker (\n)
var lf byte = '\n'
bb, e := r.Peek(4)
if len(bb) == 0 {
err = fmt.Errorf("wrong peek: %d, %w", len(bb), e)
} else {
if len(bb) != 1 || bb[0] != lf || (e != nil && e != io.EOF) {
err = fmt.Errorf("wrong peek: %d, %q, %w", len(bb), bb[0], e)
}
_, _ = r.Discard(1)
}

if isGzip {
// Empty gzip reader to ensure gzip checksum is validated
b := make([]byte, 10)
var err error
for err == nil {
_, err = u.gz.Read(b)
if err == io.EOF || err == io.ErrUnexpectedEOF {
return
}
if isGzip {
// Empty gzip reader to ensure gzip checksum is validated
b := make([]byte, 10)
var err error
for err == nil {
_, err = u.gz.Read(b)
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
return
}
_ = u.gz.Close()
}
_ = u.gz.Close()
}
}()

if err == io.EOF {
return nil, offset, validation, err
}
if err != nil {
return nil, offset, validation, fmt.Errorf("Could not parse ARC record: %w", err)
return nil, offset, validation, fmt.Errorf("could not parse ARC record: %w", err)
}

l, err := r.ReadString('\n')
if err != nil {
return nil, 0, nil, fmt.Errorf("Could not parse ARC record: %w", err)
return nil, 0, nil, fmt.Errorf("could not parse ARC record: %w", err)
}

var wr gowarc.WarcRecord
Expand Down Expand Up @@ -151,13 +153,13 @@ func (u *unmarshaler) parseFileHeader(r *bufio.Reader, l1 string) (gowarc.WarcRe
var read int
l2, err := r.ReadString('\n')
if err != nil {
return nil, nil, fmt.Errorf("Could not parse ARC file header")
return nil, nil, fmt.Errorf("could not parse ARC file header")
}
read += len(l2)
i := strings.IndexByte(l2, ' ')
v, err := strconv.Atoi(l2[:i])
if err != nil {
return nil, nil, fmt.Errorf("Could not parse version from ARC file header: %w", err)
return nil, nil, fmt.Errorf("could not parse version from ARC file header: %w", err)
}
u.version = v

Expand All @@ -173,12 +175,12 @@ func (u *unmarshaler) parseFileHeader(r *bufio.Reader, l1 string) (gowarc.WarcRe
return nil, nil, err
}
default:
return nil, nil, fmt.Errorf("Uknown ARC record version: %d", v)
return nil, nil, fmt.Errorf("unknown ARC record version: %d", v)
}

l3, err := r.ReadString('\n')
if err != nil {
return nil, nil, fmt.Errorf("Could not parse ARC record: %w", err)
return nil, nil, fmt.Errorf("could not parse ARC record: %w", err)
}
read += len(l3)
remaining := length - int64(read)
Expand Down Expand Up @@ -222,7 +224,7 @@ func (u *unmarshaler) parseRecord(r *bufio.Reader, l1 string) (gowarc.WarcRecord
return nil, nil, err
}
default:
return nil, nil, fmt.Errorf("Uknown ARC record version: %d", u.version)
return nil, nil, fmt.Errorf("unknown ARC record version: %d", u.version)
}

rb := gowarc.NewRecordBuilder(0, u.opts...)
Expand Down Expand Up @@ -250,19 +252,19 @@ func (u *unmarshaler) parseUrlRecordV1(l string) (gowarc.RecordType, string, str
reg := regexp.MustCompile(`([^ ]*) ([^ ]*) (\d*) ([^ ]*) (\d*)`)
subs := reg.FindStringSubmatch(l)
if subs == nil || len(subs) < 4 {
return 0, "", "", time.Time{}, "", 0, fmt.Errorf("Could not parse ARC record from: %s", l)
return 0, "", "", time.Time{}, "", 0, fmt.Errorf("could not parse ARC record from: %s", l)
}
url := subs[1]
ip := subs[2]
d := subs[3]
date, err := internal.From14ToTime(d)
date, err := mytime.From14ToTime(d)
if err != nil {
return 0, "", "", time.Time{}, "", 0, err
}
contentType := subs[4]
length, err := strconv.ParseInt(subs[5], 10, 64)
if err != nil {
return 0, "", "", time.Time{}, "", 0, fmt.Errorf("Could not parse ARC record: %w", err)
return 0, "", "", time.Time{}, "", 0, fmt.Errorf("could not parse ARC record: %w", err)
}

recordType := gowarc.Response
Expand Down
Loading

0 comments on commit ba0e02b

Please sign in to comment.