Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor! #160

Merged
merged 1 commit into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
*.warc.gz filter=lfs diff=lfs merge=lfs -text
testdata/nedlib/nb-image/b863a630196bce1a15ca86b40f34a2d5 filter=lfs diff=lfs merge=lfs -text
testdata/nedlib/nb-image/e4a2d28bdf4c38b8f6f291f7c8c958d5 filter=lfs diff=lfs merge=lfs -text
*.meta filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.warc filter=lfs diff=lfs merge=lfs -text
*.arc filter=lfs diff=lfs merge=lfs -text
*.meta filter=lfs diff=lfs merge=lfs -text
/testdata/** filter=lfs diff=lfs merge=lfs -text
8 changes: 8 additions & 0 deletions .github/workflows/gh-pages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@ jobs:
fetch-depth: 0 # Fetch all history for .GitInfo and .Lastmod
lfs: true

- name: Setup go
uses: actions/setup-go@v5
with:
go-version: stable

- name: Generate documentation
run: go generate ./docs

- name: Setup Hugo
uses: peaceiris/actions-hugo@v3
with:
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Default binary
warchaeology

# Binaries for programs and plugins
*.exe
*.exe~
Expand Down
1 change: 0 additions & 1 deletion .golangci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ linters:
- containedctx
- cyclop
- depguard
- errname
- errorlint
- err113
- exhaustive
Expand Down
15 changes: 6 additions & 9 deletions .goreleaser.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
# This is an example .goreleaser.yml file with some sensible defaults.
# Make sure to check the documentation at https://goreleaser.com
# yaml-language-server: $schema=https://goreleaser.com/static/schema.json
version: 2

before:
hooks:
# You may remove this if you don't use go modules.
- go mod tidy
# you may remove this if you don't need go generate
- go generate ./...
- ./script/completions.sh

- mkdir -p completions
- sh -c "for sh in bash zsh fish; do go run . completion "$sh" > "completions/warc.$sh"; done"

builds:
- env:
Expand Down Expand Up @@ -76,7 +73,7 @@ checksum:
name_template: 'checksums.txt'

snapshot:
name_template: "{{ incpatch .Version }}-next"
version_template: "{{ incpatch .Version }}-next"

changelog:
use: github-native
Expand Down
6 changes: 4 additions & 2 deletions arcreader/arcreader.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,14 @@ func (wf *ArcFileReader) Next() (gowarc.WarcRecord, int64, *gowarc.Validation, e
}
}
wf.offset = wf.initialOffset + wf.countingReader.N() - int64(wf.bufferedReader.Buffered())
fs, _ := wf.file.Stat()
fs, err := wf.file.Stat()
if err != nil {
return nil, wf.offset, validation, err
}
if fs.Size() <= wf.offset {
wf.offset = 0
}

var err error
var recordOffset int64
wf.currentRecord, recordOffset, validation, err = wf.warcReader.Unmarshal(wf.bufferedReader)
return wf.currentRecord, wf.offset + recordOffset, validation, err
Expand Down
53 changes: 53 additions & 0 deletions arcreader/arcreader_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package arcreader

import (
"context"
"path/filepath"
"testing"

"github.com/nlnwa/gowarc"
"github.com/nlnwa/warchaeology/internal/warc"
"github.com/spf13/afero"
)

// Shared fixtures for the tests in this package.
var (
	// testDataDir is the test data directory, expressed relative to this
	// package's directory.
	testDataDir = filepath.Join("..", "testdata")

	// testFiles maps a test case name to the path of the archive file
	// that the test case reads.
	testFiles = map[string]string{
		// from https://archive.org/details/SAMPLE_ARC_WHITEHOUSE
		"sample_arc_whitehouse": filepath.Join(testDataDir, "arc", "ARC-SAMPLE-20060928223931-00000-gojoblack.arc.gz"),
	}
)

// TestArcReader opens each named ARC fixture with NewArcFileReader, using
// strict validation and a per-test temporary buffer directory, then ranges
// over every record produced by warc.NewIterator and reports any record that
// carries an error.
func TestArcReader(t *testing.T) {
	cases := []struct {
		name string
	}{
		{name: "sample_arc_whitehouse"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// The fixture table holds relative paths; make the path absolute
			// before handing it to the reader.
			absPath, err := filepath.Abs(testFiles[tc.name])
			if err != nil {
				t.Fatal(err)
			}

			// The OS filesystem is wrapped read-only for the reader.
			readOnlyFs := afero.NewReadOnlyFs(afero.NewOsFs())
			reader, err := NewArcFileReader(readOnlyFs, absPath, 0,
				gowarc.WithBufferTmpDir(t.TempDir()),
				gowarc.WithStrictValidation(),
			)
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}

			// NOTE(review): the trailing nil, 0, 0 arguments are presumably
			// "no filter / no offset / no limit" — confirm against warc.NewIterator.
			records := warc.NewIterator(context.Background(), reader, nil, 0, 0)
			for rec := range records {
				if rec.Err == nil {
					continue
				}
				// Errorf rather than Fatalf: keep going so every bad record is reported.
				t.Errorf("unexpected error: %v", rec.Err)
			}
		})
	}
}
66 changes: 34 additions & 32 deletions arcreader/unmarshaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package arcreader
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"regexp"
Expand All @@ -12,7 +13,7 @@ import (

"github.com/klauspost/compress/gzip"
"github.com/nlnwa/gowarc"
"github.com/nlnwa/warchaeology/internal"
mytime "github.com/nlnwa/warchaeology/internal/time"
)

type unmarshaler struct {
Expand All @@ -35,44 +36,45 @@ func (u *unmarshaler) Unmarshal(b *bufio.Reader) (gowarc.WarcRecord, int64, *gow
isGzip, r, offset, err := u.searchNextRecord(b)

defer func() {
if r != nil {
// Discarding 1 byte which makes up the end of record marker (\n)
var lf byte = '\n'
bb, e := r.Peek(4)
if len(bb) == 0 {
err = fmt.Errorf("wrong peek: %d, %v", len(bb), e)
} else {
if len(bb) != 1 || bb[0] != lf || (e != nil && e != io.EOF) {
err = fmt.Errorf("wrong peek: %d, %q, %v", len(bb), bb[0], e)
}
_, _ = r.Discard(1)
if r == nil {
return
}
// Discarding 1 byte which makes up the end of record marker (\n)
var lf byte = '\n'
bb, e := r.Peek(4)
if len(bb) == 0 {
err = fmt.Errorf("wrong peek: %d, %w", len(bb), e)
} else {
if len(bb) != 1 || bb[0] != lf || (e != nil && e != io.EOF) {
err = fmt.Errorf("wrong peek: %d, %q, %w", len(bb), bb[0], e)
}
_, _ = r.Discard(1)
}

if isGzip {
// Empty gzip reader to ensure gzip checksum is validated
b := make([]byte, 10)
var err error
for err == nil {
_, err = u.gz.Read(b)
if err == io.EOF || err == io.ErrUnexpectedEOF {
return
}
if isGzip {
// Empty gzip reader to ensure gzip checksum is validated
b := make([]byte, 10)
var err error
for err == nil {
_, err = u.gz.Read(b)
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
return
}
_ = u.gz.Close()
}
_ = u.gz.Close()
}
}()

if err == io.EOF {
return nil, offset, validation, err
}
if err != nil {
return nil, offset, validation, fmt.Errorf("Could not parse ARC record: %w", err)
return nil, offset, validation, fmt.Errorf("could not parse ARC record: %w", err)
}

l, err := r.ReadString('\n')
if err != nil {
return nil, 0, nil, fmt.Errorf("Could not parse ARC record: %w", err)
return nil, 0, nil, fmt.Errorf("could not parse ARC record: %w", err)
}

var wr gowarc.WarcRecord
Expand Down Expand Up @@ -151,13 +153,13 @@ func (u *unmarshaler) parseFileHeader(r *bufio.Reader, l1 string) (gowarc.WarcRe
var read int
l2, err := r.ReadString('\n')
if err != nil {
return nil, nil, fmt.Errorf("Could not parse ARC file header")
return nil, nil, fmt.Errorf("could not parse ARC file header")
}
read += len(l2)
i := strings.IndexByte(l2, ' ')
v, err := strconv.Atoi(l2[:i])
if err != nil {
return nil, nil, fmt.Errorf("Could not parse version from ARC file header: %w", err)
return nil, nil, fmt.Errorf("could not parse version from ARC file header: %w", err)
}
u.version = v

Expand All @@ -173,12 +175,12 @@ func (u *unmarshaler) parseFileHeader(r *bufio.Reader, l1 string) (gowarc.WarcRe
return nil, nil, err
}
default:
return nil, nil, fmt.Errorf("Uknown ARC record version: %d", v)
return nil, nil, fmt.Errorf("unknown ARC record version: %d", v)
}

l3, err := r.ReadString('\n')
if err != nil {
return nil, nil, fmt.Errorf("Could not parse ARC record: %w", err)
return nil, nil, fmt.Errorf("could not parse ARC record: %w", err)
}
read += len(l3)
remaining := length - int64(read)
Expand Down Expand Up @@ -222,7 +224,7 @@ func (u *unmarshaler) parseRecord(r *bufio.Reader, l1 string) (gowarc.WarcRecord
return nil, nil, err
}
default:
return nil, nil, fmt.Errorf("Uknown ARC record version: %d", u.version)
return nil, nil, fmt.Errorf("unknown ARC record version: %d", u.version)
}

rb := gowarc.NewRecordBuilder(0, u.opts...)
Expand Down Expand Up @@ -250,19 +252,19 @@ func (u *unmarshaler) parseUrlRecordV1(l string) (gowarc.RecordType, string, str
reg := regexp.MustCompile(`([^ ]*) ([^ ]*) (\d*) ([^ ]*) (\d*)`)
subs := reg.FindStringSubmatch(l)
if subs == nil || len(subs) < 4 {
return 0, "", "", time.Time{}, "", 0, fmt.Errorf("Could not parse ARC record from: %s", l)
return 0, "", "", time.Time{}, "", 0, fmt.Errorf("could not parse ARC record from: %s", l)
}
url := subs[1]
ip := subs[2]
d := subs[3]
date, err := internal.From14ToTime(d)
date, err := mytime.From14ToTime(d)
if err != nil {
return 0, "", "", time.Time{}, "", 0, err
}
contentType := subs[4]
length, err := strconv.ParseInt(subs[5], 10, 64)
if err != nil {
return 0, "", "", time.Time{}, "", 0, fmt.Errorf("Could not parse ARC record: %w", err)
return 0, "", "", time.Time{}, "", 0, fmt.Errorf("could not parse ARC record: %w", err)
}

recordType := gowarc.Response
Expand Down
Loading
Loading