Skip to content

Commit

Permalink
Http block disable validate (#72)
Browse files Browse the repository at this point in the history
Let block validation be optional
  • Loading branch information
johnerikhalse authored Jan 24, 2024
1 parent 93e6bd1 commit 69ba622
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 13 deletions.
15 changes: 8 additions & 7 deletions httpblock.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"errors"
"fmt"
"io"
"io/ioutil"
"net/http"

"github.com/nlnwa/gowarc/internal/diskbuffer"
Expand Down Expand Up @@ -104,7 +103,7 @@ func (block *httpRequestBlock) BlockDigest() string {
if block.filterReader == nil {
block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest)
}
_, _ = io.Copy(ioutil.Discard, block.filterReader)
_, _ = io.Copy(io.Discard, block.filterReader)
block.blockDigestString = block.blockDigest.format()
block.payloadDigestString = block.payloadDigest.format()
}
Expand Down Expand Up @@ -239,7 +238,7 @@ func (block *httpResponseBlock) BlockDigest() string {
if block.filterReader == nil {
block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest)
}
_, _ = io.Copy(ioutil.Discard, block.filterReader)
_, _ = io.Copy(io.Discard, block.filterReader)
block.blockDigestString = block.blockDigest.format()
block.payloadDigestString = block.payloadDigest.format()
}
Expand Down Expand Up @@ -401,8 +400,9 @@ func newHttpBlock(opts *warcRecordOptions, wf *WarcFields, r io.Reader, blockDig
// We have to fix the header for parsing even if we don't fix the record
hb = append(hb, '\r', '\n')
}
if err := resp.parseHeaders(hb); err != nil && opts.errSyntax > ErrIgnore {
if opts.errSyntax == ErrWarn {
if err := resp.parseHeaders(hb); err != nil && opts.errBlock > ErrIgnore {
err = fmt.Errorf("error in http response block: %w", err)
if opts.errBlock == ErrWarn {
validation.addError(err)
} else {
return resp, err
Expand All @@ -422,8 +422,9 @@ func newHttpBlock(opts *warcRecordOptions, wf *WarcFields, r io.Reader, blockDig
// We have to fix the header for parsing even if we don't fix the record
hb = append(hb, '\r', '\n')
}
if err := resp.parseHeaders(hb); err != nil && opts.errSyntax > ErrIgnore {
if opts.errSyntax == ErrWarn {
if err := resp.parseHeaders(hb); err != nil && opts.errBlock > ErrIgnore {
err = fmt.Errorf("error in http request block: %w", err)
if opts.errBlock == ErrWarn {
validation.addError(err)
} else {
return resp, err
Expand Down
15 changes: 14 additions & 1 deletion options.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ type warcRecordOptions struct {
errSyntax errorPolicy
errSpec errorPolicy
errUnknownRecordType errorPolicy
errBlock errorPolicy
skipParseBlock bool
addMissingRecordId bool
recordIdFunc func() (string, error)
Expand Down Expand Up @@ -82,6 +83,7 @@ func defaultWarcRecordOptions() warcRecordOptions {
errSyntax: ErrWarn,
errSpec: ErrWarn,
errUnknownRecordType: ErrWarn,
errBlock: ErrIgnore,
skipParseBlock: false,
addMissingRecordId: true,
recordIdFunc: defaultIdGenerator,
Expand Down Expand Up @@ -141,6 +143,17 @@ func WithUnknownRecordTypePolicy(policy errorPolicy) WarcRecordOption {
})
}

// WithBlockErrorPolicy returns an option that selects how errors detected
// while parsing a record's block are treated.
//
// For most records the block is the content fetched from the original source,
// so errors found there should normally be ignored. With ErrWarn the block
// errors are added to the record's validation result; with a stricter policy
// the error is returned to the caller instead.
//
// Defaults to ErrIgnore.
func WithBlockErrorPolicy(policy errorPolicy) WarcRecordOption {
	// The closure captures the requested policy and stores it on the
	// options struct when the option is applied.
	applyPolicy := func(opts *warcRecordOptions) {
		opts.errBlock = policy
	}
	return newFuncWarcRecordOption(applyPolicy)
}

// WithAddMissingRecordId sets if missing WARC-Record-ID header should be generated.
//
// defaults to true
Expand Down Expand Up @@ -239,7 +252,7 @@ func WithFixSyntaxErrors(fixSyntaxErrors bool) WarcRecordOption {

// WithFixWarcFieldsBlockErrors sets if an attempt to fix syntax errors in warcfields block should be done when those are detected.
//
// # This will not have any impact if SyntaxErrorPolicy is ErrIgnore
// A warcfields block is typically generated by a web crawler. An error in this context suggests a potential bug in the crawler's WARC writer.
//
// defaults to false
func WithFixWarcFieldsBlockErrors(fixWarcFieldsBlockErrors bool) WarcRecordOption {
Expand Down
94 changes: 93 additions & 1 deletion unmarshaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,97 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
WithAddMissingRecordId(false),
WithFixContentLength(false),
},
"WARC/1.0\r\n" +
"WARC-Date: 2017-03-06T04:03:53Z\r\n" +
"WARC-Record-ID: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>\r\n" +
"WARC-Type: metadata\r\n" +
"Content-Type: application/warc-fields\r\n" +
"Content-Length: 18\r\n" +
"WARC-Block-Digest: sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY\r\n" +
"\r\n" +
"foo: bar\n" +
"food:bar\n" +
"\r\n" +
"\r\n",
want{
V1_0,
Metadata,
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcType, Value: "metadata"},
&nameValue{Name: ContentType, Value: "application/warc-fields"},
&nameValue{Name: ContentLength, Value: "18"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY"},
},
&warcFieldsBlock{},
"foo: bar\nfood:bar\n",
&Validation{},
true,
},
0,
false,
},
{
"metadata record missing carriage return in warc-fields block with fix syntax errors",
[]WarcRecordOption{
WithSpecViolationPolicy(ErrWarn),
WithSyntaxErrorPolicy(ErrWarn),
WithAddMissingDigest(true),
WithFixSyntaxErrors(true),
WithFixDigest(true),
WithAddMissingContentLength(false),
WithAddMissingRecordId(false),
WithFixContentLength(true),
WithFixWarcFieldsBlockErrors(true),
},
"WARC/1.0\r\n" +
"WARC-Date: 2017-03-06T04:03:53Z\r\n" +
"WARC-Record-ID: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>\r\n" +
"WARC-Type: metadata\r\n" +
"Content-Type: application/warc-fields\r\n" +
"Content-Length: 18\r\n" +
"WARC-Block-Digest: sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY\r\n" +
"\r\n" +
"foo: bar\n" +
"food:bar\n" +
"\r\n" +
"\r\n",
want{
V1_0,
Metadata,
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcType, Value: "metadata"},
&nameValue{Name: ContentType, Value: "application/warc-fields"},
&nameValue{Name: ContentLength, Value: "21"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:U2AN4MFP7IITXSOLYH2QTIPVDNJOHBFO"},
},
&warcFieldsBlock{},
"Foo: bar\r\nFood: bar\r\n",
&Validation{
fmt.Errorf("content length mismatch. header: 18, actual: 21"),
fmt.Errorf("block: %w", fmt.Errorf("wrong digest: expected sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY, computed: sha1:U2AN4MFP7IITXSOLYH2QTIPVDNJOHBFO")),
},
true,
},
0,
false,
},
{
"metadata record missing carriage return in warc-fields block with BlockeErrorPolicy warn",
[]WarcRecordOption{
WithSpecViolationPolicy(ErrWarn),
WithSyntaxErrorPolicy(ErrWarn),
WithBlockErrorPolicy(ErrWarn),
WithAddMissingDigest(false),
WithFixSyntaxErrors(false),
WithFixDigest(false),
WithAddMissingContentLength(false),
WithAddMissingRecordId(false),
WithFixContentLength(false),
},
"WARC/1.0\r\n" +
"WARC-Date: 2017-03-06T04:03:53Z\r\n" +
"WARC-Record-ID: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>\r\n" +
Expand Down Expand Up @@ -619,10 +710,11 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
false,
},
{
"metadata record missing carriage return in warc-fields block with fix syntax errors",
"metadata record missing carriage return in warc-fields block with fix syntax errors and BlockeErrorPolicy warn",
[]WarcRecordOption{
WithSpecViolationPolicy(ErrWarn),
WithSyntaxErrorPolicy(ErrWarn),
WithBlockErrorPolicy(ErrWarn),
WithAddMissingDigest(true),
WithFixSyntaxErrors(true),
WithFixDigest(true),
Expand Down
18 changes: 14 additions & 4 deletions warcfieldsblock.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ func (block *warcFieldsBlock) Write(w io.Writer) (bytesWritten int64, err error)
return
}

func newWarcFieldsBlock(options *warcRecordOptions, wf *WarcFields, rb io.Reader, d *digest, validation *Validation) (WarcFieldsBlock, error) {
func newWarcFieldsBlock(options *warcRecordOptions, _ *WarcFields, rb io.Reader, d *digest, validation *Validation) (WarcFieldsBlock, error) {
wfb := &warcFieldsBlock{blockDigest: d}
var err error
wfb.content, err = io.ReadAll(rb)
Expand All @@ -95,11 +95,21 @@ func newWarcFieldsBlock(options *warcRecordOptions, wf *WarcFields, rb io.Reader
p := &warcfieldsParser{options}
blockValidation := Validation{}
wfb.warcFields, err = p.Parse(bufio.NewReader(bytes.NewReader(wfb.content)), &blockValidation, &position{})
for _, e := range blockValidation {
validation.addError(newWrappedSyntaxError("error in warc fields block", nil, e))
if options.errBlock > ErrIgnore && !blockValidation.Valid() {
switch options.errBlock {
case ErrWarn:
for _, e := range blockValidation {
validation.addError(newWrappedSyntaxError("error in warc fields block", nil, e))
}
case ErrFail:
if !blockValidation.Valid() {
err = newWrappedSyntaxError("error in warc fields block", nil, blockValidation[0])
return wfb, err
}
}
}

if !blockValidation.Valid() && options.fixWarcFieldsBlockErrors {
if options.fixWarcFieldsBlockErrors && !blockValidation.Valid() {
// Write corrected warc fields block to content buffer
b := bytes.Buffer{}
_, err = wfb.WarcFields().Write(&b)
Expand Down

0 comments on commit 69ba622

Please sign in to comment.