Skip to content

Commit

Permalink
Exported function for validate digest
Browse files Browse the repository at this point in the history
  • Loading branch information
johnerikhalse committed Sep 2, 2021
1 parent 246889b commit 9cc7114
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 47 deletions.
74 changes: 53 additions & 21 deletions record.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ type WarcRecord interface {
ToRevisitRecord(ref *RevisitRef) (WarcRecord, error)
RevisitRef() (*RevisitRef, error)
Merge(record ...WarcRecord) (WarcRecord, error)
ValidateDigest(validation *Validation) error
}

type WarcVersion struct {
Expand Down Expand Up @@ -189,7 +190,7 @@ func (wr *warcRecord) ToRevisitRecord(ref *RevisitRef) (WarcRecord, error) {
}
case ProfileServerNotModified:
default:
return nil, fmt.Errorf("Unknown revisit profile")
return nil, fmt.Errorf("unknown revisit profile")
}

h.Set(WarcType, Revisit.String())
Expand Down Expand Up @@ -261,7 +262,7 @@ func (wr *warcRecord) Merge(record ...WarcRecord) (WarcRecord, error) {

b, ok := wr.block.(*revisitBlock)
if !ok {
return nil, fmt.Errorf("the revisit record's has wrong block type. Creation of record must be done with SkipParseBlock set to false.")
return nil, fmt.Errorf("the revisit record's has wrong block type. Creation of record must be done with SkipParseBlock set to false")
}
switch v := record[0].Block().(type) {
case *httpRequestBlock:
Expand Down Expand Up @@ -294,7 +295,16 @@ func (wr *warcRecord) Merge(record ...WarcRecord) (WarcRecord, error) {
return wr, nil
}

func (wr *warcRecord) parseBlock(reader io.Reader, blockDigest, payloadDigest *digest, validation *Validation) (err error) {
func (wr *warcRecord) parseBlock(reader io.Reader, validation *Validation) (err error) {
blockDigest, err := newDigestFromField(wr, WarcBlockDigest)
if err != nil {
return err
}
payloadDigest, err := newDigestFromField(wr, WarcPayloadDigest)
if err != nil {
return err
}

if !wr.opts.skipParseBlock {
contentType := strings.ToLower(wr.headers.Get(ContentType))
if wr.recordType&(Response|Resource|Request|Conversion|Continuation) != 0 {
Expand Down Expand Up @@ -329,25 +339,47 @@ func (wr *warcRecord) parseBlock(reader io.Reader, blockDigest, payloadDigest *d
return
}

func (wr *warcRecord) validateDigest(blockDigest, payloadDigest *digest, validation *Validation) error {
// ValidateDigest validates block and payload digests if present.
// If option FixDigest is set, an invalid or missing digest will be corrected in the header.
// If the record is not cached, it might not be possible to read any content from this record after validation.
func (wr *warcRecord) ValidateDigest(validation *Validation) error {
wr.Block().BlockDigest()
if blockDigest.hash == "" {
// Missing digest header is allowed, so skip validation. But if fixDigest option is set, a header will be added.
if wr.opts.fixDigest {
wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format())
return nil
}
} else {
if err := blockDigest.validate(); err != nil {
switch wr.opts.errSpec {
case ErrIgnore:
case ErrWarn:
validation.addError(err)
if wr.opts.fixDigest {
wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format())

var blockDigest, payloadDigest *digest
switch v := wr.Block().(type) {
case *genericBlock:
blockDigest = v.blockDigest
case *httpRequestBlock:
blockDigest = v.blockDigest
payloadDigest = v.payloadDigest
case *httpResponseBlock:
blockDigest = v.blockDigest
payloadDigest = v.payloadDigest
case *revisitBlock:
blockDigest = v.blockDigest
case *warcFieldsBlock:
blockDigest = v.blockDigest
}

if blockDigest != nil {
if blockDigest.hash == "" {
// Missing digest header is allowed, so skip validation. But if fixDigest option is set, a header will be added.
if wr.opts.fixDigest {
wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format())
return nil
}
} else {
if err := blockDigest.validate(); err != nil {
switch wr.opts.errSpec {
case ErrIgnore:
case ErrWarn:
validation.addError(err)
if wr.opts.fixDigest {
wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format())
}
case ErrFail:
return fmt.Errorf("wrong block digest " + err.Error())
}
case ErrFail:
return fmt.Errorf("wrong block digest " + err.Error())
}
}
}
Expand All @@ -358,7 +390,7 @@ func (wr *warcRecord) validateDigest(blockDigest, payloadDigest *digest, validat
return nil
}

if _, ok := wr.block.(PayloadBlock); ok {
if payloadDigest != nil {
if payloadDigest.hash == "" {
// Missing digest header is allowed, so skip validation. But if fixDigest option is set, a header will be added.
if wr.opts.fixDigest {
Expand Down
23 changes: 12 additions & 11 deletions recordbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,30 +46,40 @@ type recordBuilder struct {
content diskbuffer.Buffer
}

// Write implements io.Writer.
// The bytes in p are handed to the builder's content buffer; the byte count
// and error reported by that buffer are returned unchanged.
func (rb *recordBuilder) Write(p []byte) (n int, err error) {
	n, err = rb.content.Write(p)
	return n, err
}

// WriteString implements io.StringWriter.
// The string s is handed to the builder's content buffer; the byte count and
// error reported by that buffer are returned unchanged.
func (rb *recordBuilder) WriteString(s string) (n int, err error) {
	n, err = rb.content.WriteString(s)
	return n, err
}

// ReadFrom implements io.ReaderFrom.
// Reading from r is delegated to the builder's content buffer, so the data
// read ends up in the record's content block. The count and error reported by
// the buffer are returned unchanged.
func (rb *recordBuilder) ReadFrom(r io.Reader) (n int64, err error) {
	n, err = rb.content.ReadFrom(r)
	return n, err
}

// AddWarcHeader adds a new WARC header field with the given name and a string value to the record.
// The name/value pair is passed as-is to Add on the builder's header collection.
func (rb *recordBuilder) AddWarcHeader(name string, value string) {
rb.headers.Add(name, value)
}

// AddWarcHeaderInt adds a new WARC header field with the given name to the
// record. The int value is stored as its base-10 text form.
func (rb *recordBuilder) AddWarcHeaderInt(name string, value int) {
	text := strconv.Itoa(value)
	rb.headers.Add(name, text)
}

// AddWarcHeaderInt64 adds a new WARC header field with the given name to the
// record. The int64 value is stored as its base-10 text form.
func (rb *recordBuilder) AddWarcHeaderInt64(name string, value int64) {
	text := strconv.FormatInt(value, 10)
	rb.headers.Add(name, text)
}

// AddWarcHeaderTime adds a new WARC header field with the given name to the
// record. The time value is converted to UTC and stored in RFC 3339 format.
func (rb *recordBuilder) AddWarcHeaderTime(name string, value time.Time) {
	timestamp := value.UTC().Format(time.RFC3339)
	rb.headers.Add(name, timestamp)
}
Expand Down Expand Up @@ -107,21 +117,12 @@ func (rb *recordBuilder) Build() (WarcRecord, *Validation, error) {
return wr, validation, err
}

blockDigest, err := newDigestFromField(wr, WarcBlockDigest)
if err != nil {
return wr, validation, err
}
payloadDigest, err := newDigestFromField(wr, WarcPayloadDigest)
if err != nil {
return wr, validation, err
}

err = wr.parseBlock(rb.content, blockDigest, payloadDigest, validation)
err = wr.parseBlock(rb.content, validation)
if err != nil {
return wr, validation, err
}

err = wr.validateDigest(blockDigest, payloadDigest, validation)
err = wr.ValidateDigest(validation)

return wr, validation, err
}
Expand Down
5 changes: 4 additions & 1 deletion revisitblock.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
// revisitBlock is the block implementation that warcRecord.Merge expects to
// find on a revisit record (it type-asserts the record's block to *revisitBlock).
type revisitBlock struct {
// opts holds the record options handed to the constructor.
opts *warcRecordOptions
// headerBytes holds the raw bytes kept by the block; parseRevisitBlock fills
// it from the content buffer it reads.
headerBytes []byte
// blockDigest is the digest object for the block. It is set by both
// newRevisitBlock and parseRevisitBlock and is read by ValidateDigest.
blockDigest *digest
// blockDigestString is the formatted form of blockDigest (blockDigest.format()).
blockDigestString string
// payloadDigestString is presumably the formatted payload digest; its
// assignment is not visible in this view — TODO confirm.
payloadDigestString string
}
Expand Down Expand Up @@ -65,7 +66,7 @@ func (block *revisitBlock) Write(w io.Writer) (int64, error) {
return bytesWritten, err
}

// newRevisitBlock creates a from a PayloadBlock
// newRevisitBlock creates a revisitBlock from a PayloadBlock
func newRevisitBlock(opts *warcRecordOptions, src Block) (*revisitBlock, error) {
block := &revisitBlock{
opts: opts,
Expand All @@ -87,6 +88,7 @@ func newRevisitBlock(opts *warcRecordOptions, src Block) (*revisitBlock, error)
return nil, err
}
block.blockDigestString = blockDigest.format()
block.blockDigest = blockDigest

return block, nil
}
Expand All @@ -106,6 +108,7 @@ func parseRevisitBlock(opts *warcRecordOptions, r io.Reader, blockDigest *digest
block.headerBytes = content.Bytes()

block.blockDigestString = blockDigest.format()
block.blockDigest = blockDigest

return block, nil
}
16 changes: 4 additions & 12 deletions unmarshaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import (
"bytes"
"compress/gzip"
"fmt"
countingreader2 "github.com/nlnwa/gowarc/internal/countingreader"
"github.com/nlnwa/gowarc/internal/countingreader"
log "github.com/sirupsen/logrus"
"io"
"io/ioutil"
Expand Down Expand Up @@ -142,24 +142,16 @@ func (u *unmarshaler) Unmarshal(b *bufio.Reader) (WarcRecord, int64, *Validation

// Adding 4 bytes to length to include the end of record marker (\r\n\r\n)
// TODO: validate that record ends with correct marker
c2 := countingreader2.NewLimited(r, length+4)
c2 := countingreader.NewLimited(r, length+4)
record.closer = func() error {
_, err := io.Copy(ioutil.Discard, c2)
if g != nil {
g.Close()
_ = g.Close()
}
return err
}

blockDigest, err := newDigestFromField(record, WarcBlockDigest)
if err != nil {
return record, offset, validation, err
}
payloadDigest, err := newDigestFromField(record, WarcPayloadDigest)
if err != nil {
return record, offset, validation, err
}
err = record.parseBlock(bufio.NewReader(c2), blockDigest, payloadDigest, validation)
err = record.parseBlock(bufio.NewReader(c2), validation)

return record, offset, validation, err
}
Expand Down
21 changes: 19 additions & 2 deletions unmarshaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
"WARC-Filename: temp-20170306040353.warc.gz\r\n" +
"WARC-Type: warcinfo\r\n" +
"Content-Type: application/warc-fields\r\n" +
"Warc-Block-Digest: sha1:AF4D582B4FFC017D07A947D841E392A821F754F3\r\n" +
"Warc-Block-Digest: sha1:BBB3E40054DF0B7BA6DD470D2FA561722D9EDBAC\r\n" +
"Content-Length: 240\r\n" +
"\r\n" +
"software: Veidemann v1.0\r\n" +
Expand All @@ -70,7 +70,7 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcFilename, Value: "temp-20170306040353.warc.gz"},
&nameValue{Name: ContentType, Value: "application/warc-fields"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:AF4D582B4FFC017D07A947D841E392A821F754F3"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:BBB3E40054DF0B7BA6DD470D2FA561722D9EDBAC"},
&nameValue{Name: ContentLength, Value: "240"},
},
blockType: &warcFieldsBlock{},
Expand Down Expand Up @@ -408,6 +408,15 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
u := NewUnmarshaler(tt.opts...)
data := bufio.NewReader(strings.NewReader(tt.input))
gotRecord, gotOffset, validation, err := u.Unmarshal(data)
defer func() {
err := gotRecord.Close()
if tt.wantErr {
require.Error(err)
} else {
require.NoError(err)
}
}()

if tt.wantErr {
require.Error(err)
} else {
Expand All @@ -430,6 +439,14 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
assert.Equal(contentLength, len(content), "ContentLength")
assert.Equal(tt.want.content, string(content), "Content")
assert.Equal(tt.wantOffset, gotOffset, "Offset")

err = gotRecord.ValidateDigest(validation)
if tt.wantErr {
require.Error(err)
} else {
require.NoError(err)
}
assert.True(validation.Valid(), validation.String())
})
}
}

0 comments on commit 9cc7114

Please sign in to comment.