Skip to content

Commit

Permalink
Merge pull request #50 from nlnwa/eof_panic
Browse files Browse the repository at this point in the history
Validate content-length and avoid panic on unexpected EOF
  • Loading branch information
maeb authored Nov 24, 2021
2 parents 16f1eea + 981bef7 commit e60a399
Show file tree
Hide file tree
Showing 9 changed files with 122 additions and 26 deletions.
6 changes: 6 additions & 0 deletions block.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ type Block interface {
// RawBytes returns the bytes of the Block
RawBytes() (io.Reader, error)
BlockDigest() string
Size() int64
IsCached() bool
Cache() error
}
Expand Down Expand Up @@ -117,4 +118,9 @@ func (block *genericBlock) BlockDigest() string {
return block.blockDigestString
}

func (block *genericBlock) Size() int64 {
block.BlockDigest()
return block.blockDigest.count
}

var errContentReAccessed = errors.New("gowarc.Block: tried to access content twice")
29 changes: 22 additions & 7 deletions digest.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,23 @@ import (

type digest struct {
hash.Hash
name string
hash string
name string
hash string
count int64
}

// Write (via the embedded io.Writer interface) adds more data to the running hash.
// It never returns an error.
func (d *digest) Write(p []byte) (n int, err error) {
d.count += int64(len(p))
return d.Hash.Write(p)
}

// Sum appends the current hash to b and returns the resulting slice.
// It does not change the underlying hash state.
func (d *digest) Sum(b []byte) []byte {
d.count += int64(len(b))
return d.Hash.Sum(b)
}

func (d *digest) format() string {
Expand All @@ -55,15 +70,15 @@ func newDigest(digestString string) (*digest, error) {
}
switch algorithm {
case "md5":
return &digest{md5.New(), algorithm, hash}, nil
return &digest{md5.New(), algorithm, hash, 0}, nil
case "sha1":
return &digest{sha1.New(), algorithm, hash}, nil
return &digest{sha1.New(), algorithm, hash, 0}, nil
case "sha256":
return &digest{sha256.New(), algorithm, hash}, nil
return &digest{sha256.New(), algorithm, hash, 0}, nil
case "sha512":
return &digest{sha512.New(), algorithm, hash}, nil
return &digest{sha512.New(), algorithm, hash, 0}, nil
case "":
return &digest{sha1.New(), "sha1", hash}, nil
return &digest{sha1.New(), "sha1", hash, 0}, nil
default:
return nil, fmt.Errorf("unsupported digest algorithm '%s'", algorithm)
}
Expand Down
10 changes: 10 additions & 0 deletions httpblock.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,11 @@ func (block *httpRequestBlock) BlockDigest() string {
return block.blockDigestString
}

func (block *httpRequestBlock) Size() int64 {
block.BlockDigest()
return block.blockDigest.count
}

func (block *httpRequestBlock) PayloadBytes() (io.Reader, error) {
if block.filterReader == nil {
block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest)
Expand Down Expand Up @@ -237,6 +242,11 @@ func (block *httpResponseBlock) BlockDigest() string {
return block.blockDigestString
}

func (block *httpResponseBlock) Size() int64 {
block.BlockDigest()
return block.blockDigest.count
}

func (block *httpResponseBlock) PayloadBytes() (io.Reader, error) {
if block.filterReader == nil {
block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest)
Expand Down
38 changes: 38 additions & 0 deletions record.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,25 @@ type WarcRecord interface {
// RevisitRef extracts a RevisitRef current record if it is a revisit record.
RevisitRef() (*RevisitRef, error)
// CreateRevisitRef creates a RevisitRef which references the current record.
//
// The RevisitRef might be used by another records ToRevisitRecord to create a revisit record referencing this record.
CreateRevisitRef(profile string) (*RevisitRef, error)
// Merge merges this record with its referenced record(s)
//
// It is implemented only for revisit records, but this function will be enhanced to also support segmented records.
Merge(record ...WarcRecord) (WarcRecord, error)
// ValidateDigest validates block and payload digests if present.
//
// If option FixDigest is set, an invalid or missing digest will be corrected in the header.
// Digest validation requires the whole content block to be read. As a side effect the Content-Length field is also validated
// and if option FixContentLength is set, a wrong content length will be corrected in the header.
//
// If the record is not cached, it might not be possible to read any content from this record after validation.
//
// The result is dependent on the SpecViolationPolicy option:
// ErrIgnore: only fatal errors are returned.
// ErrWarn: all errors found will be added to the Validation.
// ErrFail: the first error is returned and no more validation is done.
ValidateDigest(validation *Validation) error
}

Expand Down Expand Up @@ -369,11 +383,35 @@ func (wr *warcRecord) parseBlock(reader io.Reader, validation *Validation) (err
}

// ValidateDigest validates block and payload digests if present.
//
// If option FixDigest is set, an invalid or missing digest will be corrected in the header.
// Digest validation requires the whole content block to be read. As a side effect the Content-Length field is also validated
// and if option FixContentLength is set, a wrong content length will be corrected in the header.
//
// If the record is not cached, it might not be possible to read any content from this record after validation.
//
// The result is dependent on the SpecViolationPolicy option:
// ErrIgnore: only fatal errors are returned.
// ErrWarn: all errors found will be added to the Validation.
// ErrFail: the first error is returned and no more validation is done.
func (wr *warcRecord) ValidateDigest(validation *Validation) error {
wr.Block().BlockDigest()

size := strconv.FormatInt(wr.block.Size(), 10)
if wr.opts.errSpec > ErrIgnore {
if wr.WarcHeader().Has(ContentLength) && size != wr.headers.Get(ContentLength) {
switch wr.opts.errSpec {
case ErrWarn:
validation.addError(fmt.Errorf("content length mismatch. header: %v, actual: %v", wr.headers.Get(ContentLength), size))
if wr.opts.fixContentLength {
wr.WarcHeader().Set(ContentLength, size)
}
case ErrFail:
return fmt.Errorf("content length mismatch. header: %v, actual: %v", wr.headers.Get(ContentLength), size)
}
}
}

var blockDigest, payloadDigest *digest
switch v := wr.Block().(type) {
case *genericBlock:
Expand Down
14 changes: 0 additions & 14 deletions recordbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
package gowarc

import (
"fmt"
"github.com/nlnwa/gowarc/internal/diskbuffer"
"io"
"strconv"
Expand Down Expand Up @@ -151,19 +150,6 @@ func (rb *recordBuilder) validate(wr *warcRecord) (*Validation, error) {
return validation, err
}

if rb.opts.errSpec > ErrIgnore {
if wr.WarcHeader().Has(ContentLength) && size != wr.headers.Get(ContentLength) {
switch rb.opts.errSpec {
case ErrWarn:
validation.addError(fmt.Errorf("content length mismatch. header: %v, actual: %v", wr.headers.Get(ContentLength), size))
if rb.opts.fixContentLength {
wr.WarcHeader().Set(ContentLength, size)
}
case ErrFail:
return validation, fmt.Errorf("content length mismatch. header: %v, actual: %v", wr.headers.Get(ContentLength), size)
}
}
}
return validation, err
}

Expand Down
4 changes: 4 additions & 0 deletions revisitblock.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ func (block *revisitBlock) PayloadDigest() string {
return block.payloadDigestString
}

func (block *revisitBlock) Size() int64 {
return int64(len(block.headerBytes))
}

func (block *revisitBlock) Write(w io.Writer) (int64, error) {
p, err := block.RawBytes()
if err != nil {
Expand Down
5 changes: 3 additions & 2 deletions unmarshaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,11 @@ func (u *unmarshaler) Unmarshal(b *bufio.Reader) (WarcRecord, int64, *Validation
_, err = u.gz.Read(b)
}
if err != io.EOF {
panic(err)
_ = u.gz.Close()
return err
}
if err := u.gz.Close(); err != nil {
panic(err)
return err
}
}
return err
Expand Down
38 changes: 35 additions & 3 deletions unmarshaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"io/ioutil"
"strconv"
"strings"
"testing"
)
Expand Down Expand Up @@ -529,6 +528,41 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
0,
true,
},
{
"short response record",
[]WarcRecordOption{WithSpecViolationPolicy(ErrFail), WithSyntaxErrorPolicy(ErrFail), WithUnknownRecordTypePolicy(ErrIgnore)},
"WARC/1.0\r\n" +
"WARC-Date: 2017-03-06T04:03:53Z\r\n" +
"WARC-Record-ID: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>\r\n" +
"WARC-Type: response\r\n" +
"Content-Type: application/http;msgtype=response\r\n" +
"Content-Length: 257\r\n" +
"\r\n" +
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the " +
"\r\n\r\n",
want{
V1_0,
Response,
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcType, Value: "response"},
&nameValue{Name: ContentType, Value: "application/http;msgtype=response"},
&nameValue{Name: ContentLength, Value: "257"},
},
&httpResponseBlock{},
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the " +
"\r\n\r\n",
&Validation{},
false,
},
0,
true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand All @@ -555,8 +589,6 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
content, err := ioutil.ReadAll(r)
assert.Nil(err)

contentLength, _ := strconv.Atoi(gotRecord.WarcHeader().Get(ContentLength))
assert.Equal(contentLength, len(content), "ContentLength")
assert.Equal(tt.want.content, string(content), "Content")
assert.Equal(tt.wantOffset, gotOffset, "Offset")

Expand Down
4 changes: 4 additions & 0 deletions warcfieldsblock.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ func (b *warcFieldsBlock) BlockDigest() string {
return b.blockDigest.format()
}

func (block *warcFieldsBlock) Size() int64 {
return int64(len(block.content))
}

func (b *warcFieldsBlock) Write(w io.Writer) (bytesWritten int64, err error) {
bytesWritten, err = b.warcFields.Write(w)
if err != nil {
Expand Down

0 comments on commit e60a399

Please sign in to comment.