diff --git a/block.go b/block.go index 6b5017c..738202f 100644 --- a/block.go +++ b/block.go @@ -21,7 +21,6 @@ import ( "github.com/nlnwa/gowarc/internal/diskbuffer" "io" "io/ioutil" - "sync" ) // Block is the interface used to represent the content of a WARC record as specified by the WARC specification: @@ -49,91 +48,73 @@ type PayloadBlock interface { } type genericBlock struct { - opts *warcRecordOptions - rawBytes io.Reader - blockDigest *digest - digestOnce sync.Once - readOp readOp - cached bool + opts *warcRecordOptions + rawBytes io.Reader + blockDigest *digest + filterReader *digestFilterReader + blockDigestString string } func newGenericBlock(opts *warcRecordOptions, r io.Reader, d *digest) *genericBlock { - b := &genericBlock{opts: opts, rawBytes: r, blockDigest: d} - if _, ok := r.(io.Seeker); ok { - b.cached = true - } - return b + return &genericBlock{opts: opts, rawBytes: r, blockDigest: d} } func (block *genericBlock) IsCached() bool { - return block.cached + _, ok := block.rawBytes.(io.Seeker) + return ok } func (block *genericBlock) Cache() error { - if block.cached { + if block.IsCached() { return nil } - if block.readOp != opInitial { - return errContentReAccessed + + r, err := block.RawBytes() + if err != nil { + return err } + buf := diskbuffer.New(block.opts.bufferOptions...) 
- if _, err := buf.ReadFrom(block.rawBytes); err != nil { + if _, err := buf.ReadFrom(r); err != nil { return err } if c, ok := block.rawBytes.(io.Closer); ok { _ = c.Close() } + block.blockDigestString = block.blockDigest.format() block.rawBytes = buf - block.cached = true return nil } func (block *genericBlock) RawBytes() (io.Reader, error) { - if block.cached { - if _, err := block.rawBytes.(io.Seeker).Seek(0, io.SeekStart); err != nil { - return nil, err - } - return block.rawBytes, nil + if block.filterReader == nil { + block.filterReader = newDigestFilterReader(block.rawBytes, block.blockDigest) + return block.filterReader, nil + } + + if block.blockDigestString == "" { + block.BlockDigest() } - // Block is not cached. Guard against calling more than once - if block.readOp != opInitial { + if !block.IsCached() { return nil, errContentReAccessed } - block.readOp = opRawBytes - block.rawBytes = io.TeeReader(block.rawBytes, block.blockDigest) - return block.rawBytes, nil + if _, err := block.rawBytes.(io.Seeker).Seek(0, io.SeekStart); err != nil { + return nil, err + } + return newDigestFilterReader(block.rawBytes), nil } func (block *genericBlock) BlockDigest() string { - block.digestOnce.Do(func() { - if block.cached { - if _, err := block.rawBytes.(io.Seeker).Seek(0, io.SeekStart); err != nil { - panic(err) - } - block.blockDigest.Reset() - _, _ = io.Copy(block.blockDigest, block.rawBytes) - return - } - - if block.readOp == opInitial { - _, _ = block.RawBytes() + if block.blockDigestString == "" { + if block.filterReader == nil { + block.filterReader = newDigestFilterReader(block.rawBytes, block.blockDigest) } - block.readOp = opRawBytes - _, _ = io.Copy(ioutil.Discard, block.rawBytes) - }) - return block.blockDigest.format() + _, _ = io.Copy(ioutil.Discard, block.filterReader) + block.blockDigestString = block.blockDigest.format() + } + return block.blockDigestString } -// The readOp constants describe access to RawBytes() or PayloadBytes() on a 
PayloadBlock(), -// so that RawBytes and PayloadBytes() can check for invalid usage. -type readOp int8 - -const ( - opInitial readOp = 0 // Initial value. - opRawBytes readOp = 1 - opPayloadBytes readOp = 2 -) - var errContentReAccessed = errors.New("gowarc.Block: tried to access content twice") diff --git a/block_test.go b/block_test.go index ff3b192..7093fb2 100644 --- a/block_test.go +++ b/block_test.go @@ -380,7 +380,7 @@ func Test_httpRequestBlock_IsCached(t *testing.T) { { "diskbuffer.Buffer", func() io.Reader { d := diskbuffer.New(); _, _ = d.WriteString(content); return d }(), - false, + true, }, { "iotest.HalfReader", @@ -527,7 +527,7 @@ func Test_httpResponseBlock_IsCached(t *testing.T) { { "diskbuffer.Buffer", func() io.Reader { d := diskbuffer.New(); _, _ = d.WriteString(content); return d }(), - false, + true, }, { "iotest.HalfReader", diff --git a/digest.go b/digest.go index fa54864..04b15e5 100644 --- a/digest.go +++ b/digest.go @@ -23,6 +23,7 @@ import ( "crypto/sha512" "fmt" "hash" + "io" "strings" ) @@ -67,3 +68,32 @@ func newDigest(digestString string) (*digest, error) { return nil, fmt.Errorf("unsupported digest algorithm '%s'", algorithm) } } + +func newDigestFromField(wr *warcRecord, warcDigestField string) (d *digest, err error) { + if wr.WarcHeader().Has(warcDigestField) { + d, err = newDigest(wr.WarcHeader().Get(warcDigestField)) + } else { + d, err = newDigest(wr.opts.defaultDigestAlgorithm) + } + return +} + +type digestFilterReader struct { + src io.Reader + digests []*digest +} + +func newDigestFilterReader(src io.Reader, digests ...*digest) *digestFilterReader { + return &digestFilterReader{src: src, digests: digests} +} + +func (d digestFilterReader) Read(p []byte) (n int, err error) { + n, err = d.src.Read(p) + if n > 0 { + pp := p[:n] + for _, dd := range d.digests { + dd.Write(pp) + } + } + return +} diff --git a/httpblock.go b/httpblock.go index 9061cfe..b278f7a 100644 --- a/httpblock.go +++ b/httpblock.go @@ -43,94 +43,90 @@ type 
HttpResponseBlock interface { } type httpRequestBlock struct { - opts *warcRecordOptions - httpRequestLine string - httpHeader *http.Header - httpHeaderBytes []byte - payload io.Reader - blockDigest *digest - payloadDigest *digest - digestComputed bool - readOp readOp - parseHeaderOnce sync.Once - cached bool + opts *warcRecordOptions + httpRequestLine string + httpHeader *http.Header + httpHeaderBytes []byte + payload io.Reader + blockDigest *digest + payloadDigest *digest + filterReader *digestFilterReader + blockDigestString string + payloadDigestString string + parseHeaderOnce sync.Once } func (block *httpRequestBlock) IsCached() bool { - return block.cached + _, ok := block.payload.(io.Seeker) + return ok } func (block *httpRequestBlock) Cache() error { - if block.cached { + if block.IsCached() { return nil } - if block.readOp != opInitial { - return errContentReAccessed + + r, err := block.PayloadBytes() + if err != nil { + return err } + buf := diskbuffer.New(block.opts.bufferOptions...) - if _, err := buf.ReadFrom(block.payload); err != nil { + if _, err := buf.ReadFrom(r); err != nil { return err } if c, ok := block.payload.(io.Closer); ok { _ = c.Close() } - block.digestComputed = true + block.blockDigestString = block.blockDigest.format() + block.payloadDigestString = block.payloadDigest.format() block.payload = buf - block.cached = true return nil } func (block *httpRequestBlock) RawBytes() (io.Reader, error) { - if block.cached { - if _, err := block.payload.(io.Seeker).Seek(0, io.SeekStart); err != nil { - return nil, err - } - return io.MultiReader(bytes.NewReader(block.httpHeaderBytes), block.payload), nil - } - - // Block is not cached. 
Guard against calling more than once - if block.readOp != opInitial { - return nil, errContentReAccessed + r, err := block.PayloadBytes() + if err != nil { + return nil, err } - block.readOp = opRawBytes - return io.MultiReader(bytes.NewReader(block.httpHeaderBytes), block.payload), nil + return io.MultiReader(bytes.NewReader(block.httpHeaderBytes), r), nil } func (block *httpRequestBlock) BlockDigest() string { - block.readOp = opRawBytes - if !block.digestComputed { - if _, err := io.Copy(ioutil.Discard, block.payload); err != nil { - panic(err) + if block.blockDigestString == "" { + if block.filterReader == nil { + block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest) } - block.digestComputed = true + _, _ = io.Copy(ioutil.Discard, block.filterReader) + block.blockDigestString = block.blockDigest.format() + block.payloadDigestString = block.payloadDigest.format() } - return block.blockDigest.format() + return block.blockDigestString } func (block *httpRequestBlock) PayloadBytes() (io.Reader, error) { - if block.cached { - if _, err := block.payload.(io.Seeker).Seek(0, io.SeekStart); err != nil { - return nil, err - } - return block.payload, nil + if block.filterReader == nil { + block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest) + return block.filterReader, nil } - if block.readOp != opInitial { + if block.blockDigestString == "" { + block.BlockDigest() + } + + if !block.IsCached() { return nil, errContentReAccessed } - block.readOp = opPayloadBytes - return block.payload, nil + + if _, err := block.payload.(io.Seeker).Seek(0, io.SeekStart); err != nil { + return nil, err + } + return newDigestFilterReader(block.payload), nil } func (block *httpRequestBlock) PayloadDigest() string { - block.readOp = opRawBytes - if !block.digestComputed { - if _, err := io.Copy(ioutil.Discard, block.payload); err != nil { - panic(err) - } - block.digestComputed = true - } - return 
block.payloadDigest.format() + block.BlockDigest() + return block.payloadDigestString } func (block *httpRequestBlock) HttpHeaderBytes() []byte { @@ -179,95 +175,91 @@ func (block *httpRequestBlock) Write(w io.Writer) (int64, error) { } type httpResponseBlock struct { - opts *warcRecordOptions - httpStatusLine string - httpStatusCode int - httpHeader *http.Header - httpHeaderBytes []byte - payload io.Reader - blockDigest *digest - payloadDigest *digest - digestComputed bool - readOp readOp - parseHeaderOnce sync.Once - cached bool + opts *warcRecordOptions + httpStatusLine string + httpStatusCode int + httpHeader *http.Header + httpHeaderBytes []byte + payload io.Reader + blockDigest *digest + payloadDigest *digest + filterReader *digestFilterReader + blockDigestString string + payloadDigestString string + parseHeaderOnce sync.Once } func (block *httpResponseBlock) IsCached() bool { - return block.cached + _, ok := block.payload.(io.Seeker) + return ok } func (block *httpResponseBlock) Cache() error { - if block.cached { + if block.IsCached() { return nil } - if block.readOp != opInitial { - return errContentReAccessed + + r, err := block.PayloadBytes() + if err != nil { + return err } + buf := diskbuffer.New(block.opts.bufferOptions...) - if _, err := buf.ReadFrom(block.payload); err != nil { + if _, err := buf.ReadFrom(r); err != nil { return err } if c, ok := block.payload.(io.Closer); ok { _ = c.Close() } - block.digestComputed = true + block.blockDigestString = block.blockDigest.format() + block.payloadDigestString = block.payloadDigest.format() block.payload = buf - block.cached = true return nil } func (block *httpResponseBlock) RawBytes() (io.Reader, error) { - if block.cached { - if _, err := block.payload.(io.Seeker).Seek(0, io.SeekStart); err != nil { - return nil, err - } - return io.MultiReader(bytes.NewReader(block.httpHeaderBytes), block.payload), nil - } - - // Block is not cached. 
Guard against calling more than once - if block.readOp != opInitial { - return nil, errContentReAccessed + r, err := block.PayloadBytes() + if err != nil { + return nil, err } - block.readOp = opRawBytes - return io.MultiReader(bytes.NewReader(block.httpHeaderBytes), block.payload), nil + return io.MultiReader(bytes.NewReader(block.httpHeaderBytes), r), nil } func (block *httpResponseBlock) BlockDigest() string { - block.readOp = opRawBytes - if !block.digestComputed { - if _, err := io.Copy(ioutil.Discard, block.payload); err != nil { - panic(err) + if block.blockDigestString == "" { + if block.filterReader == nil { + block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest) } - block.digestComputed = true + _, _ = io.Copy(ioutil.Discard, block.filterReader) + block.blockDigestString = block.blockDigest.format() + block.payloadDigestString = block.payloadDigest.format() } - return block.blockDigest.format() + return block.blockDigestString } func (block *httpResponseBlock) PayloadBytes() (io.Reader, error) { - if block.cached { - if _, err := block.payload.(io.Seeker).Seek(0, io.SeekStart); err != nil { - return nil, err - } - return block.payload, nil + if block.filterReader == nil { + block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest) + return block.filterReader, nil + } + + if block.blockDigestString == "" { + block.BlockDigest() } - if block.readOp != opInitial { + if !block.IsCached() { return nil, errContentReAccessed } - block.readOp = opPayloadBytes - return block.payload, nil + + if _, err := block.payload.(io.Seeker).Seek(0, io.SeekStart); err != nil { + return nil, err + } + return newDigestFilterReader(block.payload), nil } func (block *httpResponseBlock) PayloadDigest() string { - block.readOp = opRawBytes - if !block.digestComputed { - if _, err := io.Copy(ioutil.Discard, block.payload); err != nil { - panic(err) - } - block.digestComputed = true - } - return 
block.payloadDigest.format() + block.BlockDigest() + return block.payloadDigestString } func (block *httpResponseBlock) HttpHeaderBytes() []byte { @@ -324,7 +316,7 @@ func (block *httpResponseBlock) Write(w io.Writer) (int64, error) { } // headerBytes reads the http-headers into a byte array. -func headerBytes(r *bufio.Reader) []byte { +func headerBytes(r buffer) []byte { result := bytes.Buffer{} for { line, err := r.ReadBytes('\n') @@ -339,8 +331,20 @@ func headerBytes(r *bufio.Reader) []byte { return result.Bytes() } +type buffer interface { + Read(p []byte) (n int, err error) + ReadBytes(delim byte) ([]byte, error) + Peek(n int) ([]byte, error) +} + func newHttpBlock(opts *warcRecordOptions, r io.Reader, blockDigest, payloadDigest *digest) (PayloadBlock, error) { - rb := bufio.NewReader(r) + var rb buffer + if v, ok := r.(diskbuffer.Buffer); ok { + rb = v + } else { + rb = bufio.NewReader(r) + } + b, err := rb.Peek(4) if err != nil { return nil, fmt.Errorf("not a http block: %w", err) @@ -351,7 +355,13 @@ func newHttpBlock(opts *warcRecordOptions, r io.Reader, blockDigest, payloadDige return nil, err } - payload := io.TeeReader(io.TeeReader(rb, blockDigest), payloadDigest) + var payload buffer + if _, ok := rb.(diskbuffer.Buffer); ok { + payload = rb.(diskbuffer.Buffer).Slice(int64(len(hb)), 0) + } else { + payload = rb + } + if bytes.HasPrefix(b, []byte("HTTP")) { resp := &httpResponseBlock{ opts: opts, diff --git a/marshaler.go b/marshaler.go index 12528e5..cab35a9 100644 --- a/marshaler.go +++ b/marshaler.go @@ -72,7 +72,6 @@ func (m *defaultMarshaler) writeRecord(w io.Writer, record WarcRecord) (int64, e return bytesWritten, err } bw, err = io.Copy(w, r) - //bw, err = record.Block().RawBytes().WriteTo(w) bytesWritten += bw if err != nil { return bytesWritten, err diff --git a/options.go b/options.go index 2645d33..4835314 100644 --- a/options.go +++ b/options.go @@ -29,6 +29,7 @@ type warcRecordOptions struct { addMissingDigest bool fixContentLength bool 
fixDigest bool + defaultDigestAlgorithm string bufferOptions []diskbuffer.Option } @@ -72,6 +73,7 @@ func defaultWarcRecordOptions() warcRecordOptions { addMissingRecordId: true, addMissingContentLength: true, addMissingDigest: true, + defaultDigestAlgorithm: "sha1", fixContentLength: true, fixDigest: true, } @@ -143,6 +145,15 @@ func WithAddMissingDigest(addMissingDigest bool) WarcRecordOption { }) } +// WithDefaultDigestAlgorithm sets which algorithm to use for digest generation. +// Valid values: 'md5', 'sha1', 'sha256' and 'sha512'. +// defaults to sha1 +func WithDefaultDigestAlgorithm(defaultDigestAlgorithm string) WarcRecordOption { + return newFuncWarcRecordOption(func(o *warcRecordOptions) { + o.defaultDigestAlgorithm = defaultDigestAlgorithm + }) +} + // WithFixContentLength sets if a ContentLength header with value which do not match the actual content length should be set to the real value. // This will not have any impact if SpecViolationPolicy is ErrIgnore // defaults to true diff --git a/record.go b/record.go index 877a8d5..01837d2 100644 --- a/record.go +++ b/record.go @@ -286,7 +286,7 @@ func (wr *warcRecord) Merge(record ...WarcRecord) (WarcRecord, error) { return nil, fmt.Errorf("merging of revisits is only implemented for http requests and responses") } if record[0].Block().IsCached() { - wr.headers.Set(WarcBlockDigest, "TODO") + wr.headers.Set(WarcBlockDigest, record[0].Block().BlockDigest()) } else { wr.headers.Delete(WarcBlockDigest) } @@ -294,15 +294,12 @@ func (wr *warcRecord) Merge(record ...WarcRecord) (WarcRecord, error) { return wr, nil } -func (wr *warcRecord) parseBlock(reader io.Reader, validation *Validation) (err error) { - d, _ := newDigest("sha1") - +func (wr *warcRecord) parseBlock(reader io.Reader, blockDigest, payloadDigest *digest, validation *Validation) (err error) { if !wr.opts.skipParseBlock { contentType := strings.ToLower(wr.headers.Get(ContentType)) if wr.recordType&(Response|Resource|Request|Conversion|Continuation) !=
0 { if strings.HasPrefix(contentType, ApplicationHttp) { - pd, _ := newDigest("sha1") - httpBlock, err := newHttpBlock(wr.opts, reader, d, pd) + httpBlock, err := newHttpBlock(wr.opts, reader, blockDigest, payloadDigest) if err != nil { return err } @@ -311,7 +308,7 @@ func (wr *warcRecord) parseBlock(reader io.Reader, validation *Validation) (err } } if wr.recordType == Revisit { - revisitBlock, err := parseRevisitBlock(wr.opts, reader, wr.headers.Get(WarcBlockDigest), wr.headers.Get(WarcPayloadDigest)) + revisitBlock, err := parseRevisitBlock(wr.opts, reader, blockDigest, wr.headers.Get(WarcPayloadDigest)) if err != nil { return err } @@ -319,7 +316,7 @@ func (wr *warcRecord) parseBlock(reader io.Reader, validation *Validation) (err return nil } if strings.HasPrefix(contentType, ApplicationWarcFields) { - warcFieldsBlock, err := newWarcFieldsBlock(reader, d, validation, wr.opts) + warcFieldsBlock, err := newWarcFieldsBlock(reader, blockDigest, validation, wr.opts) if err != nil { return err } @@ -328,6 +325,60 @@ func (wr *warcRecord) parseBlock(reader io.Reader, validation *Validation) (err } } - wr.block = newGenericBlock(wr.opts, reader, d) + wr.block = newGenericBlock(wr.opts, reader, blockDigest) return } + +func (wr *warcRecord) validateDigest(blockDigest, payloadDigest *digest, validation *Validation) error { + wr.Block().BlockDigest() + if blockDigest.hash == "" { + // Missing digest header is allowed, so skip validation. But if fixDigest option is set, a header will be added. 
+ if wr.opts.fixDigest { + wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format()) + return nil + } + } else { + if err := blockDigest.validate(); err != nil { + switch wr.opts.errSpec { + case ErrIgnore: + case ErrWarn: + validation.addError(err) + if wr.opts.fixDigest { + wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format()) + } + case ErrFail: + return fmt.Errorf("wrong block digest " + err.Error()) + } + } + } + + if wr.Type() == Revisit || wr.WarcHeader().Has(WarcSegmentNumber) { + // Can't check payload digest for revisit records or segmented records since the payload digest is a digest of + // the original record + return nil + } + + if _, ok := wr.block.(PayloadBlock); ok { + if payloadDigest.hash == "" { + // Missing digest header is allowed, so skip validation. But if fixDigest option is set, a header will be added. + if wr.opts.fixDigest { + wr.WarcHeader().Set(WarcPayloadDigest, payloadDigest.format()) + return nil + } + } else { + if err := payloadDigest.validate(); err != nil { + switch wr.opts.errSpec { + case ErrIgnore: + case ErrWarn: + validation.addError(err) + if wr.opts.fixDigest { + wr.WarcHeader().Set(WarcPayloadDigest, payloadDigest.format()) + } + case ErrFail: + return fmt.Errorf("wrong payload digest " + err.Error()) + } + } + } + } + return nil +} diff --git a/record_test.go b/record_test.go index fabbfe6..43a9e33 100644 --- a/record_test.go +++ b/record_test.go @@ -200,6 +200,7 @@ func Test_warcRecord_Merge(t *testing.T) { &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, &nameValue{Name: WarcRecordID, Value: ""}, &nameValue{Name: WarcType, Value: "response"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:6E9D6B234FEEBBF1AB618707217E577C3B83448A"}, &nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, &nameValue{Name: ContentLength, Value: "257"}, @@ -208,7 +209,7 @@ func Test_warcRecord_Merge(t 
*testing.T) { "Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" + "Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content", &httpResponseBlock{}, - false, + true, }, false, }, @@ -247,7 +248,7 @@ func Test_warcRecord_Merge(t *testing.T) { } func createRecord1(recordType RecordType, headers *WarcFields, data string) WarcRecord { - rb := NewRecordBuilder(recordType, WithSpecViolationPolicy(ErrFail), WithSyntaxErrorPolicy(ErrFail), WithUnknownRecordTypePolicy(ErrIgnore)) + rb := NewRecordBuilder(recordType, WithSpecViolationPolicy(ErrFail), WithSyntaxErrorPolicy(ErrFail), WithUnknownRecordTypePolicy(ErrIgnore), WithFixDigest(false)) for _, nv := range *headers { rb.AddWarcHeader(nv.Name, nv.Value) } diff --git a/recordbuilder.go b/recordbuilder.go index 2fc5325..02c37cf 100644 --- a/recordbuilder.go +++ b/recordbuilder.go @@ -106,7 +106,23 @@ func (rb *recordBuilder) Build() (WarcRecord, *Validation, error) { if err != nil { return wr, validation, err } - err = wr.parseBlock(rb.content, validation) + + blockDigest, err := newDigestFromField(wr, WarcBlockDigest) + if err != nil { + return wr, validation, err + } + payloadDigest, err := newDigestFromField(wr, WarcPayloadDigest) + if err != nil { + return wr, validation, err + } + + err = wr.parseBlock(rb.content, blockDigest, payloadDigest, validation) + if err != nil { + return wr, validation, err + } + + err = wr.validateDigest(blockDigest, payloadDigest, validation) + return wr, validation, err } @@ -135,27 +151,6 @@ func (rb *recordBuilder) validate(wr *warcRecord) (*Validation, error) { wr.headers.Set(ContentLength, size) } } - - d, err := newDigest(wr.WarcHeader().Get(WarcBlockDigest)) - if err != nil { - return validation, err - } - if _, err := io.Copy(d, rb.content); err != nil { - return validation, err - } - if err := d.validate(); err != nil { - switch rb.opts.errSpec { - case ErrIgnore: - case ErrWarn: - 
validation.addError(err) - if rb.opts.fixDigest { - wr.WarcHeader().Set(WarcBlockDigest, d.format()) - } - case ErrFail: - return validation, fmt.Errorf("wrong block digest " + err.Error()) - } - } - _, err = rb.content.Seek(0, io.SeekStart) return validation, err } diff --git a/recordbuilder_test.go b/recordbuilder_test.go index a987595..278a803 100644 --- a/recordbuilder_test.go +++ b/recordbuilder_test.go @@ -93,6 +93,7 @@ func TestRecordBuilder(t *testing.T) { &nameValue{Name: WarcRecordID, Value: ""}, &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, &nameValue{Name: WarcBlockDigest, Value: "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"}, + &nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, &nameValue{Name: ContentLength, Value: "257"}, }, "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" + @@ -105,6 +106,7 @@ func TestRecordBuilder(t *testing.T) { &nameValue{Name: WarcRecordID, Value: ""}, &nameValue{Name: WarcType, Value: "response"}, &nameValue{Name: WarcBlockDigest, Value: "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"}, + &nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, &nameValue{Name: ContentLength, Value: "257"}, }, @@ -113,14 +115,14 @@ func TestRecordBuilder(t *testing.T) { "Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" + "Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content", &Validation{}, - false, + true, }, false, }, { "valid request record", args{ - []WarcRecordOption{WithSpecViolationPolicy(ErrFail), WithSyntaxErrorPolicy(ErrFail), WithUnknownRecordTypePolicy(ErrIgnore)}, + []WarcRecordOption{WithSpecViolationPolicy(ErrFail), WithSyntaxErrorPolicy(ErrFail), WithUnknownRecordTypePolicy(ErrIgnore), WithFixDigest(false)}, Request, 
&WarcFields{ &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, @@ -153,7 +155,7 @@ func TestRecordBuilder(t *testing.T) { "Connection: close\n" + "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36\n", &Validation{}, - false, + true, }, false, }, @@ -422,19 +424,6 @@ func TestRecordBuilder(t *testing.T) { assert.Equal(tt.want.data, string(b)) assert.Equal(tt.want.cached, wr.Block().IsCached()) - - //if !reflect.DeepEqual(got, tt.want) { - // t.Errorf("NewResponseRecord() got = %v, want %v", got, tt.want) - //} - - //w := NewWriter(tt.args.opts) - //fmt.Printf(">>>>>>>>>>>>>>>>>>>>>\n") - //n, err := w.WriteRecord(os.Stdout, wr) - //fmt.Printf("<<<<<<<<<<<<<<<<<<<<<\n") - //fmt.Printf("Bytes written: %v, BlockType %T, Err: %v\n", n, wr.Block(), err) - // - //resp := wr.Block().(HttpResponseBlock).HttpHeader() - //fmt.Printf("Http header: %v, Err: %v\n", resp, err) }) } } diff --git a/revisitblock.go b/revisitblock.go index b5fa2dc..f9b0380 100644 --- a/revisitblock.go +++ b/revisitblock.go @@ -23,10 +23,10 @@ import ( ) type revisitBlock struct { - opts *warcRecordOptions - headerBytes []byte - blockDigest string - payloadDigest string + opts *warcRecordOptions + headerBytes []byte + blockDigestString string + payloadDigestString string } func (block *revisitBlock) IsCached() bool { @@ -46,11 +46,11 @@ func (block *revisitBlock) PayloadBytes() (io.Reader, error) { } func (block *revisitBlock) BlockDigest() string { - return block.blockDigest + return block.blockDigestString } func (block *revisitBlock) PayloadDigest() string { - return block.payloadDigest + return block.payloadDigestString } func (block *revisitBlock) Write(w io.Writer) (int64, error) { @@ -74,36 +74,38 @@ func newRevisitBlock(opts *warcRecordOptions, src Block) (*revisitBlock, error) switch v := src.(type) { case HttpRequestBlock: block.headerBytes = v.HttpHeaderBytes() - block.payloadDigest = 
v.PayloadDigest() + block.payloadDigestString = v.PayloadDigest() case HttpResponseBlock: block.headerBytes = v.HttpHeaderBytes() - block.payloadDigest = v.PayloadDigest() + block.payloadDigestString = v.PayloadDigest() default: return nil, fmt.Errorf("making revisit of %T not supported", v) } - blockDigest, _ := newDigest("sha1") + blockDigest, _ := newDigest(block.opts.defaultDigestAlgorithm) if _, err := blockDigest.Write(block.headerBytes); err != nil { return nil, err } - block.blockDigest = blockDigest.format() + block.blockDigestString = blockDigest.format() return block, nil } // parseRevisitBlock creates a new revisitBlock from a reader -func parseRevisitBlock(opts *warcRecordOptions, r io.Reader, blockDigest, payloadDigest string) (*revisitBlock, error) { +func parseRevisitBlock(opts *warcRecordOptions, r io.Reader, blockDigest *digest, payloadDigest string) (*revisitBlock, error) { block := &revisitBlock{ - opts: opts, - blockDigest: blockDigest, - payloadDigest: payloadDigest, + opts: opts, + payloadDigestString: payloadDigest, } content := &bytes.Buffer{} - if _, err := io.Copy(content, r); err != nil { + rr := io.TeeReader(r, blockDigest) + if _, err := io.Copy(content, rr); err != nil { return nil, err } block.headerBytes = content.Bytes() + block.blockDigestString = blockDigest.format() + return block, nil } diff --git a/unmarshaler.go b/unmarshaler.go index 0362de0..dbff672 100644 --- a/unmarshaler.go +++ b/unmarshaler.go @@ -151,7 +151,15 @@ func (u *unmarshaler) Unmarshal(b *bufio.Reader) (WarcRecord, int64, *Validation return err } - err = record.parseBlock(bufio.NewReader(c2), validation) + blockDigest, err := newDigestFromField(record, WarcBlockDigest) + if err != nil { + return record, offset, validation, err + } + payloadDigest, err := newDigestFromField(record, WarcPayloadDigest) + if err != nil { + return record, offset, validation, err + } + err = record.parseBlock(bufio.NewReader(c2), blockDigest, payloadDigest, validation) return 
record, offset, validation, err } diff --git a/warcfile_test.go b/warcfile_test.go index 72976a4..607ea77 100644 --- a/warcfile_test.go +++ b/warcfile_test.go @@ -384,7 +384,7 @@ const ( ) func createTestRecord() WarcRecord { - builder := NewRecordBuilder(Response) + builder := NewRecordBuilder(Response, WithFixDigest(false), WithStrictValidation()) _, err := builder.WriteString("HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" + "Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" + "Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content\n") @@ -395,7 +395,7 @@ func createTestRecord() WarcRecord { builder.AddWarcHeader(WarcDate, "2006-01-02T15:04:05Z") builder.AddWarcHeader(ContentLength, "258") builder.AddWarcHeader(ContentType, "application/http;msgtype=response") - builder.AddWarcHeader(WarcBlockDigest, "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4") + builder.AddWarcHeader(WarcBlockDigest, "sha1:7CBE117BFA2B22C3A02DEFF3BC04D5F912964A45") wr, _, err := builder.Build() if err != nil {