From 0b88479880afb00f54517da37697270ff4878e91 Mon Sep 17 00:00:00 2001 From: johnerikhalse Date: Fri, 22 Oct 2021 12:32:40 +0200 Subject: [PATCH] * Support making revisit records from resource records * Add utility method for creating a revisitRef pointing to the current record --- record.go | 35 ++++++++-- record_test.go | 170 ++++++++++++++++++++++++++++++++++++++++++++++++ revisitblock.go | 1 + 3 files changed, 202 insertions(+), 4 deletions(-) diff --git a/record.go b/record.go index 6ea51f7..8cdcc93 100644 --- a/record.go +++ b/record.go @@ -40,8 +40,15 @@ type WarcRecord interface { Block() Block String() string io.Closer + // ToRevisitRecord takes a RevisitRef referencing the record we want to make a revisit of and returns a revisit record. ToRevisitRecord(ref *RevisitRef) (WarcRecord, error) + // RevisitRef extracts a RevisitRef from the current record if it is a revisit record. RevisitRef() (*RevisitRef, error) + // CreateRevisitRef creates a RevisitRef which references the current record. + // The RevisitRef might be used by another record's ToRevisitRecord to create a revisit record referencing this record. + CreateRevisitRef(profile string) (*RevisitRef, error) + // Merge merges this record with its referenced record(s). + // It is implemented only for revisit records, but this function will be enhanced to also support segmented records. 
Merge(record ...WarcRecord) (WarcRecord, error) ValidateDigest(validation *Validation) error } @@ -191,7 +198,10 @@ func (wr *warcRecord) ToRevisitRecord(ref *RevisitRef) (WarcRecord, error) { case ProfileIdenticalPayloadDigestV1_0: fallthrough case ProfileIdenticalPayloadDigestV1_1: - if !wr.headers.Has(WarcPayloadDigest) { + if !h.Has(WarcPayloadDigest) && wr.recordType == Resource && h.Has(WarcBlockDigest) { + h.Set(WarcPayloadDigest, h.Get(WarcBlockDigest)) + } + if !h.Has(WarcPayloadDigest) { return nil, fmt.Errorf("payload digest is required for Identical Payload Digest Profile") } case ProfileServerNotModifiedV1_0: @@ -218,7 +228,6 @@ func (wr *warcRecord) ToRevisitRecord(ref *RevisitRef) (WarcRecord, error) { return nil, err } h.Set(WarcBlockDigest, block.BlockDigest()) - h.Set(WarcPayloadDigest, block.PayloadDigest()) h.Set(ContentLength, strconv.Itoa(len(block.headerBytes))) revisit := &warcRecord{ @@ -244,6 +253,19 @@ func (wr *warcRecord) RevisitRef() (*RevisitRef, error) { }, nil } +func (wr *warcRecord) CreateRevisitRef(profile string) (*RevisitRef, error) { + if wr.recordType == Revisit { + return nil, fmt.Errorf("not allowed to reference a revisit record") + } + + return &RevisitRef{ + Profile: profile, + TargetRecordId: wr.headers.Get(WarcRecordID), + TargetUri: wr.headers.Get(WarcTargetURI), + TargetDate: wr.headers.Get(WarcDate), + }, nil +} + func (wr *warcRecord) Merge(record ...WarcRecord) (WarcRecord, error) { if wr.headers.Get(WarcSegmentNumber) == "1" { return nil, fmt.Errorf("merging of segmentet records is not implemented") @@ -356,6 +378,9 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error { switch v := wr.Block().(type) { case *genericBlock: blockDigest = v.blockDigest + if wr.recordType == Resource { + payloadDigest = blockDigest + } case *httpRequestBlock: blockDigest = v.blockDigest payloadDigest = v.payloadDigest @@ -370,7 +395,7 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error { if 
blockDigest != nil { if blockDigest.hash == "" { - // Missing digest header is allowed, so skip validation. But if fixDigest option is set, a header will be added. + // Missing digest header is allowed, so skip validation. But if addMissingDigest option is set, a header will be added. if wr.opts.addMissingDigest { wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format()) } @@ -381,6 +406,7 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error { case ErrWarn: validation.addError(fmt.Errorf("block: %w", err)) if wr.opts.fixDigest { + // Digest validation failed. But if fixDigest option is set, the calculated digest will be set. wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format()) } case ErrFail: @@ -398,7 +424,7 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error { if payloadDigest != nil { if payloadDigest.hash == "" { - // Missing digest header is allowed, so skip validation. But if fixDigest option is set, a header will be added. + // Missing digest header is allowed, so skip validation. But if addMissingDigest option is set, a header will be added. if wr.opts.addMissingDigest { wr.WarcHeader().Set(WarcPayloadDigest, payloadDigest.format()) } @@ -408,6 +434,7 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error { case ErrIgnore: case ErrWarn: validation.addError(fmt.Errorf("payload: %w", err)) + // Digest validation failed. But if fixDigest option is set, the calculated digest will be set. 
if wr.opts.fixDigest { wr.WarcHeader().Set(WarcPayloadDigest, payloadDigest.format()) } diff --git a/record_test.go b/record_test.go index 5885e09..8a3cdba 100644 --- a/record_test.go +++ b/record_test.go @@ -43,6 +43,7 @@ func Test_warcRecord_ToRevisitRecord(t *testing.T) { &nameValue{Name: WarcRecordID, Value: ""}, &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, &nameValue{Name: WarcBlockDigest, Value: "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"}, + &nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, &nameValue{Name: ContentLength, Value: "257"}, }, "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n"+ @@ -68,6 +69,38 @@ func Test_warcRecord_ToRevisitRecord(t *testing.T) { }, false, }, + { + "ServerNotModified profile missing payload digest\"", + createRecord1(Response, + &WarcFields{ + &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"}, + &nameValue{Name: ContentLength, Value: "257"}, + }, + "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n"+ + "Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n"+ + "Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content"), + &RevisitRef{Profile: ProfileServerNotModifiedV1_1, TargetRecordId: "targetId"}, + want{ + &WarcFields{ + &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: WarcType, Value: "revisit"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:BF9D96D3F3F230CE8E2C6A3E5E1D51A81016B55E"}, + &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, + &nameValue{Name: ContentLength, Value: 
"238"}, + &nameValue{Name: WarcProfile, Value: ProfileServerNotModifiedV1_1}, + &nameValue{Name: WarcRefersTo, Value: "targetId"}, + &nameValue{Name: WarcTruncated, Value: "length"}, + }, + "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" + + "Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" + + "Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\n", + }, + false, + }, { "IdenticalPayloadDigest profile", createRecord1(Response, @@ -119,6 +152,80 @@ func Test_warcRecord_ToRevisitRecord(t *testing.T) { want{}, true, }, + { + "IdenticalPayloadDigest profile resource record missing payload digest", + createRecord1(Resource, + &WarcFields{ + &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: ContentType, Value: "text/plain"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, + &nameValue{Name: ContentLength, Value: "19"}, + }, + "This is the content"), + &RevisitRef{Profile: ProfileIdenticalPayloadDigestV1_1, TargetRecordId: "targetId"}, + want{ + &WarcFields{ + &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: WarcType, Value: "revisit"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:DA39A3EE5E6B4B0D3255BFEF95601890AFD80709"}, + &nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, + &nameValue{Name: ContentType, Value: "text/plain"}, + &nameValue{Name: ContentLength, Value: "0"}, + &nameValue{Name: WarcProfile, Value: ProfileIdenticalPayloadDigestV1_1}, + &nameValue{Name: WarcRefersTo, Value: "targetId"}, + &nameValue{Name: WarcTruncated, Value: "length"}, + }, + "", + }, + false, + }, + { + "IdenticalPayloadDigest profile metadata record", + createRecord1(Resource, + &WarcFields{ + &nameValue{Name: WarcDate, Value: 
"2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: ContentType, Value: "text/plain"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, + &nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, + &nameValue{Name: ContentLength, Value: "19"}, + }, + "This is the content"), + &RevisitRef{Profile: ProfileIdenticalPayloadDigestV1_1, TargetRecordId: "targetId"}, + want{ + &WarcFields{ + &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: WarcType, Value: "revisit"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:DA39A3EE5E6B4B0D3255BFEF95601890AFD80709"}, + &nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, + &nameValue{Name: ContentType, Value: "text/plain"}, + &nameValue{Name: ContentLength, Value: "0"}, + &nameValue{Name: WarcProfile, Value: ProfileIdenticalPayloadDigestV1_1}, + &nameValue{Name: WarcRefersTo, Value: "targetId"}, + &nameValue{Name: WarcTruncated, Value: "length"}, + }, + "", + }, + false, + }, + { + "IdenticalPayloadDigest profile metadata record missing payload digest", + createRecord1(Metadata, + &WarcFields{ + &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: ContentType, Value: "text/plain"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, + &nameValue{Name: ContentLength, Value: "19"}, + }, + "This is the content"), + &RevisitRef{Profile: ProfileIdenticalPayloadDigestV1_1, TargetRecordId: "targetId"}, + want{}, + true, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -170,6 +277,7 @@ func Test_warcRecord_Merge(t *testing.T) { "ServerNotModified profile", createRecord1(Revisit, &WarcFields{ + &nameValue{Name: WarcTargetURI, Value: "http://example.com"}, &nameValue{Name: WarcDate, 
Value: "2017-03-06T04:03:53Z"}, &nameValue{Name: WarcRecordID, Value: ""}, &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, @@ -178,6 +286,60 @@ func Test_warcRecord_Merge(t *testing.T) { &nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, &nameValue{Name: WarcProfile, Value: ProfileServerNotModifiedV1_1}, &nameValue{Name: WarcRefersTo, Value: ""}, + &nameValue{Name: WarcRefersToTargetURI, Value: "http://example.com"}, + &nameValue{Name: WarcRefersToDate, Value: "2016-09-19T18:03:53Z"}, + &nameValue{Name: WarcTruncated, Value: "length"}, + }, + "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n"+ + "Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n"+ + "Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\n"), + []WarcRecord{createRecord1(Response, + &WarcFields{ + &nameValue{Name: WarcTargetURI, Value: "http://example.com"}, + &nameValue{Name: WarcDate, Value: "2016-09-19T18:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:6E9D6B234FEEBBF1AB618707217E577C3B83448A"}, + &nameValue{Name: ContentLength, Value: "236"}, + }, + "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n"+ + "Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02fff\"\n"+ + "Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content")}, + want{ + Response, + &WarcFields{ + &nameValue{Name: WarcTargetURI, Value: "http://example.com"}, + &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: WarcType, Value: "response"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:6E9D6B234FEEBBF1AB618707217E577C3B83448A"}, + &nameValue{Name: WarcPayloadDigest, Value: 
"sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, + &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, + &nameValue{Name: ContentLength, Value: "257"}, + }, + "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" + + "Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" + + "Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content", + &httpResponseBlock{}, + true, + }, + false, + }, + { + "IdenticalPayloadDigest profile", + createRecord1(Revisit, + &WarcFields{ + &nameValue{Name: WarcTargetURI, Value: "http://foo.com"}, + &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, + &nameValue{Name: ContentLength, Value: "238"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:BF9D96D3F3F230CE8E2C6A3E5E1D51A81016B55E"}, + &nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"}, + &nameValue{Name: WarcProfile, Value: ProfileIdenticalPayloadDigestV1_1}, + &nameValue{Name: WarcRefersTo, Value: ""}, + &nameValue{Name: WarcRefersToTargetURI, Value: "http://example.com"}, + &nameValue{Name: WarcRefersToDate, Value: "2016-09-19T18:03:53Z"}, &nameValue{Name: WarcTruncated, Value: "length"}, }, "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n"+ @@ -185,6 +347,7 @@ func Test_warcRecord_Merge(t *testing.T) { "Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\n"), []WarcRecord{createRecord1(Response, &WarcFields{ + &nameValue{Name: WarcTargetURI, Value: "http://example.com"}, &nameValue{Name: WarcDate, Value: "2016-09-19T18:03:53Z"}, &nameValue{Name: WarcRecordID, Value: ""}, &nameValue{Name: ContentType, Value: "application/http;msgtype=response"}, @@ -197,6 +360,7 @@ func Test_warcRecord_Merge(t *testing.T) { want{ Response, 
&WarcFields{ + &nameValue{Name: WarcTargetURI, Value: "http://foo.com"}, &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, &nameValue{Name: WarcRecordID, Value: ""}, &nameValue{Name: WarcType, Value: "response"}, @@ -224,6 +388,12 @@ func Test_warcRecord_Merge(t *testing.T) { } }() + rr, err := tt.revisitRecord.RevisitRef() + assert.NoError(err) + revisitRef, err := tt.referencedRecord[0].CreateRevisitRef(rr.Profile) + assert.NoError(err) + assert.Equal(revisitRef, rr) + got, err := tt.revisitRecord.Merge(tt.referencedRecord...) if tt.wantErr { assert.Error(err) diff --git a/revisitblock.go b/revisitblock.go index 7cf6bb2..f264be7 100644 --- a/revisitblock.go +++ b/revisitblock.go @@ -79,6 +79,7 @@ func newRevisitBlock(opts *warcRecordOptions, src Block) (*revisitBlock, error) case HttpResponseBlock: block.headerBytes = v.HttpHeaderBytes() block.payloadDigestString = v.PayloadDigest() + case *genericBlock: default: return nil, fmt.Errorf("making revisit of %T not supported", v) }