Skip to content

Commit

Permalink
Merge pull request #48 from nlnwa/revisit
Browse files Browse the repository at this point in the history
Support making revisit records from resource records
  • Loading branch information
maeb authored Oct 22, 2021
2 parents eb23d67 + 0b88479 commit 24db9cc
Show file tree
Hide file tree
Showing 3 changed files with 202 additions and 4 deletions.
35 changes: 31 additions & 4 deletions record.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,15 @@ type WarcRecord interface {
Block() Block
String() string
io.Closer
// ToRevisitRecord takes RevisitRef referencing the record we want to make a revisit of and returns a revisit record.
ToRevisitRecord(ref *RevisitRef) (WarcRecord, error)
// RevisitRef extracts a RevisitRef from the current record if it is a revisit record.
RevisitRef() (*RevisitRef, error)
// CreateRevisitRef creates a RevisitRef which references the current record.
// The RevisitRef might be used by another record's ToRevisitRecord to create a revisit record referencing this record.
CreateRevisitRef(profile string) (*RevisitRef, error)
// Merge merges this record with its referenced record(s)
// It is implemented only for revisit records, but this function will be enhanced to also support segmented records.
Merge(record ...WarcRecord) (WarcRecord, error)
ValidateDigest(validation *Validation) error
}
Expand Down Expand Up @@ -191,7 +198,10 @@ func (wr *warcRecord) ToRevisitRecord(ref *RevisitRef) (WarcRecord, error) {
case ProfileIdenticalPayloadDigestV1_0:
fallthrough
case ProfileIdenticalPayloadDigestV1_1:
if !wr.headers.Has(WarcPayloadDigest) {
if !h.Has(WarcPayloadDigest) && wr.recordType == Resource && h.Has(WarcBlockDigest) {
h.Set(WarcPayloadDigest, h.Get(WarcBlockDigest))
}
if !h.Has(WarcPayloadDigest) {
return nil, fmt.Errorf("payload digest is required for Identical Payload Digest Profile")
}
case ProfileServerNotModifiedV1_0:
Expand All @@ -218,7 +228,6 @@ func (wr *warcRecord) ToRevisitRecord(ref *RevisitRef) (WarcRecord, error) {
return nil, err
}
h.Set(WarcBlockDigest, block.BlockDigest())
h.Set(WarcPayloadDigest, block.PayloadDigest())
h.Set(ContentLength, strconv.Itoa(len(block.headerBytes)))

revisit := &warcRecord{
Expand All @@ -244,6 +253,19 @@ func (wr *warcRecord) RevisitRef() (*RevisitRef, error) {
}, nil
}

// CreateRevisitRef returns a RevisitRef that points at this record, using the
// supplied profile.
//
// The reference is populated from this record's WARC-Record-ID,
// WARC-Target-URI and WARC-Date header values. The profile string is stored
// as given; it is not validated here.
//
// An error is returned when this record is itself a revisit record.
func (wr *warcRecord) CreateRevisitRef(profile string) (*RevisitRef, error) {
	// A revisit record may not be the target of another revisit.
	if wr.recordType == Revisit {
		return nil, fmt.Errorf("not allowed to reference a revisit record")
	}

	ref := new(RevisitRef)
	ref.Profile = profile
	ref.TargetRecordId = wr.headers.Get(WarcRecordID)
	ref.TargetUri = wr.headers.Get(WarcTargetURI)
	ref.TargetDate = wr.headers.Get(WarcDate)
	return ref, nil
}

func (wr *warcRecord) Merge(record ...WarcRecord) (WarcRecord, error) {
if wr.headers.Get(WarcSegmentNumber) == "1" {
return nil, fmt.Errorf("merging of segmentet records is not implemented")
Expand Down Expand Up @@ -356,6 +378,9 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error {
switch v := wr.Block().(type) {
case *genericBlock:
blockDigest = v.blockDigest
if wr.recordType == Resource {
payloadDigest = blockDigest
}
case *httpRequestBlock:
blockDigest = v.blockDigest
payloadDigest = v.payloadDigest
Expand All @@ -370,7 +395,7 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error {

if blockDigest != nil {
if blockDigest.hash == "" {
// Missing digest header is allowed, so skip validation. But if fixDigest option is set, a header will be added.
// Missing digest header is allowed, so skip validation. But if addMissingDigest option is set, a header will be added.
if wr.opts.addMissingDigest {
wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format())
}
Expand All @@ -381,6 +406,7 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error {
case ErrWarn:
validation.addError(fmt.Errorf("block: %w", err))
if wr.opts.fixDigest {
// Digest validation failed. But if fixDigest option is set, the calculated digest will be set.
wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format())
}
case ErrFail:
Expand All @@ -398,7 +424,7 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error {

if payloadDigest != nil {
if payloadDigest.hash == "" {
// Missing digest header is allowed, so skip validation. But if fixDigest option is set, a header will be added.
// Missing digest header is allowed, so skip validation. But if addMissingDigest option is set, a header will be added.
if wr.opts.addMissingDigest {
wr.WarcHeader().Set(WarcPayloadDigest, payloadDigest.format())
}
Expand All @@ -408,6 +434,7 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error {
case ErrIgnore:
case ErrWarn:
validation.addError(fmt.Errorf("payload: %w", err))
// Digest validation failed. But if fixDigest option is set, the calculated digest will be set.
if wr.opts.fixDigest {
wr.WarcHeader().Set(WarcPayloadDigest, payloadDigest.format())
}
Expand Down
170 changes: 170 additions & 0 deletions record_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ func Test_warcRecord_ToRevisitRecord(t *testing.T) {
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: ContentType, Value: "application/http;msgtype=response"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"},
&nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"},
&nameValue{Name: ContentLength, Value: "257"},
},
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n"+
Expand All @@ -68,6 +69,38 @@ func Test_warcRecord_ToRevisitRecord(t *testing.T) {
},
false,
},
{
"ServerNotModified profile missing payload digest\"",
createRecord1(Response,
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: ContentType, Value: "application/http;msgtype=response"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"},
&nameValue{Name: ContentLength, Value: "257"},
},
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n"+
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n"+
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content"),
&RevisitRef{Profile: ProfileServerNotModifiedV1_1, TargetRecordId: "targetId"},
want{
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcType, Value: "revisit"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:BF9D96D3F3F230CE8E2C6A3E5E1D51A81016B55E"},
&nameValue{Name: ContentType, Value: "application/http;msgtype=response"},
&nameValue{Name: ContentLength, Value: "238"},
&nameValue{Name: WarcProfile, Value: ProfileServerNotModifiedV1_1},
&nameValue{Name: WarcRefersTo, Value: "targetId"},
&nameValue{Name: WarcTruncated, Value: "length"},
},
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\n",
},
false,
},
{
"IdenticalPayloadDigest profile",
createRecord1(Response,
Expand Down Expand Up @@ -119,6 +152,80 @@ func Test_warcRecord_ToRevisitRecord(t *testing.T) {
want{},
true,
},
{
"IdenticalPayloadDigest profile resource record missing payload digest",
createRecord1(Resource,
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: ContentType, Value: "text/plain"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"},
&nameValue{Name: ContentLength, Value: "19"},
},
"This is the content"),
&RevisitRef{Profile: ProfileIdenticalPayloadDigestV1_1, TargetRecordId: "targetId"},
want{
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcType, Value: "revisit"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:DA39A3EE5E6B4B0D3255BFEF95601890AFD80709"},
&nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"},
&nameValue{Name: ContentType, Value: "text/plain"},
&nameValue{Name: ContentLength, Value: "0"},
&nameValue{Name: WarcProfile, Value: ProfileIdenticalPayloadDigestV1_1},
&nameValue{Name: WarcRefersTo, Value: "targetId"},
&nameValue{Name: WarcTruncated, Value: "length"},
},
"",
},
false,
},
{
"IdenticalPayloadDigest profile metadata record",
createRecord1(Resource,
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: ContentType, Value: "text/plain"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"},
&nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"},
&nameValue{Name: ContentLength, Value: "19"},
},
"This is the content"),
&RevisitRef{Profile: ProfileIdenticalPayloadDigestV1_1, TargetRecordId: "targetId"},
want{
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcType, Value: "revisit"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:DA39A3EE5E6B4B0D3255BFEF95601890AFD80709"},
&nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"},
&nameValue{Name: ContentType, Value: "text/plain"},
&nameValue{Name: ContentLength, Value: "0"},
&nameValue{Name: WarcProfile, Value: ProfileIdenticalPayloadDigestV1_1},
&nameValue{Name: WarcRefersTo, Value: "targetId"},
&nameValue{Name: WarcTruncated, Value: "length"},
},
"",
},
false,
},
{
"IdenticalPayloadDigest profile metadata record missing payload digest",
createRecord1(Metadata,
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: ContentType, Value: "text/plain"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"},
&nameValue{Name: ContentLength, Value: "19"},
},
"This is the content"),
&RevisitRef{Profile: ProfileIdenticalPayloadDigestV1_1, TargetRecordId: "targetId"},
want{},
true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down Expand Up @@ -170,6 +277,7 @@ func Test_warcRecord_Merge(t *testing.T) {
"ServerNotModified profile",
createRecord1(Revisit,
&WarcFields{
&nameValue{Name: WarcTargetURI, Value: "http://example.com"},
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: ContentType, Value: "application/http;msgtype=response"},
Expand All @@ -178,13 +286,68 @@ func Test_warcRecord_Merge(t *testing.T) {
&nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"},
&nameValue{Name: WarcProfile, Value: ProfileServerNotModifiedV1_1},
&nameValue{Name: WarcRefersTo, Value: "<urn:uuid:fff0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcRefersToTargetURI, Value: "http://example.com"},
&nameValue{Name: WarcRefersToDate, Value: "2016-09-19T18:03:53Z"},
&nameValue{Name: WarcTruncated, Value: "length"},
},
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n"+
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n"+
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\n"),
[]WarcRecord{createRecord1(Response,
&WarcFields{
&nameValue{Name: WarcTargetURI, Value: "http://example.com"},
&nameValue{Name: WarcDate, Value: "2016-09-19T18:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:fff0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: ContentType, Value: "application/http;msgtype=response"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:6E9D6B234FEEBBF1AB618707217E577C3B83448A"},
&nameValue{Name: ContentLength, Value: "236"},
},
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n"+
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02fff\"\n"+
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content")},
want{
Response,
&WarcFields{
&nameValue{Name: WarcTargetURI, Value: "http://example.com"},
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcType, Value: "response"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:6E9D6B234FEEBBF1AB618707217E577C3B83448A"},
&nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"},
&nameValue{Name: ContentType, Value: "application/http;msgtype=response"},
&nameValue{Name: ContentLength, Value: "257"},
},
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content",
&httpResponseBlock{},
true,
},
false,
},
{
"IdenticalPayloadDigest profile",
createRecord1(Revisit,
&WarcFields{
&nameValue{Name: WarcTargetURI, Value: "http://foo.com"},
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: ContentType, Value: "application/http;msgtype=response"},
&nameValue{Name: ContentLength, Value: "238"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:BF9D96D3F3F230CE8E2C6A3E5E1D51A81016B55E"},
&nameValue{Name: WarcPayloadDigest, Value: "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"},
&nameValue{Name: WarcProfile, Value: ProfileIdenticalPayloadDigestV1_1},
&nameValue{Name: WarcRefersTo, Value: "<urn:uuid:fff0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcRefersToTargetURI, Value: "http://example.com"},
&nameValue{Name: WarcRefersToDate, Value: "2016-09-19T18:03:53Z"},
&nameValue{Name: WarcTruncated, Value: "length"},
},
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n"+
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n"+
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\n"),
[]WarcRecord{createRecord1(Response,
&WarcFields{
&nameValue{Name: WarcTargetURI, Value: "http://example.com"},
&nameValue{Name: WarcDate, Value: "2016-09-19T18:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:fff0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: ContentType, Value: "application/http;msgtype=response"},
Expand All @@ -197,6 +360,7 @@ func Test_warcRecord_Merge(t *testing.T) {
want{
Response,
&WarcFields{
&nameValue{Name: WarcTargetURI, Value: "http://foo.com"},
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcType, Value: "response"},
Expand Down Expand Up @@ -224,6 +388,12 @@ func Test_warcRecord_Merge(t *testing.T) {
}
}()

rr, err := tt.revisitRecord.RevisitRef()
assert.NoError(err)
revisitRef, err := tt.referencedRecord[0].CreateRevisitRef(rr.Profile)
assert.NoError(err)
assert.Equal(revisitRef, rr)

got, err := tt.revisitRecord.Merge(tt.referencedRecord...)
if tt.wantErr {
assert.Error(err)
Expand Down
1 change: 1 addition & 0 deletions revisitblock.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ func newRevisitBlock(opts *warcRecordOptions, src Block) (*revisitBlock, error)
case HttpResponseBlock:
block.headerBytes = v.HttpHeaderBytes()
block.payloadDigestString = v.PayloadDigest()
case *genericBlock:
default:
return nil, fmt.Errorf("making revisit of %T not supported", v)
}
Expand Down

0 comments on commit 24db9cc

Please sign in to comment.