Skip to content

Commit

Permalink
* Expose WarcVersion struct
Browse files Browse the repository at this point in the history
* Add WARC-Warcinfo-ID header to records when a warcinfo record is generated
  • Loading branch information
johnerikhalse committed Sep 2, 2021
1 parent dfc676c commit 246889b
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 46 deletions.
24 changes: 12 additions & 12 deletions headerfielddef.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ const (
)

// validateHeader validates a WarcFields object as a WARC-record header
func validateHeader(wf *WarcFields, version *version, validation *Validation, opts *warcRecordOptions) (RecordType, error) {
func validateHeader(wf *WarcFields, version *WarcVersion, validation *Validation, opts *warcRecordOptions) (RecordType, error) {
rt, err := resolveRecordType(wf, validation, opts)
if err != nil {
return rt, err
Expand Down Expand Up @@ -156,7 +156,7 @@ var requiredFields = []string{WarcRecordID, ContentLength, WarcDate, WarcType}

type fieldDef struct {
name string
validationFunc func(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (validatedValue string, err error)
validationFunc func(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (validatedValue string, err error)
repeatable bool
supportedRec RecordType
supportedSpec uint8
Expand Down Expand Up @@ -249,16 +249,16 @@ func normalizeName(name string) (string, fieldDef) {
}

var (
// pUnknown performs no validation: it returns value unchanged and never returns an error.
pUnknown = func(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (string, error) {
pUnknown = func(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (string, error) {
return value, nil
}
// pString passes the field through checkLegal. If checkLegal returns an error, an empty string
// and that error are returned; otherwise value is returned unchanged.
pString = func(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (string, error) {
pString = func(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (string, error) {
if err := checkLegal(opts, name, value, version, recordType, def); err != nil {
return "", err
}
return value, nil
}
pTime = func(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (string, error) {
pTime = func(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (string, error) {
if err := checkLegal(opts, name, value, version, recordType, def); err != nil {
return "", err
}
Expand All @@ -267,7 +267,7 @@ var (
}
return value, nil
}
pWarcType = func(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (string, error) {
pWarcType = func(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (string, error) {
//if value != wr.RecordType.String() {
// return "", fmt.Errorf("not allowed to change record type")
//}
Expand All @@ -276,7 +276,7 @@ var (
}
return value, nil
}
pWarcId = func(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (string, error) {
pWarcId = func(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (string, error) {
if err := checkLegal(opts, name, value, version, recordType, def); err != nil {
return "", err
}
Expand All @@ -287,7 +287,7 @@ var (
//}
//return v, nil
}
pInt = func(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (string, error) {
pInt = func(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (string, error) {
if err := checkLegal(opts, name, value, version, recordType, def); err != nil {
return "", err
}
Expand All @@ -296,7 +296,7 @@ var (
}
return value, nil
}
pLong = func(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (string, error) {
pLong = func(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (string, error) {
if err := checkLegal(opts, name, value, version, recordType, def); err != nil {
return "", err
}
Expand All @@ -305,22 +305,22 @@ var (
}
return value, nil
}
// pDigest passes the field through checkLegal. If checkLegal returns an error, an empty string
// and that error are returned; otherwise value is returned unchanged.
// The digest value itself is not verified yet (see TODO below).
pDigest = func(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (string, error) {
pDigest = func(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (string, error) {
if err := checkLegal(opts, name, value, version, recordType, def); err != nil {
return "", err
}
// TODO: Check Digest
return value, nil
}
// pTruncReason passes the field through checkLegal. If checkLegal returns an error, an empty string
// and that error are returned; otherwise value is returned unchanged.
// No check of the reason value itself is done here.
pTruncReason = func(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (string, error) {
pTruncReason = func(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (string, error) {
if err := checkLegal(opts, name, value, version, recordType, def); err != nil {
return "", err
}
return value, nil
}
)

func checkLegal(opts *warcRecordOptions, name, value string, version *version, recordType RecordType, def fieldDef) (err error) {
func checkLegal(opts *warcRecordOptions, name, value string, version *WarcVersion, recordType RecordType, def fieldDef) (err error) {
// All fields are allowed for unknown record types
if recordType == 0 {
return
Expand Down
4 changes: 2 additions & 2 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package gowarc
import "github.com/nlnwa/gowarc/internal/diskbuffer"

type warcRecordOptions struct {
warcVersion *version
warcVersion *WarcVersion
errSyntax errorPolicy
errSpec errorPolicy
errUnknowRecordType errorPolicy
Expand Down Expand Up @@ -90,7 +90,7 @@ func newOptions(opts ...WarcRecordOption) *warcRecordOptions {

// WithVersion sets the WARC version to use for new records.
// Defaults to WARC/1.1.
func WithVersion(version *version) WarcRecordOption {
func WithVersion(version *WarcVersion) WarcRecordOption {
return newFuncWarcRecordOption(func(o *warcRecordOptions) {
o.warcVersion = version
})
Expand Down
18 changes: 9 additions & 9 deletions record.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ const (
)

type WarcRecord interface {
Version() *version
Version() *WarcVersion
Type() RecordType
WarcHeader() *WarcFields
Block() Block
Expand All @@ -45,29 +45,29 @@ type WarcRecord interface {
Merge(record ...WarcRecord) (WarcRecord, error)
}

// WarcVersion describes a WARC format version.
// Predefined values are V1_0 and V1_1.
type version struct {
type WarcVersion struct {
// id is a numeric identifier (1 for V1_0, 2 for V1_1).
// NOTE(review): how id is consumed is not visible here — confirm before relying on it.
id uint8
// txt is the version number as text (e.g. "1.0"); String prefixes it with "WARC/".
txt string
// major is the value returned by Major.
major uint8
// minor is the value returned by Minor.
minor uint8
}

// String returns "WARC/" followed by the version text, e.g. "WARC/1.1".
func (v *version) String() string {
func (v *WarcVersion) String() string {
return "WARC/" + v.txt
}

// Major returns the major component of the version (1 for both V1_0 and V1_1).
func (v *version) Major() uint8 {
func (v *WarcVersion) Major() uint8 {
return v.major
}

// Minor returns the minor component of the version (0 for V1_0, 1 for V1_1).
func (v *version) Minor() uint8 {
func (v *WarcVersion) Minor() uint8 {
return v.minor
}

var (
// Predefined WARC versions, exposed as package-level pointers.
V1_0 = &version{id: 1, txt: "1.0", major: 1, minor: 0} // WARC 1.0
V1_1 = &version{id: 2, txt: "1.1", major: 1, minor: 1} // WARC 1.1
V1_0 = &WarcVersion{id: 1, txt: "1.0", major: 1, minor: 0} // WARC 1.0
V1_1 = &WarcVersion{id: 2, txt: "1.1", major: 1, minor: 1} // WARC 1.1
)

type RecordType uint16
Expand Down Expand Up @@ -151,14 +151,14 @@ const (

// warcRecord holds the state of a single WARC record: options, version, header fields,
// record type and block.
// NOTE(review): presumably the concrete implementation of the WarcRecord interface — only
// the Version and Type methods are visible here; confirm against the rest of the file.
type warcRecord struct {
opts *warcRecordOptions
// version is the value returned by Version.
version *version
version *WarcVersion
headers *WarcFields
// recordType is the value returned by Type.
recordType RecordType
block Block
// NOTE(review): closer looks like a cleanup hook; where it is invoked is not visible here.
closer func() error
}

// Version returns the WARC version stored on the record.
func (wr *warcRecord) Version() *version { return wr.version }
func (wr *warcRecord) Version() *WarcVersion { return wr.version }

// Type reports the record type stored on the record.
func (wr *warcRecord) Type() RecordType {
	return wr.recordType
}

Expand Down
2 changes: 1 addition & 1 deletion recordbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ type WarcRecordBuilder interface {

type recordBuilder struct {
opts *warcRecordOptions
version *version
version *WarcVersion
headers *WarcFields
recordType RecordType
content diskbuffer.Buffer
Expand Down
6 changes: 3 additions & 3 deletions unmarshaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ func (u *unmarshaler) Unmarshal(b *bufio.Reader) (WarcRecord, int64, *Validation
return record, offset, validation, err
}

func (u *unmarshaler) resolveRecordVersion(s string, validation *Validation) (*version, error) {
func (u *unmarshaler) resolveRecordVersion(s string, validation *Validation) (*WarcVersion, error) {
switch s {
case V1_0.txt:
return V1_0, nil
Expand All @@ -174,11 +174,11 @@ func (u *unmarshaler) resolveRecordVersion(s string, validation *Validation) (*v
switch u.opts.errSpec {
case ErrWarn:
validation.addError(fmt.Errorf("unsupported WARC version: %v", s))
return &version{txt: s}, nil
return &WarcVersion{txt: s}, nil
case ErrFail:
return nil, fmt.Errorf("unsupported WARC version: %v", s)
default:
return &version{txt: s}, nil
return &WarcVersion{txt: s}, nil
}
}
}
2 changes: 1 addition & 1 deletion unmarshaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import (

func Test_unmarshaler_Unmarshal(t *testing.T) {
type want struct {
version *version
version *WarcVersion
recordType RecordType
headers *WarcFields
blockType interface{}
Expand Down
23 changes: 16 additions & 7 deletions warcfile.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,12 @@ func (w *WarcFileWriter) Shutdown() error {
}

// singleWarcFileWriter tracks the WARC file currently being written: its name, file handle
// and size, plus the ID of the warcinfo record generated for it.
type singleWarcFileWriter struct {
opts *warcFileWriterOptions
currentFileName string
currentFile *os.File
currentFileSize int64
writeLock sync.Mutex
opts *warcFileWriterOptions
currentFileName string
currentFile *os.File
currentFileSize int64
// currentWarcInfoId is the WARC-Record-ID of the warcinfo record written by createWarcInfoRecord.
// It is cleared before the warcinfo record itself is written. While it is non-empty, writeRecord
// sets it as the WARC-Warcinfo-ID header on every record it writes.
currentWarcInfoId string
// NOTE(review): presumably guards concurrent writes; its lock/unlock sites are not visible here.
writeLock sync.Mutex
}

func (w *singleWarcFileWriter) Write(record WarcRecord) (int64, string, int64, error) {
Expand Down Expand Up @@ -240,7 +241,9 @@ func (w *singleWarcFileWriter) writeRecord(writer io.Writer, record WarcRecord,
defer func() { _ = gz.Close() }()
writer = gz
}

if w.currentWarcInfoId != "" {
record.WarcHeader().Set(WarcWarcinfoID, w.currentWarcInfoId)
}
nextRec, size, err := w.opts.marshaler.Marshal(writer, record, maxRecordSize)
if err != nil {
return size, err
Expand All @@ -267,10 +270,12 @@ func (w *singleWarcFileWriter) createWarcInfoRecord(fileName string) (int64, err
if err != nil {
return 0, err
}
w.currentWarcInfoId = ""
n, err := w.writeRecord(w.currentFile, warcinfo, 0)
if err != nil {
return 0, err
}
w.currentWarcInfoId = warcinfo.WarcHeader().Get(WarcRecordID)
// sync file to reduce possibility of half written records in case of crash
if err := w.currentFile.Sync(); err != nil {
return 0, err
Expand Down Expand Up @@ -491,7 +496,11 @@ func WithExpectedCompressionRatio(ratio float64) WarcFileWriterOption {
// WithWarcInfoFunc sets a warcinfo-record generator function to be called for every new WARC-file created.
// The function receives a WarcRecordBuilder which is prepopulated with WARC-Record-ID, WARC-Type, WARC-Date and Content-Type.
// After the submitted function returns, Content-Length and WARC-Block-Digest fields are calculated.
// defaults nil
//
// When this option is set, records written to the WARC file automatically get a WARC-Warcinfo-ID header
// that points to the generated warcinfo record.
//
// defaults nil (no generation of warcinfo record)
func WithWarcInfoFunc(f func(recordBuilder WarcRecordBuilder) error) WarcFileWriterOption {
return newFuncWarcFileOption(func(o *warcFileWriterOptions) {
o.warcInfoFunc = f
Expand Down
24 changes: 13 additions & 11 deletions warcfile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ import (
"time"
)

// Expected sizes, in bytes, used by the writer tests when asserting the sizes and offsets
// reported for the test record. The "WithWarcInfoId" variants are used in the tests where
// a warcinfo record is generated, so the record also carries a WARC-Warcinfo-ID header.
const (
	uncompressedRecordSize               = int64(529)
	compressedRecordSize                 = int64(392)
	uncompressedRecordWithWarcInfoIdSize = int64(596)
	compressedRecordWithWarcInfoIdSize   = int64(428)
)

func TestWarcFileWriter_Write_uncompressed(t *testing.T) {
now = func() time.Time {
return time.Date(2001, 9, 12, 5, 30, 20, 0, time.UTC)
Expand Down Expand Up @@ -121,14 +128,14 @@ func TestWarcFileWriter_Write_warcinfo_uncompressed(t *testing.T) {
uncompressedWarcinfoSize := int64(316)
offset, fileName, size, err := w.Write(createTestRecord())
assert.NoError(err)
assert.Equalf(uncompressedRecordSize, size, "Expected size from writer %d, but was %d", uncompressedRecordSize, size)
assert.Equalf(uncompressedRecordWithWarcInfoIdSize, size, "Expected size from writer %d, but was %d", uncompressedRecordWithWarcInfoIdSize, size)
assert.Equalf(uncompressedWarcinfoSize, offset, "Expected offset from writer %d, but was %d", uncompressedWarcinfoSize, offset)
assert.Equal("foo-20010912053020-0001-10.10.10.10.warc", fileName)

offset, fileName, size, err = w.Write(createTestRecord())
assert.NoError(err)
assert.Equalf(uncompressedRecordSize, size, "Expected size from writer %d, but was %d", uncompressedRecordSize, size)
assert.Equalf(uncompressedWarcinfoSize+uncompressedRecordSize, offset, "Expected offset from writer %d, but was %d", uncompressedWarcinfoSize+uncompressedRecordSize, offset)
assert.Equalf(uncompressedRecordWithWarcInfoIdSize, size, "Expected size from writer %d, but was %d", uncompressedRecordWithWarcInfoIdSize, size)
assert.Equalf(uncompressedWarcinfoSize+uncompressedRecordWithWarcInfoIdSize, offset, "Expected offset from writer %d, but was %d", uncompressedWarcinfoSize+uncompressedRecordWithWarcInfoIdSize, offset)
assert.Equal("foo-20010912053020-0001-10.10.10.10.warc", fileName)

// Close writer
Expand Down Expand Up @@ -161,14 +168,14 @@ func TestWarcFileWriter_Write_warcinfo_compressed(t *testing.T) {
compressedWarcinfoSize := int64(257)
offset, fileName, size, err := w.Write(createTestRecord())
assert.NoError(err)
assert.Equalf(uncompressedRecordSize, size, "Expected size from writer %d, but was %d", uncompressedRecordSize, size)
assert.Equalf(uncompressedRecordWithWarcInfoIdSize, size, "Expected size from writer %d, but was %d", uncompressedRecordWithWarcInfoIdSize, size)
assert.Equalf(compressedWarcinfoSize, offset, "Expected offset from writer %d, but was %d", compressedWarcinfoSize, offset)
assert.Equal("foo-20010912053020-0001-10.10.10.10.warc.gz", fileName)

offset, fileName, size, err = w.Write(createTestRecord())
assert.NoError(err)
assert.Equalf(uncompressedRecordSize, size, "Expected size from writer %d, but was %d", uncompressedRecordSize, size)
assert.Equalf(compressedWarcinfoSize+compressedRecordSize, offset, "Expected offset from writer %d, but was %d", compressedWarcinfoSize+compressedRecordSize, offset)
assert.Equalf(uncompressedRecordWithWarcInfoIdSize, size, "Expected size from writer %d, but was %d", uncompressedRecordWithWarcInfoIdSize, size)
assert.Equalf(compressedWarcinfoSize+compressedRecordWithWarcInfoIdSize, offset, "Expected offset from writer %d, but was %d", compressedWarcinfoSize+compressedRecordWithWarcInfoIdSize, offset)
assert.Equal("foo-20010912053020-0001-10.10.10.10.warc.gz", fileName)

// Close writer
Expand Down Expand Up @@ -378,11 +385,6 @@ func TestWarcFileWriter_Write(t *testing.T) {
}
}

// Expected sizes, in bytes, of the test record as reported by the writer tests,
// without and with compression.
const (
	uncompressedRecordSize = int64(529)
	compressedRecordSize   = int64(392)
)

func createTestRecord() WarcRecord {
builder := NewRecordBuilder(Response, WithFixDigest(false), WithStrictValidation())
_, err := builder.WriteString("HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
Expand Down

0 comments on commit 246889b

Please sign in to comment.