From 3c8ec727477ce04926eabf11f9629996c2fe0c30 Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Thu, 7 Nov 2024 20:16:32 -0700 Subject: [PATCH] Finish improvements (probably) --- .github/workflows/macos-latest.yml | 2 +- .github/workflows/ubuntu-latest.yml | 2 +- .github/workflows/windows-latest.yml | 2 +- 7z.go | 8 +- README.md | 16 +- archiver.go | 2 +- archiver_test.go | 3 +- brotli.go | 19 +- bz2.go | 7 +- formats.go | 117 +++-- formats_test.go | 111 ++-- fs.go | 724 ++++++++++++--------------- fs_test.go | 17 +- go.mod | 14 +- go.sum | 26 +- gz.go | 7 +- interfaces.go | 20 +- lz4.go | 7 +- lzip.go | 7 +- rar.go | 8 +- sz.go | 7 +- tar.go | 14 +- xz.go | 7 +- zip.go | 8 +- zlib.go | 7 +- zstd.go | 7 +- 26 files changed, 568 insertions(+), 601 deletions(-) diff --git a/.github/workflows/macos-latest.yml b/.github/workflows/macos-latest.yml index 70b0aa57..5f2bdf3d 100644 --- a/.github/workflows/macos-latest.yml +++ b/.github/workflows/macos-latest.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - go-version: [1.22] + go-version: [1.23] runs-on: macos-latest steps: - name: Install Go diff --git a/.github/workflows/ubuntu-latest.yml b/.github/workflows/ubuntu-latest.yml index 0502d0d8..d25b72d2 100644 --- a/.github/workflows/ubuntu-latest.yml +++ b/.github/workflows/ubuntu-latest.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - go-version: [1.22] + go-version: [1.23] runs-on: ubuntu-latest steps: - name: Install Go diff --git a/.github/workflows/windows-latest.yml b/.github/workflows/windows-latest.yml index d27e28d9..b53e3eed 100644 --- a/.github/workflows/windows-latest.yml +++ b/.github/workflows/windows-latest.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - go-version: [1.22] + go-version: [1.23] runs-on: windows-latest steps: - name: Install Go diff --git a/7z.go b/7z.go index 61e07570..4a3dbd4a 100644 --- a/7z.go +++ b/7z.go @@ -31,13 +31,13 @@ type SevenZip struct { Password string } -func (z SevenZip) Name() string { return ".7z" } +func (z SevenZip) Extension() 
string { return ".7z" } -func (z SevenZip) Match(filename string, stream io.Reader) (MatchResult, error) { +func (z SevenZip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), z.Name()) { + if strings.Contains(strings.ToLower(filename), z.Extension()) { mr.ByName = true } @@ -104,7 +104,7 @@ func (z SevenZip) Extract(ctx context.Context, sourceArchive io.Reader, pathsInA if err != nil { return nil, err } - return archivedFile{openedFile, fi}, nil + return fileInArchive{openedFile, fi}, nil }, } diff --git a/README.md b/README.md index 2d7dcf7c..9b1cdc78 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # archiver [![Go Reference](https://pkg.go.dev/badge/github.com/mholt/archiver/v4.svg)](https://pkg.go.dev/github.com/mholt/archiver/v4) [![Ubuntu-latest](https://github.com/mholt/archiver/actions/workflows/ubuntu-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/ubuntu-latest.yml) [![Macos-latest](https://github.com/mholt/archiver/actions/workflows/macos-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/macos-latest.yml) [![Windows-latest](https://github.com/mholt/archiver/actions/workflows/windows-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/windows-latest.yml) -Introducing **Archiver 4.0** - a cross-platform, multi-format archive utility and Go library. A powerful and flexible library meets an elegant CLI in this generic replacement for several platform-specific or format-specific archive utilities. +Introducing **Archiver 4.0 (alpha)** - a cross-platform, multi-format archive utility and Go library. A powerful and flexible library meets an elegant CLI in this generic replacement for several platform-specific or format-specific archive utilities. **:warning: v4 is in ALPHA. 
The core library APIs work pretty well but the command has not been implemented yet, nor have most automated tests. If you need the `arc` command, stick with v3 for now.** @@ -11,8 +11,8 @@ Introducing **Archiver 4.0** - a cross-platform, multi-format archive utility an - By file name - By header - Traverse directories, archive files, and any other file uniformly as [`io/fs`](https://pkg.go.dev/io/fs) file systems: - - [`DirFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#DirFS) - [`FileFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#FileFS) + - [`DirFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#DirFS) - [`ArchiveFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#ArchiveFS) - Compress and decompress files - Create and extract archive files @@ -117,7 +117,7 @@ If you want all the files, pass in a nil list of file paths. ```go // the type that will be used to read the input stream -format := archiver.Zip{} +var format archiver.Zip // the list of files we want out of the archive; any // directories will include all their contents unless @@ -141,7 +141,7 @@ if err != nil { Have an input stream with unknown contents? No problem, archiver can identify it for you. It will try matching based on filename and/or the header (which peeks at the stream): ```go -format, input, err := archiver.Identify("filename.tar.zst", input) +format, input, err := archiver.Identify(ctx, "filename.tar.zst", input) if err != nil { return err } @@ -165,7 +165,7 @@ if decom, ok := format.(archiver.Decompressor); ok { } ``` -`Identify()` works by reading an arbitrary number of bytes from the beginning of the stream (just enough to check for file headers). It buffers them and returns a new reader that lets you re-read them anew. +`Identify()` works by reading an arbitrary number of bytes from the beginning of the stream (just enough to check for file headers). It buffers them and returns a new reader that lets you re-read them anew. 
If your input stream is `io.Seeker` however, no buffer is created (it uses `Seek()` instead). ### Virtual file systems @@ -212,7 +212,7 @@ if dir, ok := f.(fs.ReadDirFile); ok { return err } for _, e := range entries { - fmt.Println(e.Name()) + fmt.Println(e.Name()) } } ``` @@ -225,7 +225,7 @@ if err != nil { return err } for _, e := range entries { - fmt.Println(e.Name()) + fmt.Println(e.Name()) } ``` @@ -247,6 +247,8 @@ if err != nil { } ``` +**Important .tar note:** Tar files do not efficiently implement file system semantics due to their roots in sequential-access design for tapes. File systems inherently assume random access, but tar files need to be read from the beginning to access something at the end. This is especially slow when the archive is compressed. Optimizations have been implemented to amortize `ReadDir()` calls so that `fs.WalkDir()` only has to scan the archive once, but they use more memory. Open calls require another scan to find the file. It may be more efficient to use `Tar.Extract()` directly if file system semantics are not important to you. + #### Use with `http.FileServer` It can be used with http.FileServer to browse archives and directories in a browser. However, due to how http.FileServer works, don't directly use http.FileServer with compressed files; instead wrap it like following: diff --git a/archiver.go b/archiver.go index cc74af47..7e68f30d 100644 --- a/archiver.go +++ b/archiver.go @@ -219,7 +219,7 @@ type FromDiskOptions struct { // memory, and skipping lots of directories may run up your memory bill. // // Any other returned error will terminate a walk and be returned to the caller. -type FileHandler func(ctx context.Context, f FileInfo) error +type FileHandler func(ctx context.Context, info FileInfo) error // openAndCopyFile opens file for reading, copies its // contents to w, then closes file. 
diff --git a/archiver_test.go b/archiver_test.go index e4355540..0cf8124c 100644 --- a/archiver_test.go +++ b/archiver_test.go @@ -245,7 +245,8 @@ func TestNameOnDiskToNameInArchive(t *testing.T) { }, } { if !strings.HasPrefix(tc.nameOnDisk, tc.rootOnDisk) { - t.Fatalf("Test %d: Invalid test case! Filename (on disk) will have rootOnDisk as a prefix according to the fs.WalkDirFunc godoc.", i) + t.Errorf("Test %d: Invalid test case! Filename (on disk) will have rootOnDisk as a prefix according to the fs.WalkDirFunc godoc.", i) + continue } if tc.windows && runtime.GOOS != "windows" { t.Logf("Test %d: Skipping test that is only compatible with Windows", i) diff --git a/brotli.go b/brotli.go index 5d17fae7..c650f40e 100644 --- a/brotli.go +++ b/brotli.go @@ -1,6 +1,7 @@ package archiver import ( + "context" "io" "strings" @@ -16,19 +17,25 @@ type Brotli struct { Quality int } -func (Brotli) Name() string { return ".br" } +func (Brotli) Extension() string { return ".br" } -func (br Brotli) Match(filename string, stream io.Reader) (MatchResult, error) { +func (br Brotli) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), br.Name()) { + if strings.Contains(strings.ToLower(filename), br.Extension()) { mr.ByName = true } - // brotli does not have well-defined file headers; the - // best way to match the stream would be to try decoding - // part of it, and this is not implemented for now + // brotli does not have well-defined file headers or a magic number; + // the best way to match the stream is probably to try decoding part + // of it, but we'll just have to guess a large-enough size that is + // still small enough for the smallest streams we'll encounter + r := brotli.NewReader(stream) + buf := make([]byte, 16) + if _, err := io.ReadFull(r, buf); err == nil { + mr.ByStream = true + } return mr, nil } diff --git a/bz2.go b/bz2.go index 57a278f4..a2a5f05e 
100644 --- a/bz2.go +++ b/bz2.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -17,13 +18,13 @@ type Bz2 struct { CompressionLevel int } -func (Bz2) Name() string { return ".bz2" } +func (Bz2) Extension() string { return ".bz2" } -func (bz Bz2) Match(filename string, stream io.Reader) (MatchResult, error) { +func (bz Bz2) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), bz.Name()) { + if strings.Contains(strings.ToLower(filename), bz.Extension()) { mr.ByName = true } diff --git a/formats.go b/formats.go index 2ec41935..24865fea 100644 --- a/formats.go +++ b/formats.go @@ -12,7 +12,7 @@ import ( // RegisterFormat registers a format. It should be called during init. // Duplicate formats by name are not allowed and will panic. func RegisterFormat(format Format) { - name := strings.Trim(strings.ToLower(format.Name()), ".") + name := strings.Trim(strings.ToLower(format.Extension()), ".") if _, ok := formats[name]; ok { panic("format " + name + " is already registered") } @@ -32,14 +32,21 @@ func RegisterFormat(format Format) { // // If stream is non-nil then the returned io.Reader will always be // non-nil and will read from the same point as the reader which was -// passed in; it should be used in place of the input stream after +// passed in. If the input stream is not an io.Seeker, the returned +// io.Reader value should be used in place of the input stream after // calling Identify() because it preserves and re-reads the bytes that // were already read during the identification process. -func Identify(filename string, stream io.Reader) (Format, io.Reader, error) { +// +// If the input stream is an io.Seeker, Seek() must work, and the +// original input value will be returned instead of a wrapper value. 
+func Identify(ctx context.Context, filename string, stream io.Reader) (Format, io.Reader, error) { var compression Compression var archival Archival - rewindableStream := newRewindReader(stream) + rewindableStream, err := newRewindReader(stream) + if err != nil { + return nil, nil, err + } // try compression format first, since that's the outer "layer" for name, format := range formats { @@ -48,7 +55,7 @@ func Identify(filename string, stream io.Reader) (Format, io.Reader, error) { continue } - matchResult, err := identifyOne(format, filename, rewindableStream, nil) + matchResult, err := identifyOne(ctx, format, filename, rewindableStream, nil) if err != nil { return nil, rewindableStream.reader(), fmt.Errorf("matching %s: %w", name, err) } @@ -68,7 +75,7 @@ func Identify(filename string, stream io.Reader) (Format, io.Reader, error) { continue } - matchResult, err := identifyOne(format, filename, rewindableStream, compression) + matchResult, err := identifyOne(ctx, format, filename, rewindableStream, compression) if err != nil { return nil, rewindableStream.reader(), fmt.Errorf("matching %s: %w", name, err) } @@ -89,13 +96,17 @@ func Identify(filename string, stream io.Reader) (Format, io.Reader, error) { case compression != nil && archival != nil: return CompressedArchive{compression, archival}, bufferedStream, nil default: - return nil, bufferedStream, ErrNoMatch + return nil, bufferedStream, NoMatch } } -func identifyOne(format Format, filename string, stream *rewindReader, comp Compression) (mr MatchResult, err error) { +func identifyOne(ctx context.Context, format Format, filename string, stream *rewindReader, comp Compression) (mr MatchResult, err error) { defer stream.rewind() + if filename == "." 
{ + filename = "" + } + // if looking within a compressed format, wrap the stream in a // reader that can decompress it so we can match the "inner" format // (yes, we have to make a new reader every time we do a match, @@ -107,14 +118,14 @@ func identifyOne(format Format, filename string, stream *rewindReader, comp Comp return MatchResult{}, openErr } defer decompressedStream.Close() - mr, err = format.Match(filename, decompressedStream) + mr, err = format.Match(ctx, filename, decompressedStream) } else { // Make sure we pass a nil io.Reader not a *rewindReader(nil) var r io.Reader if stream != nil { r = stream } - mr, err = format.Match(filename, r) + mr, err = format.Match(ctx, filename, r) } // if the error is EOF, we can just ignore it. @@ -168,26 +179,26 @@ type CompressedArchive struct { // Name returns a concatenation of the archive format name // and the compression format name. -func (caf CompressedArchive) Name() string { +func (caf CompressedArchive) Extension() string { if caf.Compression == nil && caf.Archival == nil { panic("missing both compression and archive formats") } var name string if caf.Archival != nil { - name += caf.Archival.Name() + name += caf.Archival.Extension() } if caf.Compression != nil { - name += caf.Compression.Name() + name += caf.Compression.Extension() } return name } // Match matches if the input matches both the compression and archive format. 
-func (caf CompressedArchive) Match(filename string, stream io.Reader) (MatchResult, error) { +func (caf CompressedArchive) Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error) { var conglomerate MatchResult if caf.Compression != nil { - matchResult, err := caf.Compression.Match(filename, stream) + matchResult, err := caf.Compression.Match(ctx, filename, stream) if err != nil { return MatchResult{}, err } @@ -208,7 +219,7 @@ func (caf CompressedArchive) Match(filename string, stream io.Reader) (MatchResu } if caf.Archival != nil { - matchResult, err := caf.Archival.Match(filename, stream) + matchResult, err := caf.Archival.Match(ctx, filename, stream) if err != nil { return MatchResult{}, err } @@ -239,7 +250,7 @@ func (caf CompressedArchive) Archive(ctx context.Context, output io.Writer, file func (caf CompressedArchive) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) error { do, ok := caf.Archival.(ArchiverAsync) if !ok { - return fmt.Errorf("%s archive does not support async writing", caf.Name()) + return fmt.Errorf("%s archive does not support async writing", caf.Extension()) } if caf.Compression != nil { wc, err := caf.Compression.OpenWriter(output) @@ -253,27 +264,13 @@ func (caf CompressedArchive) ArchiveAsync(ctx context.Context, output io.Writer, } // Extract reads files out of an archive while decompressing the results. -// If Extract is not called from ArchiveFS.Open, then the FileHandler passed -// in must close all opened files by the time the Extract walk finishes. func (caf CompressedArchive) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchive []string, handleFile FileHandler) error { if caf.Compression != nil { rc, err := caf.Compression.OpenReader(sourceArchive) if err != nil { return err } - // I don't like this solution, but we have to close the decompressor. 
- // The problem is that if we simply defer rc.Close(), we potentially - // close it before the caller is done using files it opened. Ideally - // it should be closed when the sourceArchive is also closed. But since - // we don't originate sourceArchive, we can't close it when it closes. - // The best I can think of for now is this hack where we tell a type - // that supports this to close another reader when itself closes. - // See issue #365. - if cc, ok := sourceArchive.(compressorCloser); ok { - cc.closeCompressor(rc) - } else { - defer rc.Close() - } + defer rc.Close() sourceArchive = rc } return caf.Archival.Extract(ctx, sourceArchive, pathsInArchive, handleFile) @@ -299,26 +296,42 @@ func (mr MatchResult) Matched() bool { return mr.ByName || mr.ByStream } // read from the stream. This is useful for "peeking" a stream an // arbitrary number of bytes. Loosely based on the Connection type // from https://github.com/mholt/caddy-l4. +// +// If the reader is also an io.Seeker, no buffer is used, and instead +// the stream seeks back to the starting position. 
type rewindReader struct { io.Reader + start int64 buf *bytes.Buffer bufReader io.Reader } -func newRewindReader(r io.Reader) *rewindReader { +func newRewindReader(r io.Reader) (*rewindReader, error) { if r == nil { - return nil + return nil, nil } - return &rewindReader{ - Reader: r, - buf: new(bytes.Buffer), + + rr := &rewindReader{Reader: r} + + // avoid buffering if we have a seeker we can use + if seeker, ok := r.(io.Seeker); ok { + var err error + rr.start, err = seeker.Seek(0, io.SeekCurrent) + if err != nil { + return nil, fmt.Errorf("seek to determine current position: %w", err) + } + } else { + rr.buf = new(bytes.Buffer) } + + return rr, nil } func (rr *rewindReader) Read(p []byte) (n int, err error) { if rr == nil { - panic("internal error: reading from nil rewindReader") + panic("reading from nil rewindReader") } + // if there is a buffer we should read from, start // with that; we only read from the underlying stream // after the buffer has been "depleted" @@ -333,13 +346,13 @@ func (rr *rewindReader) Read(p []byte) (n int, err error) { } } - // buffer has been "depleted" so read from - // underlying connection + // buffer has been depleted or we are not using one, + // so read from underlying stream nr, err := rr.Reader.Read(p[n:]) // anything that was read needs to be written to - // the buffer, even if there was an error - if nr > 0 { + // the buffer (if used), even if there was an error + if nr > 0 && rr.buf != nil { if nw, errw := rr.buf.Write(p[n : n+nr]); errw != nil { return nw, errw } @@ -355,18 +368,24 @@ func (rr *rewindReader) Read(p []byte) (n int, err error) { // rewind resets the stream to the beginning by causing // Read() to start reading from the beginning of the -// buffered bytes. +// stream, or, if buffering, the buffered bytes. 
func (rr *rewindReader) rewind() { if rr == nil { return } + if ras, ok := rr.Reader.(io.Seeker); ok { + if _, err := ras.Seek(rr.start, io.SeekStart); err == nil { + return + } + } rr.bufReader = bytes.NewReader(rr.buf.Bytes()) } // reader returns a reader that reads first from the buffered -// bytes, then from the underlying stream. After calling this, -// no more rewinding is allowed since reads from the stream are -// not recorded, so rewinding properly is impossible. +// bytes (if buffering), then from the underlying stream; if a +// Seeker, the stream will be seeked back to the start. After +// calling this, no more rewinding is allowed since reads from +// the stream are not recorded, so rewinding properly is impossible. // If the underlying reader implements io.Seeker, then the // underlying reader will be used directly. func (rr *rewindReader) reader() io.Reader { @@ -374,15 +393,15 @@ func (rr *rewindReader) reader() io.Reader { return nil } if ras, ok := rr.Reader.(io.Seeker); ok { - if _, err := ras.Seek(0, io.SeekStart); err == nil { + if _, err := ras.Seek(rr.start, io.SeekStart); err == nil { return rr.Reader } } return io.MultiReader(bytes.NewReader(rr.buf.Bytes()), rr.Reader) } -// ErrNoMatch is returned if there are no matching formats. -var ErrNoMatch = fmt.Errorf("no formats matched") +// NoMatch is a special error returned if there are no matching formats. +var NoMatch = fmt.Errorf("no formats matched") // Registered formats. 
var formats = make(map[string]Format) diff --git a/formats_test.go b/formats_test.go index e89d5617..6c8d621f 100644 --- a/formats_test.go +++ b/formats_test.go @@ -16,7 +16,10 @@ import ( func TestRewindReader(t *testing.T) { data := "the header\nthe body\n" - r := newRewindReader(strings.NewReader(data)) + r, err := newRewindReader(strings.NewReader(data)) + if err != nil { + t.Errorf("creating rewindReader: %v", err) + } buf := make([]byte, 10) // enough for 'the header' @@ -25,10 +28,10 @@ func TestRewindReader(t *testing.T) { r.rewind() n, err := r.Read(buf) if err != nil { - t.Fatalf("Read failed: %s", err) + t.Errorf("Read failed: %s", err) } if string(buf[:n]) != "the header" { - t.Fatalf("iteration %d: expected 'the header' but got '%s' (n=%d)", i, string(buf[:n]), n) + t.Errorf("iteration %d: expected 'the header' but got '%s' (n=%d)", i, string(buf[:n]), n) } } @@ -38,10 +41,10 @@ func TestRewindReader(t *testing.T) { buf = make([]byte, len(data)) n, err := io.ReadFull(finalReader, buf) if err != nil { - t.Fatalf("ReadFull failed: %s (n=%d)", err, n) + t.Errorf("ReadFull failed: %s (n=%d)", err, n) } if string(buf) != data { - t.Fatalf("expected '%s' but got '%s'", string(data), string(buf)) + t.Errorf("expected '%s' but got '%s'", string(data), string(buf)) } } @@ -65,24 +68,24 @@ func TestCompression(t *testing.T) { checkErr(t, wc.Close(), "closing writer") // make sure Identify correctly chooses this compression method - format, stream, err := Identify(testFilename, compressed) + format, stream, err := Identify(context.Background(), testFilename, compressed) checkErr(t, err, "identifying") - if format.Name() != comp.Name() { - t.Fatalf("expected format %s but got %s", comp.Name(), format.Name()) + if format.Extension() != comp.Extension() { + t.Errorf("expected format %s but got %s", comp.Extension(), format.Extension()) } // read the contents back out and compare decompReader, err := format.(Decompressor).OpenReader(stream) - checkErr(t, err, 
"opening with decompressor '%s'", format.Name()) + checkErr(t, err, "opening with decompressor '%s'", format.Extension()) data, err := io.ReadAll(decompReader) checkErr(t, err, "reading decompressed data") checkErr(t, decompReader.Close(), "closing decompressor") if !bytes.Equal(data, contents) { - t.Fatalf("not equal to original") + t.Errorf("not equal to original") } } - var cannotIdentifyFromStream = map[string]bool{Brotli{}.Name(): true} + var cannotIdentifyFromStream = map[string]bool{Brotli{}.Extension(): true} for _, f := range formats { // only test compressors @@ -91,11 +94,11 @@ func TestCompression(t *testing.T) { continue } - t.Run(f.Name()+"_with_extension", func(t *testing.T) { - testOK(t, comp, "file"+f.Name()) + t.Run(f.Extension()+"_with_extension", func(t *testing.T) { + testOK(t, comp, "file"+f.Extension()) }) - if !cannotIdentifyFromStream[f.Name()] { - t.Run(f.Name()+"_without_extension", func(t *testing.T) { + if !cannotIdentifyFromStream[f.Extension()] { + t.Run(f.Extension()+"_without_extension", func(t *testing.T) { testOK(t, comp, "") }) } @@ -108,7 +111,7 @@ func checkErr(t *testing.T, err error, msgFmt string, args ...any) { return } args = append(args, err) - t.Fatalf(msgFmt+": %s", args...) + t.Errorf(msgFmt+": %s", args...) 
} func TestIdentifyDoesNotMatchContentFromTrimmedKnownHeaderHaving0Suffix(t *testing.T) { @@ -142,13 +145,13 @@ func TestIdentifyDoesNotMatchContentFromTrimmedKnownHeaderHaving0Suffix(t *testi } headerTrimmed := tt.header[:headerLen-1] stream := bytes.NewReader(headerTrimmed) - got, _, err := Identify("", stream) + got, _, err := Identify(context.Background(), "", stream) if got != nil { - t.Errorf("no Format expected for trimmed know %s header: found Format= %v", tt.name, got.Name()) + t.Errorf("no Format expected for trimmed know %s header: found Format= %v", tt.name, got.Extension()) return } - if ErrNoMatch != err { - t.Fatalf("ErrNoMatch expected for for trimmed know %s header: err :=%#v", tt.name, err) + if !errors.Is(err, NoMatch) { + t.Errorf("NoMatch expected for for trimmed know %s header: err :=%#v", tt.name, err) return } @@ -185,13 +188,13 @@ func TestIdentifyCanAssessSmallOrNoContent(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, _, err := Identify("", tt.args.stream) + got, _, err := Identify(context.Background(), "", tt.args.stream) if got != nil { - t.Errorf("no Format expected for non archive and not compressed stream: found Format= %v", got.Name()) + t.Errorf("no Format expected for non archive and not compressed stream: found Format=%#v", got) return } - if ErrNoMatch != err { - t.Fatalf("ErrNoMatch expected for non archive and not compressed stream: err :=%#v", err) + if !errors.Is(err, NoMatch) { + t.Errorf("NoMatch expected for non archive and not compressed stream: %#v", err) return } @@ -206,20 +209,20 @@ func compress( buf := bytes.NewBuffer(make([]byte, 0, 128)) cwriter, err := openwriter(buf) if err != nil { - t.Fatalf("fail to open compression writer: compression-name=%s, err=%#v", compName, err) + t.Errorf("fail to open compression writer: compression-name=%s, err=%#v", compName, err) return nil } _, err = cwriter.Write(content) if err != nil { cerr := cwriter.Close() - t.Fatalf( + t.Errorf( 
"fail to write using compression writer: compression-name=%s, err=%#v, close-err=%#v", compName, err, cerr) return nil } err = cwriter.Close() if err != nil { - t.Fatalf("fail to close compression writer: compression-name=%s, err=%#v", compName, err) + t.Errorf("fail to close compression writer: compression-name=%s, err=%#v", compName, err) return nil } return buf.Bytes() @@ -235,7 +238,7 @@ func archive(t *testing.T, arch Archiver, fname string, fileInfo fs.FileInfo) [] buf := bytes.NewBuffer(make([]byte, 0, 128)) err := arch.Archive(context.TODO(), buf, files) if err != nil { - t.Fatalf("fail to create archive: err=%#v", err) + t.Errorf("fail to create archive: err=%#v", err) return nil } return buf.Bytes() @@ -251,29 +254,24 @@ func newWriteNopCloser(w io.Writer) (io.WriteCloser, error) { } func newTmpTextFile(t *testing.T, content string) (string, fs.FileInfo) { - tmpTxtFile, err := os.CreateTemp("", "TestIdentifyFindFormatByStreamContent-tmp-*.txt") if err != nil { - t.Fatalf("fail to create tmp test file for archive tests: err=%v", err) + t.Errorf("fail to create tmp test file for archive tests: err=%v", err) return "", nil } fname := tmpTxtFile.Name() if _, err = tmpTxtFile.Write([]byte(content)); err != nil { - tmpTxtFile.Close() - os.Remove(fname) - t.Fatalf("fail to write content to tmp-txt-file: err=%#v", err) + t.Errorf("fail to write content to tmp-txt-file: err=%#v", err) return "", nil } if err = tmpTxtFile.Close(); err != nil { - os.Remove(fname) - t.Fatalf("fail to close tmp-txt-file: err=%#v", err) + t.Errorf("fail to close tmp-txt-file: err=%#v", err) return "", nil } fi, err := os.Stat(fname) if err != nil { - os.Remove(fname) - t.Fatalf("fail to get tmp-txt-file stats: err=%v", err) + t.Errorf("fail to get tmp-txt-file stats: err=%v", err) return "", nil } @@ -281,9 +279,9 @@ func newTmpTextFile(t *testing.T, content string) (string, fs.FileInfo) { } func TestIdentifyFindFormatByStreamContent(t *testing.T) { - tmpTxtFileName, tmpTxtFileInfo := 
newTmpTextFile(t, "this is text") + tmpTxtFileName, tmpTxtFileInfo := newTmpTextFile(t, "this is text that has to be long enough for brotli to match") t.Cleanup(func() { - os.Remove(tmpTxtFileName) + os.RemoveAll(tmpTxtFileName) }) tests := []struct { @@ -293,7 +291,13 @@ func TestIdentifyFindFormatByStreamContent(t *testing.T) { compressorName string wantFormatName string }{ - //TODO add test case for brotli when Brotli.Match() by stream content is implemented + { + name: "should recognize brotli", + openCompressionWriter: Brotli{}.OpenWriter, + content: []byte("this is text, but it has to be long enough to match brotli which doesn't have a magic number"), + compressorName: ".br", + wantFormatName: ".br", + }, { name: "should recognize bz2", openCompressionWriter: Bz2{}.OpenWriter, @@ -389,13 +393,13 @@ func TestIdentifyFindFormatByStreamContent(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { stream := bytes.NewReader(compress(t, tt.compressorName, tt.content, tt.openCompressionWriter)) - got, _, err := Identify("", stream) + got, _, err := Identify(context.Background(), "", stream) if err != nil { - t.Fatalf("should have found a corresponding Format: err :=%+v", err) + t.Errorf("should have found a corresponding Format, but got err=%+v", err) return } - if tt.wantFormatName != got.Name() { - t.Errorf("unexpected format found: expected=%s actual:%s", tt.wantFormatName, got.Name()) + if tt.wantFormatName != got.Extension() { + t.Errorf("unexpected format found: expected=%s actual=%s", tt.wantFormatName, got.Extension()) return } @@ -408,10 +412,10 @@ func TestIdentifyAndOpenZip(t *testing.T) { checkErr(t, err, "opening zip") defer f.Close() - format, reader, err := Identify("test.zip", f) + format, reader, err := Identify(context.Background(), "test.zip", f) checkErr(t, err, "identifying zip") - if format.Name() != ".zip" { - t.Fatalf("unexpected format found: expected=.zip actual:%s", format.Name()) + if format.Extension() != ".zip" 
{ + t.Errorf("unexpected format found: expected=.zip actual=%s", format.Extension()) } err = format.(Extractor).Extract(context.Background(), reader, nil, func(ctx context.Context, f FileInfo) error { @@ -430,25 +434,26 @@ func TestIdentifyASCIIFileStartingWithX(t *testing.T) { // Create a temporary file starting with the letter 'x' tmpFile, err := os.CreateTemp("", "TestIdentifyASCIIFileStartingWithX-tmp-*.txt") if err != nil { - t.Fatalf("fail to create tmp test file for archive tests: err=%v", err) + t.Errorf("fail to create tmp test file for archive tests: err=%v", err) } + defer os.Remove(tmpFile.Name()) _, err = tmpFile.Write([]byte("xThis is a test file")) if err != nil { - t.Fatalf("Failed to write to temp file: %v", err) + t.Errorf("Failed to write to temp file: %v", err) } tmpFile.Close() // Open the file and use the Identify function file, err := os.Open(tmpFile.Name()) if err != nil { - t.Fatalf("Failed to open temp file: %v", err) + t.Errorf("Failed to open temp file: %v", err) } defer file.Close() - _, _, err = Identify(tmpFile.Name(), file) - if !errors.Is(err, ErrNoMatch) { - t.Fatalf("Identify failed: %v", err) + _, _, err = Identify(context.Background(), tmpFile.Name(), file) + if !errors.Is(err, NoMatch) { + t.Errorf("Identify failed: %v", err) } } diff --git a/fs.go b/fs.go index c5f25bda..560727b5 100644 --- a/fs.go +++ b/fs.go @@ -10,48 +10,78 @@ import ( "path" "path/filepath" "slices" - "sort" "strings" "time" ) -// FileSystem opens the file at root as a read-only file system. The root may be a -// path to a directory, archive file, compressed archive file, compressed file, or -// any other file on disk. +// FileSystem identifies the format of the input and returns a read-only file system. +// The input can be a filename, stream, or both. // -// If root is a directory, its contents are accessed directly from the disk's file system. 
-// If root is an archive file, its contents can be accessed like a normal directory; -// compressed archive files are transparently decompressed as contents are accessed. -// And if root is any other file, it is the only file in the file system; if the file -// is compressed, it is transparently decompressed when read from. +// If only a filename is specified, it may be a path to a directory, archive file, +// compressed archive file, compressed regular file, or any other regular file on +// disk. If the filename is a directory, its contents are accessed directly from +// the device's file system. If the filename is an archive file, the contents can +// be accessed like a normal directory; compressed archive files are transparently +// decompressed as contents are accessed. And if the filename is any other file, it +// is the only file in the returned file system; if the file is compressed, it is +// transparently decompressed when read from. // -// This method essentially offers uniform read access to various kinds of files: -// directories, archives, compressed archives, and individual files are all treated -// the same way. +// If a stream is specified, the filename (if available) is used as a hint to help +// identify its format. Streams of archive files must be able to be made into an +// io.SectionReader (for safe concurrency) which requires io.ReaderAt and io.Seeker +// (to efficiently determine size). The automatic format identification requires +// io.Reader and will use io.Seeker if supported to avoid buffering. // -// Except for zip files, the returned FS values are guaranteed to be fs.ReadDirFS and -// fs.StatFS types, and may also be fs.SubFS. -func FileSystem(ctx context.Context, root string) (fs.FS, error) { - info, err := os.Stat(root) - if err != nil { - return nil, err - } +// Whether the data comes from disk or a stream, it is peeked at to automatically +// detect which format to use. 
+// +// This function essentially offers uniform read access to various kinds of files: +// directories, archives, compressed archives, individual files, and file streams +// are all treated the same way. +// +// NOTE: The performance of compressed tar archives is not great due to overhead +// with decompression. However, the fs.WalkDir() use case has been optimized to +// create an index on first call to ReadDir(). +func FileSystem(ctx context.Context, filename string, stream ReaderAtSeeker) (fs.FS, error) { + if filename == "" && stream == nil { + return nil, errors.New("no input") + } + + // if an input stream is specified, we'll use that for identification + // and for ArchiveFS (if it's an archive); but if not, we'll open the + // file and read it for identification, but in that case we won't want + // to also use it for the ArchiveFS (because we need to close what we + // opened, and ArchiveFS opens its own files), hence this separate var + idStream := stream + + // if input is only a filename (no stream), check if it's a directory; + // if not, open it so we can determine which format to use (filename + // is not always a good indicator of file format) + if filename != "" && stream == nil { + info, err := os.Stat(filename) + if err != nil { + return nil, err + } - // real folders can be accessed easily - if info.IsDir() { - return os.DirFS(root), nil - } + // real folders can be accessed easily + if info.IsDir() { + return os.DirFS(filename), nil + } - // if any archive formats recognize this file, access it like a folder - file, err := os.Open(root) - if err != nil { - return nil, err + // if any archive formats recognize this file, access it like a folder + file, err := os.Open(filename) + if err != nil { + return nil, err + } + defer file.Close() + idStream = file // use file for format identification only } - defer file.Close() - format, _, err := Identify(filepath.Base(root), file) - if errors.Is(err, ErrNoMatch) { - return FileFS{Path: root}, nil // 
must be an ordinary file + // normally, callers should use the Reader value returned from Identify, but + // our input is a Seeker, so we know the original input value gets returned + format, _, err := Identify(ctx, filepath.Base(filename), idStream) + if errors.Is(err, NoMatch) { + return FileFS{Path: filename}, nil // must be an ordinary file } if err != nil { return nil, fmt.Errorf("identify format: %w", err) @@ -59,38 +89,41 @@ func FileSystem(ctx context.Context, root string) (fs.FS, error) { switch fileFormat := format.(type) { case Extractor: - return &ArchiveFS{Path: root, Format: fileFormat, Context: ctx}, nil - - // case Zip: - // // zip.Reader is more performant than ArchiveFS, because zip.Reader caches content information - // // and zip.Reader can open several content files concurrently because of io.ReaderAt requirement - // // while ArchiveFS can't. - // // zip.Reader doesn't suffer from issue #330 and #310 according to local test (but they should be fixed anyway) - - // // open the file anew, as our original handle will be closed when we return - // file, err := os.Open(root) - // if err != nil { - // return nil, err - // } - // return zip.NewReader(file, info.Size()) - // case Archival: - // // TODO: we only really need Extractor and Decompressor here, not the combined interfaces... 
- - // // open the file anew, as our original handle will be closed when we return - // file, err := os.Open(root) - // if err != nil { - // return nil, err - // } - // info, err := file.Stat() - // if err != nil { - // return nil, err - // } - // return ArchiveFS{Stream: io.NewSectionReader(file, 0, info.Size()), Format: fileFormat, Context: ctx}, nil + // if no stream was input, return an ArchiveFS that relies on the filepath + if stream == nil { + return &ArchiveFS{Path: filename, Format: fileFormat, Context: ctx}, nil + } + + // otherwise, if a stream was input, return an ArchiveFS that relies on that + + // determine size -- we know that the stream value we get back from + // Identify is the same type as what we input because it is a Seeker + size, err := stream.Seek(0, io.SeekEnd) + if err != nil { + return nil, fmt.Errorf("seeking for size: %w", err) + } + _, err = stream.Seek(0, io.SeekStart) + if err != nil { + return nil, fmt.Errorf("seeking back to beginning: %w", err) + } + + sr := io.NewSectionReader(stream, 0, size) + + return &ArchiveFS{Stream: sr, Format: fileFormat, Context: ctx}, nil + case Compression: - return FileFS{Path: root, Compression: fileFormat}, nil + return FileFS{Path: filename, Compression: fileFormat}, nil } - return nil, fmt.Errorf("unable to create file system rooted at %s due to unsupported file or folder type", root) + return nil, fmt.Errorf("unable to create file system rooted at %s due to unsupported file or folder type", filename) +} + +// ReaderAtSeeker is a type that can read, read at, and seek. +// os.File and io.SectionReader both implement this interface. +type ReaderAtSeeker interface { + io.Reader + io.ReaderAt + io.Seeker } // FileFS allows accessing a file on disk using a consistent file system interface. 
@@ -125,7 +158,15 @@ func (f FileFS) Open(name string) (fs.File, error) { if err != nil { return nil, err } - return compressedFile{file, r}, nil + return compressedFile{r, closeBoth{file, r}}, nil +} + +// Stat stats the named file, which must be the file used to create the file system. +func (f FileFS) Stat(name string) (fs.FileInfo, error) { + if err := f.checkName(name, "stat"); err != nil { + return nil, err + } + return os.Stat(f.Path) } // ReadDir returns a directory listing with the file as the singular entry. @@ -140,23 +181,18 @@ func (f FileFS) ReadDir(name string) ([]fs.DirEntry, error) { return []fs.DirEntry{fs.FileInfoToDirEntry(info)}, nil } -// Stat stats the named file, which must be the file used to create the file system. -func (f FileFS) Stat(name string) (fs.FileInfo, error) { - if err := f.checkName(name, "stat"); err != nil { - return nil, err - } - return os.Stat(f.Path) -} - // checkName ensures the name is a valid path and also, in the case of // the FileFS, that it is either ".", the filename originally passed in // to create the FileFS, or the base of the filename (name without path). // Other names do not make sense for a FileFS since the FS is only 1 file. func (f FileFS) checkName(name, op string) error { + if name == f.Path { + return nil + } if !fs.ValidPath(name) { - return &fs.PathError{Op: "open", Path: name, Err: fs.ErrInvalid} + return &fs.PathError{Op: op, Path: name, Err: fs.ErrInvalid} } - if name != "." && name != f.Path && name != filepath.Base(f.Path) { + if name != "." && name != filepath.Base(f.Path) { return &fs.PathError{Op: op, Path: name, Err: fs.ErrNotExist} } return nil @@ -166,18 +202,8 @@ func (f FileFS) checkName(name, op string) error { // from a decompression reader, and which closes both // that reader and the underlying file. 
type compressedFile struct { - *os.File - decomp io.ReadCloser -} - -func (cf compressedFile) Read(p []byte) (int, error) { return cf.decomp.Read(p) } -func (cf compressedFile) Close() error { - err := cf.File.Close() - err2 := cf.decomp.Close() - if err2 != nil && err == nil { - err = err2 - } - return err + io.Reader // decompressor + closeBoth // file and decompressor } // ArchiveFS allows reading an archive (or a compressed archive) using a @@ -191,17 +217,39 @@ func (cf compressedFile) Close() error { // access the stream, enabling safe, concurrent access. // // NOTE: Due to Go's file system APIs (see package io/fs), the performance -// of ArchiveFS when used with fs.WalkDir() is poor for archives with lots -// of files (see issue #326). The fs.WalkDir() API requires listing each -// directory's contents in turn, and the only way to ensure we return the -// complete list of folder contents is to traverse the whole archive and -// build a slice; so if this is done for the root of an archive with many -// files, performance tends toward O(n^2) as the entire archive is walked -// for every folder that is enumerated (WalkDir calls ReadDir recursively). -// If you do not need each directory's contents walked in order, please -// prefer calling Extract() from an archive type directly; this will perform -// a O(n) walk of the contents in archive order, rather than the slower -// directory tree order. +// of ArchiveFS can suffer when using fs.WalkDir(). To mitigate this, +// an optimized fs.ReadDirFS has been implemented that indexes the entire +// archive on the first call to ReadDir() (since the entire archive needs +// to be walked for every call to ReadDir() anyway, as archive contents are +// often unordered). The first call to ReadDir(), i.e. near the start of the +// walk, will be slow for large archives, but should be instantaneous after. 
+// If you don't care about walking a file system in directory order, consider +// calling Extract() on the underlying archive format type directly, which +// walks the archive in entry order, without needing to do any sorting. +// +// Note that fs.FS implementations, including this one, reject paths starting +// with "./". This can be problematic sometimes, as it is not uncommon for +// tarballs to contain a top-level/root directory literally named ".", which +// can happen if a tarball is created in the same directory it is archiving. +// The underlying Extract() calls are faithful to entries with this name, +// but file systems have certain semantics around "." that restrict its use. +// For example, a file named "." cannot be created on a real file system +// because it is a special name that means "current directory". +// +// We had to decide whether to honor the true name in the archive, or honor +// file system semantics. Given that this is a virtual file system and other +// code using the fs.FS APIs will trip over a literal directory named ".", +// we choose to honor file system semantics. Files named "." are ignored; +// directories with this name are effectively transparent; their contents +// get promoted up a directory/level. This means a file at "./x" where "." +// is a literal directory name, its name will be passed in as "x" in +// WalkDir callbacks. If you need the raw, uninterpeted values from an +// archive, use the formats' Extract() method directly. See +// https://github.com/golang/go/issues/70155 for a little more background. +// +// This does have one negative edge case... a tar containing contents like +// [x . ./x] will have a conflict on the file named "x" because "./x" will +// also be accessed with the name of "x". type ArchiveFS struct { // set one of these Path string // path to the archive file on disk, or... 
@@ -209,10 +257,10 @@ type ArchiveFS struct { Format Extractor // the archive format Prefix string // optional subdirectory in which to root the fs - Context context.Context // optional + Context context.Context // optional; mainly for cancellation - // TODO: probably put a mutex in here; the thing has to be a pointer to compile anyway - contents map[string]FileInfo + // amortizing cache speeds up walks (esp. ReadDir) + contents map[string]fs.FileInfo dirs map[string][]fs.DirEntry } @@ -226,14 +274,35 @@ func (f ArchiveFS) context() context.Context { // Open opens the named file from within the archive. If name is "." then // the archive file itself will be opened as a directory file. -func (f *ArchiveFS) Open(name string) (fs.File, error) { +func (f ArchiveFS) Open(name string) (fs.File, error) { if !fs.ValidPath(name) { - return nil, &fs.PathError{Op: "open", Path: name, Err: fs.ErrInvalid} + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)} } - var archiveFile fs.File + // apply prefix if fs is rooted in a subtree + name = path.Join(f.Prefix, name) + + // if we've already indexed the archive, we can know quickly if the file doesn't exist, + // and we can also return directory files with their entries instantly + if f.contents != nil { + if info, found := f.contents[name]; found { + if info.IsDir() { + if entries, ok := f.dirs[name]; ok { + return &dirFile{info: info, entries: entries}, nil + } + } + } else { + if entries, found := f.dirs[name]; found { + return &dirFile{info: implicitDirInfo{implicitDirEntry{name}}, entries: entries}, nil + } + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("open %s: %w", name, fs.ErrNotExist)} + } + } + + // if a filename is specified, open the archive file + var archiveFile *os.File var err error - if f.Path != "" { + if f.Stream == nil { archiveFile, err = os.Open(f.Path) if err != nil { return nil, err @@ -250,206 +319,129 @@ func (f *ArchiveFS) Open(name 
string) (fs.File, error) { return nil, fmt.Errorf("no input; one of Path or Stream must be set") } - // apply prefix if fs is rooted in a subtree - name = path.Join(f.Prefix, name) - // handle special case of opening the archive root - if name == "." && archiveFile != nil { - archiveInfo, err := archiveFile.Stat() + if name == "." { + var archiveInfo fs.FileInfo + if archiveFile != nil { + archiveInfo, err = archiveFile.Stat() + if err != nil { + return nil, err + } + } else { + archiveInfo = implicitDirInfo{ + implicitDirEntry{"."}, + } + } + var entries []fs.DirEntry + entries, err = f.ReadDir(name) if err != nil { return nil, err } - entries, err := f.ReadDir(name) - if err != nil { + if err := archiveFile.Close(); err != nil { return nil, err } return &dirFile{ - extractedFile: extractedFile{ - FileInfo: FileInfo{ - FileInfo: dirFileInfo{archiveInfo}, - NameInArchive: ".", - }, - }, + info: dirFileInfo{archiveInfo}, entries: entries, }, nil } - var ( - files []FileInfo - found bool - ) - // collect them all or stop at exact file match, note we don't stop at folder match - handler := func(_ context.Context, file FileInfo) error { - file.NameInArchive = strings.Trim(file.NameInArchive, "/") - files = append(files, file) - if file.NameInArchive == name && !file.IsDir() { - found = true - return errStopWalk - } - return nil - } - var inputStream io.Reader if f.Stream == nil { - // when the archive file is closed, any (soon-to-be) associated decompressor should also be closed; see #365 - archiveFile = &closeBoth{File: archiveFile} inputStream = archiveFile } else { inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size()) } - err = f.Format.Extract(f.context(), inputStream, []string{name}, handler) - if found { - err = nil - } - if err != nil { - return nil, err - } - - if len(files) == 0 { - return nil, fs.ErrNotExist + var decompressor io.ReadCloser + if caf, ok := f.Format.(CompressedArchive); ok { + if caf.Compression != nil { + decompressor, err = 
caf.Compression.OpenReader(inputStream) + if err != nil { + return nil, err + } + inputStream = decompressor + } } - // exactly one or exact file found, test name match to detect implicit dir name https://github.com/mholt/archiver/issues/340 - if (len(files) == 1 && files[0].NameInArchive == name) || found { - file := files[len(files)-1] - if file.IsDir() { - return &dirFile{extractedFile: extractedFile{FileInfo: file}}, nil + // prepare the handler that we'll need if we have to iterate the + // archive to find the file being requested + var fsFile fs.File + handler := func(ctx context.Context, file FileInfo) error { + if err := ctx.Err(); err != nil { + return err } - // if named file is not a regular file, it can't be opened - if !file.Mode().IsRegular() { - return extractedFile{FileInfo: file}, nil + // paths in archives can't necessarily be trusted; also clean up any "./" prefix + file.NameInArchive = path.Clean(file.NameInArchive) + + if !strings.HasPrefix(file.NameInArchive, name) { + return nil } - // regular files can be read, so open it for reading - rc, err := file.Open() - if err != nil { - return nil, err + // if this is the requested file, and it's a directory, set up the dirFile, + // which will include a listing of all its contents as we continue the walk + if file.NameInArchive == name && file.IsDir() { + fsFile = &dirFile{info: file} // will fill entries slice as we continue the walk + return nil } - return extractedFile{FileInfo: file, archiveCloser: archiveCloser{rc, archiveFile}}, nil - } - // implicit files - files = fillImplicit(files) - file, foundFile := search(name, files) - if !foundFile { - return nil, fs.ErrNotExist - } + // if the named file was a directory and we are filling its entries, + // add this entry to the list + if df, ok := fsFile.(*dirFile); ok { + df.entries = append(df.entries, fs.FileInfoToDirEntry(file)) - if file.IsDir() { - return &dirFile{extractedFile: extractedFile{FileInfo: file}, entries: openReadDir(name, 
files)}, nil - } + // don't traverse into subfolders + if file.IsDir() { + return fs.SkipDir + } - // very unlikely - // maybe just panic, because extractor already walk through all the entries, file is impossible to read - // unless it's from a zip file. + return nil + } - // if named file is not a regular file, it can't be opened - if !file.Mode().IsRegular() { - return extractedFile{FileInfo: file}, nil - } + innerFile, err := file.Open() + if err != nil { + return err + } - // regular files can be read, so open it for reading - rc, err := file.Open() - if err != nil { - return nil, err - } - return extractedFile{FileInfo: file, archiveCloser: archiveCloser{rc, archiveFile}}, nil -} + fsFile = closeBoth{File: innerFile, c: archiveFile} -// copy of the same function from zip -func split(name string) (dir, elem string, isDir bool) { - if name[len(name)-1] == '/' { - isDir = true - name = name[:len(name)-1] - } - i := len(name) - 1 - for i >= 0 && name[i] != '/' { - i-- - } - if i < 0 { - return ".", name, isDir + if decompressor != nil { + fsFile = closeBoth{fsFile, decompressor} + } + + return fs.SkipAll } - return name[:i], name[i+1:], isDir -} -// modified from zip.Reader initFileList, it's used to find all implicit dirs -func fillImplicit(files []FileInfo) []FileInfo { - dirs := make(map[string]bool) - knownDirs := make(map[string]bool) - entries := make([]FileInfo, 0) - for _, file := range files { - for dir := path.Dir(file.NameInArchive); dir != "."; dir = path.Dir(dir) { - dirs[dir] = true - } - entries = append(entries, file) - if file.IsDir() { - knownDirs[file.NameInArchive] = true - } + // when we start the walk, we pass in a nil list of files to extract, since + // files may have a "." component in them, and the underlying format doesn't + // know about our file system semantics, so we need to filter ourselves (it's + // not significantly less efficient). 
+ if caf, ok := f.Format.(CompressedArchive); ok { + // bypass the CompressedArchive format's opening of the decompressor, since + // we already did it, since we need to keep it open after returning + // "I BYPASSED THE COMPRESSOR!" -Rey + err = caf.Archival.Extract(f.context(), inputStream, nil, handler) + } else { + err = f.Format.Extract(f.context(), inputStream, nil, handler) } - for dir := range dirs { - if !knownDirs[dir] { - entries = append(entries, FileInfo{FileInfo: implicitDirInfo{implicitDirEntry{path.Base(dir)}}, NameInArchive: dir}) - } + if err != nil { + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("extract: %w", err)} } - - sort.Slice(entries, func(i, j int) bool { - fi, fj := entries[i], entries[j] - di, ei, _ := split(fi.NameInArchive) - dj, ej, _ := split(fj.NameInArchive) - - if di != dj { - return di < dj - } - return ei < ej - }) - return entries -} - -// modified from zip.Reader openLookup -func search(name string, entries []FileInfo) (FileInfo, bool) { - dir, elem, _ := split(name) - i := sort.Search(len(entries), func(i int) bool { - idir, ielem, _ := split(entries[i].NameInArchive) - return idir > dir || idir == dir && ielem >= elem - }) - if i < len(entries) { - fname := entries[i].NameInArchive - if fname == name || len(fname) == len(name)+1 && fname[len(name)] == '/' && fname[:len(name)] == name { - return entries[i], true - } + if fsFile == nil { + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("open %s: %w", name, fs.ErrNotExist)} } - return FileInfo{}, false -} -// modified from zip.Reader openReadDir -func openReadDir(dir string, entries []FileInfo) []fs.DirEntry { - i := sort.Search(len(entries), func(i int) bool { - idir, _, _ := split(entries[i].NameInArchive) - return idir >= dir - }) - j := sort.Search(len(entries), func(j int) bool { - jdir, _, _ := split(entries[j].NameInArchive) - return jdir > dir - }) - dirs := make([]fs.DirEntry, j-i) - for idx := range dirs { - dirs[idx] = 
fs.FileInfoToDirEntry(entries[i+idx]) - } - return dirs + return fsFile, nil } // Stat stats the named file from within the archive. If name is "." then // the archive file itself is statted and treated as a directory file. -func (f *ArchiveFS) Stat(name string) (fs.FileInfo, error) { +func (f ArchiveFS) Stat(name string) (fs.FileInfo, error) { if !fs.ValidPath(name) { - return nil, &fs.PathError{Op: "stat", Path: name, Err: fs.ErrInvalid} + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("%s: %w", name, fs.ErrInvalid)} } - // apply prefix if fs is rooted in a subtree - name = path.Join(f.Prefix, name) - if name == "." { if f.Path != "" { fileInfo, err := os.Stat(f.Path) @@ -462,6 +454,17 @@ func (f *ArchiveFS) Stat(name string) (fs.FileInfo, error) { } } + // apply prefix if fs is rooted in a subtree + name = path.Join(f.Prefix, name) + + // if archive has already been indexed, simply use it + if f.contents != nil { + if info, ok := f.contents[name]; ok { + return info, nil + } + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat %s: %w", name, fs.ErrNotExist)} + } + var archiveFile *os.File var err error if f.Stream == nil { @@ -472,16 +475,14 @@ func (f *ArchiveFS) Stat(name string) (fs.FileInfo, error) { defer archiveFile.Close() } - var ( - files []FileInfo - found bool - ) - handler := func(_ context.Context, file FileInfo) error { - file.NameInArchive = strings.Trim(file.NameInArchive, "/") - files = append(files, file) - if file.NameInArchive == name { - found = true - return errStopWalk + var result FileInfo + handler := func(ctx context.Context, file FileInfo) error { + if err := ctx.Err(); err != nil { + return err + } + if path.Clean(file.NameInArchive) == name { + result = file + return fs.SkipAll } return nil } @@ -489,37 +490,26 @@ func (f *ArchiveFS) Stat(name string) (fs.FileInfo, error) { if f.Stream != nil { inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size()) } - err = 
f.Format.Extract(f.context(), inputStream, []string{name}, handler) - if found { - err = nil - } - if err != nil { + err = f.Format.Extract(f.context(), inputStream, nil, handler) + if err != nil && result.FileInfo == nil { return nil, err } - - // exactly one or exact file found, test name match to detect implicit dir name https://github.com/mholt/archiver/issues/340 - if (len(files) == 1 && files[0].NameInArchive == name) || found { - return files[len(files)-1].FileInfo, nil - } - - files = fillImplicit(files) - file, found := search(name, files) - if !found { + if result.FileInfo == nil { return nil, fs.ErrNotExist } - return file.FileInfo, nil + return result.FileInfo, nil } -// TODO: ReadDir, Open, Stat, etc, all involve a walk of up to the entire archive, -// which is slow when using fs.WalkDir -- which calls ReadDir many times, and then if -// we call Open for each file, it's exponentially slow! Can we potentially add memory or something? - -// ReadDir reads the named directory from within the archive. +// ReadDir reads the named directory from within the archive. If name is "." +// then the root of the archive content is listed. 
func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { if !fs.ValidPath(name) { return nil, &fs.PathError{Op: "readdir", Path: name, Err: fs.ErrInvalid} } + // apply prefix if fs is rooted in a subtree + name = path.Join(f.Prefix, name) + // fs.WalkDir() calls ReadDir() once per directory, and for archives with // lots of directories, that is very slow, since we have to traverse the // entire archive in order to ensure that we got all the entries for a @@ -529,7 +519,7 @@ func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { return f.dirs[name], nil } - f.contents = make(map[string]FileInfo) + f.contents = make(map[string]fs.FileInfo) f.dirs = make(map[string][]fs.DirEntry) var archiveFile *os.File @@ -542,10 +532,11 @@ func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { defer archiveFile.Close() } - // apply prefix if fs is rooted in a subtree - name = path.Join(f.Prefix, name) + handler := func(ctx context.Context, file FileInfo) error { + if err := ctx.Err(); err != nil { + return err + } - handler := func(_ context.Context, file FileInfo) error { // can't always trust path names file.NameInArchive = path.Clean(file.NameInArchive) @@ -583,6 +574,10 @@ func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { // the base (last component) of the path until no separators remain, i.e. only // one component remains -- then loop again to make sure it's not a duplicate for dir, base := path.Dir(file.NameInArchive), path.Base(file.NameInArchive); ; dir, base = path.Dir(dir), path.Base(dir) { + if err := ctx.Err(); err != nil { + return err + } + var dirInfo fs.DirEntry = implicitDirInfo{implicitDirEntry{base}} // we are "filling in" any directories that could potentially be only implicit, @@ -605,18 +600,12 @@ func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { return nil } - // handle special case of reading from root of archive - var filter []string - if name != "." 
{ - filter = []string{name} - } - var inputStream io.Reader = archiveFile if f.Stream != nil { inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size()) } - err = f.Format.Extract(f.context(), inputStream, filter, handler) + err = f.Format.Extract(f.context(), inputStream, nil, handler) if err != nil { // these being non-nil implies that we have indexed the archive, // but if an error occurred, we likely only got part of the way @@ -624,26 +613,9 @@ func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { // the whole thing anyway; so reset these to nil to avoid bugs f.dirs = nil f.contents = nil - return nil, err + return nil, fmt.Errorf("extract: %w", err) } - // // always find all implicit directories - // files = fillImplicit(files) - // // and return early for dot file - // if name == "." { - // return openReadDir(name, files), nil - // } - - // file, foundFile := search(name, files) - // if !foundFile { - // return nil, fs.ErrNotExist - // } - - // if !file.IsDir() { - // return nil, &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a dir")} - // } - // return openReadDir(name, files), nil - return f.dirs[name], nil } @@ -733,34 +705,18 @@ func pathWithoutTopDir(fpath string) string { return fpath[slashIdx+1:] } -// errStopWalk is an arbitrary error value, since returning -// any error (other than fs.SkipDir) will stop a walk. We -// use this as we may only want 1 file from an extraction, -// even if that file is a directory and would otherwise be -// traversed during the walk. -var errStopWalk = fmt.Errorf("stop walk") - // dirFile implements the fs.ReadDirFile interface. type dirFile struct { - extractedFile - - // TODO: We could probably be more memory-efficient by not loading - // all the entries at once and then "faking" the paging for ReadDir(). 
- // Instead, we could maybe store a reference to the parent archive FS, - // then walk it each time ReadDir is called, skipping entriesRead - // files, then continuing the listing, until n are listed. But that - // might be kinda messy and a lot of work, so I leave it for a future - // optimization if needed. + info fs.FileInfo entries []fs.DirEntry - entriesRead int + entriesRead int // used for paging with ReadDir(n) } -// If this represents the root of the archive, we use the archive's -// FileInfo which says it's a file, not a directory; the whole point -// of this package is to treat the archive as a directory, so always -// return true in our case. -func (dirFile) IsDir() bool { return true } +func (dirFile) Read([]byte) (int, error) { return 0, errors.New("cannot read a directory file") } +func (df dirFile) Stat() (fs.FileInfo, error) { return df.info, nil } +func (dirFile) Close() error { return nil } +// ReadDir implements [fs.ReadDirFile]. func (df *dirFile) ReadDir(n int) ([]fs.DirEntry, error) { if n <= 0 { return df.entries, nil @@ -789,62 +745,14 @@ func (dirFileInfo) Size() int64 { return 0 } func (info dirFileInfo) Mode() fs.FileMode { return info.FileInfo.Mode() | fs.ModeDir } func (dirFileInfo) IsDir() bool { return true } -// archiveCloser is an fs.File that, when closed, will also close the parent archive. -// This is useful sometimes when extracting a single file from an archive and using it -// in a different place from which the archive was opened. -type archiveCloser struct { - // The extracted file that came out of the archive. - fs.File - - // The archive file that contained the extracted file. 
- Archive io.Closer -} - -func (ac archiveCloser) Close() error { - if ac.File != nil { - if err := ac.File.Close(); err != nil { - return fmt.Errorf("closing extracted file: %w", err) - } - } - if ac.Archive != nil { - if err := ac.Archive.Close(); err != nil { - return fmt.Errorf("closing archive: %w", err) - } - } - return nil -} - -// extractedFile implements fs.File, thus it represents an "opened" file, -// which is slightly different from our File type which represents a file -// that possibly may be opened. If the file is actually opened, this type -// ensures that the parent archive is closed when this file from within it -// is also closed. -type extractedFile struct { - FileInfo - - // Set this field if a "regular file" which has actual content - // that can be read, i.e. a file that is open for reading. - // ReadCloser should be the file's reader, and parentArchive is - // a reference to the archive the files comes out of. - // If parentArchive is set, it will also be closed along with - // the file when Close() is called. - archiveCloser -} - -type archivedFile struct { +// fileInArchive represents a file that is opened from within an archive. +// It implements fs.File. +type fileInArchive struct { io.ReadCloser info fs.FileInfo } -func (af archivedFile) Stat() (fs.FileInfo, error) { return af.info, nil } - -// compressorCloser is a type that closes two closers at the same time. -// It only exists to fix #365. If a better solution can be found, I'd -// likely prefer it. -type compressorCloser interface { - io.Closer - closeCompressor(io.Closer) -} +func (af fileInArchive) Stat() (fs.FileInfo, error) { return af.info, nil } // closeBoth closes both the file and an associated // closer, such as a (de)compressor that wraps the @@ -852,28 +760,34 @@ type compressorCloser interface { // better solution is found, I'd probably prefer that. 
type closeBoth struct { fs.File - c io.Closer + c io.Closer // usually the archive or the decompressor } -// closeCompressor will have the closer closed when the associated File closes. -func (dc *closeBoth) closeCompressor(c io.Closer) { dc.c = c } - // Close closes both the file and the associated closer. It always calls -// Close() on both, but returns only the first error, if any. +// Close() on both, but if multiple errors occur they are wrapped together. func (dc closeBoth) Close() error { - err1, err2 := dc.File.Close(), dc.c.Close() - if err1 != nil { - return err1 + var err error + if dc.File != nil { + if err2 := dc.File.Close(); err2 != nil { + err = fmt.Errorf("closing file: %w", err2) + } } - return err2 + if dc.c != nil { + if err2 := dc.c.Close(); err2 != nil { + if err == nil { + err = fmt.Errorf("closing closer: %w", err2) + } else { + err = fmt.Errorf("%w; additionally, closing closer: %w", err, err2) + } + } + } + return err } // implicitDirEntry represents a directory that does // not actually exist in the archive but is inferred // from the paths of actual files in the archive. -type implicitDirEntry struct { - name string -} +type implicitDirEntry struct{ name string } func (e implicitDirEntry) Name() string { return e.name } func (implicitDirEntry) IsDir() bool { return true } @@ -887,9 +801,7 @@ func (e implicitDirEntry) Info() (fs.FileInfo, error) { // not contain actual entries for a directory, but we need to // pretend it exists so its contents can be discovered and // traversed. 
-type implicitDirInfo struct { - implicitDirEntry -} +type implicitDirInfo struct{ implicitDirEntry } func (d implicitDirInfo) Name() string { return d.name } func (implicitDirInfo) Size() int64 { return 0 } @@ -905,6 +817,4 @@ var ( _ fs.ReadDirFS = (*ArchiveFS)(nil) _ fs.StatFS = (*ArchiveFS)(nil) _ fs.SubFS = (*ArchiveFS)(nil) - - _ compressorCloser = (*closeBoth)(nil) ) diff --git a/fs_test.go b/fs_test.go index bbb56b55..5d6a8bd3 100644 --- a/fs_test.go +++ b/fs_test.go @@ -58,11 +58,11 @@ func TestSelfTar(t *testing.T) { fn := "testdata/self-tar.tar" fh, err := os.Open(fn) if err != nil { - t.Fatalf("Could not load test tar: %v", fn) + t.Errorf("Could not load test tar: %v", fn) } fstat, err := os.Stat(fn) if err != nil { - t.Fatalf("Could not stat test tar: %v", fn) + t.Errorf("Could not stat test tar: %v", fn) } fsys := &ArchiveFS{ Stream: io.NewSectionReader(fh, 0, fstat.Size()), @@ -78,7 +78,7 @@ func TestSelfTar(t *testing.T) { return nil }) if err != nil { - t.Fatal(err) + t.Error(err) } } @@ -158,9 +158,7 @@ func TestArchiveFS_ReadDir(t *testing.T) { t.Parallel() fsys := tc.archive for baseDir, wantLS := range tc.want { - baseDir := baseDir - wantLS := wantLS - t.Run(fmt.Sprintf("ReadDir(%s)", baseDir), func(t *testing.T) { + t.Run(fmt.Sprintf("ReadDir(%q)", baseDir), func(t *testing.T) { dis, err := fsys.ReadDir(baseDir) if err != nil { t.Error(err) @@ -183,17 +181,18 @@ func TestArchiveFS_ReadDir(t *testing.T) { t.Run(fmt.Sprintf("Open(%s)", baseDir), func(t *testing.T) { f, err := fsys.Open(baseDir) if err != nil { - t.Error(err) + t.Errorf("fsys.Open(%q): %#v %s", baseDir, err, err) + return } rdf, ok := f.(fs.ReadDirFile) if !ok { - t.Fatalf("'%s' did not return a fs.ReadDirFile, %+v", baseDir, rdf) + t.Errorf("fsys.Open(%q) did not return a fs.ReadDirFile, got: %#v", baseDir, f) } dis, err := rdf.ReadDir(-1) if err != nil { - t.Fatal(err) + t.Error(err) } dirs := []string{} diff --git a/go.mod b/go.mod index 3adbffad..0dacae9b 100644 --- a/go.mod 
+++ b/go.mod @@ -1,26 +1,26 @@ module github.com/mholt/archiver/v4 -go 1.22 +go 1.22.2 -toolchain go1.22.2 +toolchain go1.23.2 require ( - github.com/andybalholm/brotli v1.1.0 + github.com/andybalholm/brotli v1.1.1 github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 - github.com/klauspost/compress v1.17.8 + github.com/klauspost/compress v1.17.11 github.com/klauspost/pgzip v1.2.6 - github.com/nwaples/rardecode/v2 v2.0.0-beta.3 + github.com/nwaples/rardecode/v2 v2.0.0-beta.4 github.com/therootcompany/xz v1.0.1 github.com/ulikunitz/xz v0.5.12 ) require ( github.com/STARRY-S/zip v0.1.0 - github.com/bodgit/sevenzip v1.5.1 + github.com/bodgit/sevenzip v1.5.2 github.com/golang/snappy v0.0.4 github.com/pierrec/lz4/v4 v4.1.21 github.com/sorairolake/lzip-go v0.3.5 - golang.org/x/text v0.16.0 + golang.org/x/text v0.19.0 ) require ( diff --git a/go.sum b/go.sum index e1803b1c..5e844566 100644 --- a/go.sum +++ b/go.sum @@ -19,12 +19,12 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/STARRY-S/zip v0.1.0 h1:eUER3jKmHKXjv+iy3BekLa+QnNSo1Lqz4eTzYBcGDqo= github.com/STARRY-S/zip v0.1.0/go.mod h1:qj/mTZkvb3AvfGQ2e775/3AODRvB4peSw8KNMvrM8/I= -github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= -github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= +github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA= +github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA= github.com/bodgit/plumbing v1.3.0 h1:pf9Itz1JOQgn7vEOE7v7nlEfBykYqvUYioC61TwWCFU= github.com/bodgit/plumbing v1.3.0/go.mod h1:JOTb4XiRu5xfnmdnDJo6GmSbSbtSyufrsyZFByMtKEs= -github.com/bodgit/sevenzip v1.5.1 h1:rVj0baZsooZFy64DJN0zQogPzhPrT8BQ8TTRd1H4WHw= -github.com/bodgit/sevenzip v1.5.1/go.mod 
h1:Q3YMySuVWq6pyGEolyIE98828lOfEoeWg5zeH6x22rc= +github.com/bodgit/sevenzip v1.5.2 h1:acMIYRaqoHAdeu9LhEGGjL9UzBD4RNf9z7+kWDNignI= +github.com/bodgit/sevenzip v1.5.2/go.mod h1:gTGzXA67Yko6/HLSD0iK4kWaWzPlPmLfDO73jTjSRqc= github.com/bodgit/windows v1.0.1 h1:tF7K6KOluPYygXa3Z2594zxlkbKPAOvqr97etrGNIz4= github.com/bodgit/windows v1.0.1/go.mod h1:a6JLwrB4KrTR5hBpp8FI9/9W9jJfeQ2h4XDXU74ZCdM= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= @@ -84,16 +84,16 @@ github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1 github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= -github.com/klauspost/compress v1.17.8 h1:YcnTYrq7MikUT7k0Yb5eceMmALQPYBW/Xltxn0NAMnU= -github.com/klauspost/compress v1.17.8/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= +github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/nwaples/rardecode/v2 v2.0.0-beta.3 h1:evQTW0IjM2GAL5AaPHiQrT+laWohkt5zHKA3yCsGQGU= -github.com/nwaples/rardecode/v2 v2.0.0-beta.3/go.mod h1:yntwv/HfMc/Hbvtq9I19D1n58te3h6KsqCf3GxyfBGY= +github.com/nwaples/rardecode/v2 v2.0.0-beta.4 
h1:sdiJxQdPjECn2lh9nLFFhgLCf+0ulDU5rODbtERTlUY= +github.com/nwaples/rardecode/v2 v2.0.0-beta.4/go.mod h1:yntwv/HfMc/Hbvtq9I19D1n58te3h6KsqCf3GxyfBGY= github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -117,6 +117,8 @@ github.com/therootcompany/xz v1.0.1/go.mod h1:3K3UH1yCKgBneZYhuQUvJ9HPD19UEXEI0B github.com/ulikunitz/xz v0.5.8/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= +github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= +github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= @@ -183,8 +185,8 @@ golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -212,8 +214,8 @@ golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/gz.go b/gz.go index b9873f19..e8b3f98d 100644 --- a/gz.go +++ b/gz.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -29,13 +30,13 @@ type Gz struct { Multithreaded bool } -func (Gz) Name() string { return ".gz" } +func (Gz) Extension() string { return ".gz" } -func (gz Gz) Match(filename string, stream io.Reader) (MatchResult, error) { +func (gz Gz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), gz.Name()) { + if strings.Contains(strings.ToLower(filename), gz.Extension()) { mr.ByName = true } diff --git a/interfaces.go b/interfaces.go index 654511d6..f675f0e2 100644 --- 
a/interfaces.go +++ b/interfaces.go @@ -5,10 +5,12 @@ import ( "io" ) -// Format represents either an archive or compression format. +// Format represents a way of getting data out of something else. +// A format usually represents compression or an archive (or both). type Format interface { - // Name returns the name of the format. - Name() string + // Extension returns the conventional file extension for this + // format. + Extension() string // Match returns true if the given name/stream is recognized. // One of the arguments is optional: filename might be empty @@ -21,7 +23,7 @@ type Format interface { // preserve the stream through matching, you should either // buffer what is read by Match, or seek to the last position // before Match was called. - Match(filename string, stream io.Reader) (MatchResult, error) + Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error) } // Compression is a compression format with both compress and decompress methods. @@ -83,14 +85,20 @@ type ArchiverAsync interface { // Extractor can extract files from an archive. type Extractor interface { - // Extract reads the files at pathsInArchive from sourceArchive. + // Extract walks entries in the archive and calls handleFile for each + // entry that matches the pathsInArchive filter by path/name. + // // If pathsInArchive is nil, all files are extracted without discretion. // If pathsInArchive is empty, no files are extracted. // If a path refers to a directory, all files within it are extracted. // Extracted files are passed to the handleFile callback for handling. // + // Any files opened in the FileHandler should be closed when it returns, + // as there is no guarantee the files can be read outside the handler + // or after the walk has proceeded to the next file. + // // Context cancellation must be honored. 
- Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchive []string, handleFile FileHandler) error + Extract(ctx context.Context, archive io.Reader, pathsInArchive []string, handleFile FileHandler) error } // Inserter can insert files into an existing archive. diff --git a/lz4.go b/lz4.go index aaa22a54..7425ad2a 100644 --- a/lz4.go +++ b/lz4.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -17,13 +18,13 @@ type Lz4 struct { CompressionLevel int } -func (Lz4) Name() string { return ".lz4" } +func (Lz4) Extension() string { return ".lz4" } -func (lz Lz4) Match(filename string, stream io.Reader) (MatchResult, error) { +func (lz Lz4) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), lz.Name()) { + if strings.Contains(strings.ToLower(filename), lz.Extension()) { mr.ByName = true } diff --git a/lzip.go b/lzip.go index a861a487..1cbffa50 100644 --- a/lzip.go +++ b/lzip.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "path/filepath" "strings" @@ -16,13 +17,13 @@ func init() { // Lzip facilitates lzip compression. 
type Lzip struct{} -func (Lzip) Name() string { return ".lz" } +func (Lzip) Extension() string { return ".lz" } -func (lz Lzip) Match(filename string, stream io.Reader) (MatchResult, error) { +func (lz Lzip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if filepath.Ext(strings.ToLower(filename)) == lz.Name() { + if filepath.Ext(strings.ToLower(filename)) == lz.Extension() { mr.ByName = true } diff --git a/rar.go b/rar.go index 8cfc93b5..bece6071 100644 --- a/rar.go +++ b/rar.go @@ -30,13 +30,13 @@ type Rar struct { Password string } -func (Rar) Name() string { return ".rar" } +func (Rar) Extension() string { return ".rar" } -func (r Rar) Match(filename string, stream io.Reader) (MatchResult, error) { +func (r Rar) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), r.Name()) { + if strings.Contains(strings.ToLower(filename), r.Extension()) { mr.ByName = true } @@ -104,7 +104,7 @@ func (r Rar) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchiv Header: hdr, NameInArchive: hdr.Name, Open: func() (fs.File, error) { - return archivedFile{io.NopCloser(rr), info}, nil + return fileInArchive{io.NopCloser(rr), info}, nil }, } diff --git a/sz.go b/sz.go index 9d10604a..8a926b7f 100644 --- a/sz.go +++ b/sz.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -15,13 +16,13 @@ func init() { // Sz facilitates Snappy compression. 
type Sz struct{} -func (sz Sz) Name() string { return ".sz" } +func (sz Sz) Extension() string { return ".sz" } -func (sz Sz) Match(filename string, stream io.Reader) (MatchResult, error) { +func (sz Sz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), sz.Name()) { + if strings.Contains(strings.ToLower(filename), sz.Extension()) { mr.ByName = true } diff --git a/tar.go b/tar.go index 37f81597..d4106257 100644 --- a/tar.go +++ b/tar.go @@ -26,13 +26,13 @@ type Tar struct { ContinueOnError bool } -func (Tar) Name() string { return ".tar" } +func (Tar) Extension() string { return ".tar" } -func (t Tar) Match(filename string, stream io.Reader) (MatchResult, error) { +func (t Tar) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), t.Name()) { + if strings.Contains(strings.ToLower(filename), t.Extension()) { mr.ByName = true } @@ -219,12 +219,18 @@ func (t Tar) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchiv NameInArchive: hdr.Name, LinkTarget: hdr.Linkname, Open: func() (fs.File, error) { - return archivedFile{io.NopCloser(tr), info}, nil + return fileInArchive{io.NopCloser(tr), info}, nil }, } err = handleFile(ctx, file) if errors.Is(err, fs.SkipAll) { + // At first, I wasn't sure if fs.SkipAll implied that the rest of the entries + // should still be iterated and just "skipped" (i.e. no-ops) or if the walk + // should stop; both have the same net effect, one is just less efficient... + // apparently the name of fs.StopWalk was the preferred name, but it still + // became fs.SkipAll because of semantics with documentation; see + // https://github.com/golang/go/issues/47209 -- anyway, the walk should stop. 
break } else if errors.Is(err, fs.SkipDir) { // if a directory, skip this path; if a file, skip the folder path diff --git a/xz.go b/xz.go index 4e1b6b41..edb61373 100644 --- a/xz.go +++ b/xz.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -16,13 +17,13 @@ func init() { // Xz facilitates xz compression. type Xz struct{} -func (Xz) Name() string { return ".xz" } +func (Xz) Extension() string { return ".xz" } -func (x Xz) Match(filename string, stream io.Reader) (MatchResult, error) { +func (x Xz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), x.Name()) { + if strings.Contains(strings.ToLower(filename), x.Extension()) { mr.ByName = true } diff --git a/zip.go b/zip.go index 0e79ed69..c012c080 100644 --- a/zip.go +++ b/zip.go @@ -83,13 +83,13 @@ type Zip struct { TextEncoding string } -func (z Zip) Name() string { return ".zip" } +func (z Zip) Extension() string { return ".zip" } -func (z Zip) Match(filename string, stream io.Reader) (MatchResult, error) { +func (z Zip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), z.Name()) { + if strings.Contains(strings.ToLower(filename), z.Extension()) { mr.ByName = true } @@ -228,7 +228,7 @@ func (z Zip) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchiv if err != nil { return nil, err } - return archivedFile{openedFile, info}, nil + return fileInArchive{openedFile, info}, nil }, } diff --git a/zlib.go b/zlib.go index 84275186..485991e6 100644 --- a/zlib.go +++ b/zlib.go @@ -1,6 +1,7 @@ package archiver import ( + "context" "io" "strings" @@ -16,13 +17,13 @@ type Zlib struct { CompressionLevel int } -func (Zlib) Name() string { return ".zz" } +func (Zlib) Extension() string { return ".zz" } -func (zz Zlib) Match(filename string, stream 
io.Reader) (MatchResult, error) { +func (zz Zlib) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), zz.Name()) { + if strings.Contains(strings.ToLower(filename), zz.Extension()) { mr.ByName = true } diff --git a/zstd.go b/zstd.go index fe07b76f..cd0c2814 100644 --- a/zstd.go +++ b/zstd.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -18,13 +19,13 @@ type Zstd struct { DecoderOptions []zstd.DOption } -func (Zstd) Name() string { return ".zst" } +func (Zstd) Extension() string { return ".zst" } -func (zs Zstd) Match(filename string, stream io.Reader) (MatchResult, error) { +func (zs Zstd) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), zs.Name()) { + if strings.Contains(strings.ToLower(filename), zs.Extension()) { mr.ByName = true }