From 5ca89020d99e76c99c3788292365e34be1f3787f Mon Sep 17 00:00:00 2001
From: Matthew Holt <mholt@users.noreply.github.com>
Date: Thu, 31 Oct 2024 15:20:54 -0600
Subject: [PATCH] More WIP

---
 archiver.go |   6 ++-
 fs.go       | 151 ++++++++++++++++++++++++++++++++--------------------
 fs_test.go  |   4 +-
 3 files changed, 100 insertions(+), 61 deletions(-)

diff --git a/archiver.go b/archiver.go
index a6c96d44..cc74af47 100644
--- a/archiver.go
+++ b/archiver.go
@@ -28,6 +28,10 @@ type FileInfo struct {
 	// format-agnosticism (no type assertions) for basic
 	// operations.
 	//
+	// When extracting, this name or path may not have
+	// been sanitized; it should not be trusted at face
+	// value. Consider using path.Clean() before using it.
+	//
 	// EXPERIMENTAL: If inserting a file into an archive,
 	// and this is left blank, the implementation of the
 	// archive format can default to using the file's base
@@ -214,7 +218,7 @@ type FromDiskOptions struct {
 // archive contents are not necessarily ordered, skipping directories requires
 // memory, and skipping lots of directories may run up your memory bill.
 //
-// Any other returned error will terminate a walk.
+// Any other returned error will terminate a walk and be returned to the caller.
 type FileHandler func(ctx context.Context, f FileInfo) error
 
 // openAndCopyFile opens file for reading, copies its
diff --git a/fs.go b/fs.go
index 701ec61f..c5f25bda 100644
--- a/fs.go
+++ b/fs.go
@@ -9,6 +9,7 @@ import (
 	"os"
 	"path"
 	"path/filepath"
+	"slices"
 	"sort"
 	"strings"
 	"time"
@@ -58,7 +59,7 @@ func FileSystem(ctx context.Context, root string) (fs.FS, error) {
 
 	switch fileFormat := format.(type) {
 	case Extractor:
-		return ArchiveFS{Path: root, Format: fileFormat, Context: ctx}, nil
+		return &ArchiveFS{Path: root, Format: fileFormat, Context: ctx}, nil
 
 	// case Zip:
 	// 	// zip.Reader is more performant than ArchiveFS, because zip.Reader caches content information
@@ -210,18 +211,11 @@ type ArchiveFS struct {
 	Prefix  string          // optional subdirectory in which to root the fs
 	Context context.Context // optional
 
-	contents map[string][]fs.DirEntry
+	// TODO: probably put a mutex in here; the thing has to be a pointer to compile anyway
+	contents map[string]FileInfo
+	dirs     map[string][]fs.DirEntry
 }
 
-type fsIndex struct {
-}
-
-// type archiveEntry struct {
-// 	path  string
-// 	entry fs.DirEntry
-// 	info  fs.FileInfo
-// }
-
 // context always return a context, preferring f.Context if not nil.
 func (f ArchiveFS) context() context.Context {
 	if f.Context != nil {
@@ -232,7 +226,7 @@ func (f ArchiveFS) context() context.Context {
 
 // Open opens the named file from within the archive. If name is "." then
 // the archive file itself will be opened as a directory file.
-func (f ArchiveFS) Open(name string) (fs.File, error) {
+func (f *ArchiveFS) Open(name string) (fs.File, error) {
 	if !fs.ValidPath(name) {
 		return nil, &fs.PathError{Op: "open", Path: name, Err: fs.ErrInvalid}
 	}
@@ -252,8 +246,8 @@ func (f ArchiveFS) Open(name string) (fs.File, error) {
 				archiveFile.Close()
 			}
 		}()
-	} else if f.Stream != nil {
-		archiveFile = fakeArchiveFile{}
+	} else if f.Stream == nil {
+		return nil, fmt.Errorf("no input; one of Path or Stream must be set")
 	}
 
 	// apply prefix if fs is rooted in a subtree
@@ -448,7 +442,7 @@ func openReadDir(dir string, entries []FileInfo) []fs.DirEntry {
 
 // Stat stats the named file from within the archive. If name is "." then
 // the archive file itself is statted and treated as a directory file.
-func (f ArchiveFS) Stat(name string) (fs.FileInfo, error) {
+func (f *ArchiveFS) Stat(name string) (fs.FileInfo, error) {
 	if !fs.ValidPath(name) {
 		return nil, &fs.PathError{Op: "stat", Path: name, Err: fs.ErrInvalid}
 	}
@@ -521,20 +515,23 @@ func (f ArchiveFS) Stat(name string) (fs.FileInfo, error) {
 // we call Open for each file, it's exponentially slow! Can we potentially add memory or something?
 
 // ReadDir reads the named directory from within the archive.
-func (f ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) {
+func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) {
 	if !fs.ValidPath(name) {
 		return nil, &fs.PathError{Op: "readdir", Path: name, Err: fs.ErrInvalid}
 	}
 
-	// fs.WalkDir calls ReadDir() once per directory, and for archives with
+	// fs.WalkDir() calls ReadDir() once per directory, and for archives with
 	// lots of directories, that is very slow, since we have to traverse the
-	// entire archive in order to know that we got all the entries for a
+	// entire archive in order to ensure that we got all the entries for a
 	// directory -- so we can fast-track this lookup if we've done the
 	// traversal already
-	if len(f.contents) > 0 {
-		return f.contents[name], nil
+	if len(f.dirs) > 0 {
+		return f.dirs[name], nil
 	}
 
+	f.contents = make(map[string]FileInfo)
+	f.dirs = make(map[string][]fs.DirEntry)
+
 	var archiveFile *os.File
 	var err error
 	if f.Stream == nil {
@@ -548,25 +545,63 @@ func (f ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) {
 	// apply prefix if fs is rooted in a subtree
 	name = path.Join(f.Prefix, name)
 
-	// collect all files with prefix
-	var (
-		files     []FileInfo
-		foundFile bool
-	)
 	handler := func(_ context.Context, file FileInfo) error {
-		file.NameInArchive = path.Clean(file.NameInArchive) // TODO: Should we always just "Clean" this inside Extract() for everyone?
+		// can't always trust path names
+		file.NameInArchive = path.Clean(file.NameInArchive)
 
-		// Apparently, creating a tar file in the target directory may result
-		// in an entry called "." in the archive; avoid infinite walk. see #384
+		// avoid infinite walk; apparently, creating a tar file in the target
+		// directory may result in an entry called "." in the archive; see #384
 		if file.NameInArchive == "." {
 			return nil
 		}
 
-		files = append(files, file)
+		// if the name being requested isn't a directory, return an error similar to
+		// what most OSes return from the readdir system call when given a non-dir
 		if file.NameInArchive == name && !file.IsDir() {
-			foundFile = true
-			return errStopWalk
+			return &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a directory")}
+		}
+
+		// index this file info for quick access
+		f.contents[file.NameInArchive] = file
+
+		// this is a real directory; prefer its DirEntry over an implicit/fake one we may have created earlier;
+		// first try to find if it exists, and if so, replace the value; otherwise insert it in sorted position
+		if file.IsDir() {
+			dirEntry := fs.FileInfoToDirEntry(file)
+			idx, found := slices.BinarySearchFunc(f.dirs[path.Dir(file.NameInArchive)], dirEntry, func(a, b fs.DirEntry) int {
+				return strings.Compare(a.Name(), b.Name())
+			})
+			if found {
+				f.dirs[path.Dir(file.NameInArchive)][idx] = dirEntry
+			} else {
+				f.dirs[path.Dir(file.NameInArchive)] = slices.Insert(f.dirs[path.Dir(file.NameInArchive)], idx, dirEntry)
+			}
+		}
+
+		// this loop looks like an abomination, but it's really quite simple: we're
+		// just iterating the directories of the path up to the root; i.e. we lop off
+		// the base (last component) of the path until no separators remain, i.e. only
+		// one component remains -- then loop again to make sure it's not a duplicate
+		for dir, base := path.Dir(file.NameInArchive), path.Base(file.NameInArchive); ; dir, base = path.Dir(dir), path.Base(dir) {
+			var dirInfo fs.DirEntry = implicitDirInfo{implicitDirEntry{base}}
+
+			// we are "filling in" any directories that could potentially be only implicit,
+			// and since a nested directory can have more than 1 item, we need to prevent
+			// duplication; for example: given a/b/c and a/b/d, we need to avoid adding
+			// an entry for "b" twice within "a" -- hence we search for it first, and if
+			// it doesn't already exist, we insert it in sorted position
+			idx, found := slices.BinarySearchFunc(f.dirs[dir], dirInfo, func(a, b fs.DirEntry) int {
+				return strings.Compare(a.Name(), b.Name())
+			})
+			if !found {
+				f.dirs[dir] = slices.Insert(f.dirs[dir], idx, dirInfo)
+			}
+
+			if dir == "." {
+				break
+			}
 		}
+
 		return nil
 	}
 
@@ -582,29 +617,34 @@ func (f ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) {
 	}
 
 	err = f.Format.Extract(f.context(), inputStream, filter, handler)
-	if foundFile {
-		return nil, &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a dir")}
-	}
 	if err != nil {
+		// these being non-nil implies that we have indexed the archive,
+		// but if an error occurred, we likely only got part of the way
+		// through and our index is incomplete, and we'd have to re-walk
+		// the whole thing anyway; so reset these to nil to avoid bugs
+		f.dirs = nil
+		f.contents = nil
 		return nil, err
 	}
 
-	// always find all implicit directories
-	files = fillImplicit(files)
-	// and return early for dot file
-	if name == "." {
-		return openReadDir(name, files), nil
-	}
+	// // always find all implicit directories
+	// files = fillImplicit(files)
+	// // and return early for dot file
+	// if name == "." {
+	// 	return openReadDir(name, files), nil
+	// }
 
-	file, foundFile := search(name, files)
-	if !foundFile {
-		return nil, fs.ErrNotExist
-	}
+	// file, foundFile := search(name, files)
+	// if !foundFile {
+	// 	return nil, fs.ErrNotExist
+	// }
 
-	if !file.IsDir() {
-		return nil, &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a dir")}
-	}
-	return openReadDir(name, files), nil
+	// if !file.IsDir() {
+	// 	return nil, &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a dir")}
+	// }
+	// return openReadDir(name, files), nil
+
+	return f.dirs[name], nil
 }
 
 // Sub returns an FS corresponding to the subtree rooted at dir.
@@ -619,6 +659,11 @@ func (f *ArchiveFS) Sub(dir string) (fs.FS, error) {
 	if !info.IsDir() {
 		return nil, fmt.Errorf("%s is not a directory", dir)
 	}
+	// result is the same as what we're starting with, except
+	// we indicate a path prefix to be used for all operations;
+	// the reason we don't append to the Path field directly
+	// is because the input might be a stream rather than a
+	// path on disk, and the Prefix field is applied on both
 	result := f
 	result.Prefix = dir
 	return result, nil
@@ -695,16 +740,6 @@ func pathWithoutTopDir(fpath string) string {
 // traversed during the walk.
 var errStopWalk = fmt.Errorf("stop walk")
 
-type fakeArchiveFile struct{}
-
-func (f fakeArchiveFile) Stat() (fs.FileInfo, error) {
-	return implicitDirInfo{
-		implicitDirEntry{name: "."},
-	}, nil
-}
-func (f fakeArchiveFile) Read([]byte) (int, error) { return 0, io.EOF }
-func (f fakeArchiveFile) Close() error             { return nil }
-
 // dirFile implements the fs.ReadDirFile interface.
 type dirFile struct {
 	extractedFile
diff --git a/fs_test.go b/fs_test.go
index 9180fbf3..bbb56b55 100644
--- a/fs_test.go
+++ b/fs_test.go
@@ -64,7 +64,7 @@ func TestSelfTar(t *testing.T) {
 	if err != nil {
 		t.Fatalf("Could not stat test tar: %v", fn)
 	}
-	fsys := ArchiveFS{
+	fsys := &ArchiveFS{
 		Stream: io.NewSectionReader(fh, 0, fstat.Size()),
 		Format: Tar{},
 	}
@@ -83,7 +83,7 @@ func TestSelfTar(t *testing.T) {
 }
 
 func ExampleArchiveFS_Stream() {
-	fsys := ArchiveFS{
+	fsys := &ArchiveFS{
 		Stream: io.NewSectionReader(bytes.NewReader(testZIP), 0, int64(len(testZIP))),
 		Format: Zip{},
 	}