Skip to content

Commit

Permalink
Replace (custom) base64 encoding of paths with simple quoting.
Browse files Browse the repository at this point in the history
  • Loading branch information
mjkw31 committed Oct 29, 2024
1 parent 6577e26 commit 725969f
Show file tree
Hide file tree
Showing 20 changed files with 318 additions and 364 deletions.
34 changes: 19 additions & 15 deletions cmd/stat.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,30 +37,34 @@ import (
"github.com/wtsi-ssg/wrstat/v5/summary"
)

const reportFrequency = 10 * time.Minute
const statOutputFileSuffix = ".stats"
const statUserGroupSummaryOutputFileSuffix = ".byusergroup"
const statGroupSummaryOutputFileSuffix = ".bygroup"
const statDGUTSummaryOutputFileSuffix = ".dgut"
const statLogOutputFileSuffix = ".log"
const lstatTimeout = 10 * time.Second
const lstatAttempts = 3

var statDebug bool
var statCh string
const (
reportFrequency = 10 * time.Minute
statOutputFileSuffix = ".stats"
statUserGroupSummaryOutputFileSuffix = ".byusergroup"
statGroupSummaryOutputFileSuffix = ".bygroup"
statDGUTSummaryOutputFileSuffix = ".dgut"
statLogOutputFileSuffix = ".log"
lstatTimeout = 10 * time.Second
lstatAttempts = 3
)

var (
statDebug bool
statCh string
)

// statCmd represents the stat command.
var statCmd = &cobra.Command{
Use: "stat",
Short: "Stat paths",
Long: `Stat paths in a given file.
Given a file containing a base64 encoded absolute file path per line (eg. as
produced by 'wrstat walk'), this creates a new file with stats for each of those
file paths. The new file is named after the input file with a ".stats" suffix.
Given a file containing a quoted absolute file path per line (eg. as produced
by 'wrstat walk'), this creates a new file with stats for each of those file
paths. The new file is named after the input file with a ".stats" suffix.
The output file format is 11 tab separated columns with the following contents:
1. Base64 encoded path to the file.
1. Quoted path to the file.
2. File size in bytes. If this is greater than the number of bytes in blocks
allocated, this will be the number of bytes in allocated blocks. (This is to
account for files with holes in them; as a byproduct, symbolic links will
Expand Down
26 changes: 14 additions & 12 deletions cmd/walk.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,14 @@ const (
)

// options for this cmd.
var outputDir string
var depGroup string
var walkInodesPerJob int
var walkNumOfJobs int
var walkID string
var walkCh string
var (
outputDir string
depGroup string
walkInodesPerJob int
walkNumOfJobs int
walkID string
walkCh string
)

// walkCmd represents the walk command.
var walkCmd = &cobra.Command{
Expand All @@ -68,11 +70,10 @@ user that can sudo without a password when running wrstat, and supply the --sudo
option to this command.
For each entry recursively within the directory of interest, their paths are
quickly retrieved (without doing any expensive stat calls) and written (base64
encoded) to output files in the given output directory. The number of files is
such that they will each contain about --inodes_per_stat entries (or if
--num_stats was supplied greater than zero, then there will be that number of
output files).
quickly retrieved (without doing any expensive stat calls) and written (quoted)
to output files in the given output directory. The number of files is such that
they will each contain about --inodes_per_stat entries (or if --num_stats was
supplied greater than zero, then there will be that number of output files).
For each output file, a 'wrstat stat' job is then added to wr's queue with the
given dependency group. For the meaning of the --ch option which is passed
Expand Down Expand Up @@ -149,7 +150,8 @@ func statRepGrp(dir, unique string) string {

// walkDirAndScheduleStats does the main work.
func walkDirAndScheduleStats(desiredDir, outputDir string, statJobs, inodes int, depGroup, repGroup,
yamlPath string, s *scheduler.Scheduler) {
yamlPath string, s *scheduler.Scheduler,
) {
n := statJobs
if n == 0 {
n = calculateSplitBasedOnInodes(inodes, desiredDir)
Expand Down
9 changes: 5 additions & 4 deletions combine/dgut_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"testing"

. "github.com/smartystreets/goconvey/convey"
"github.com/wtsi-ssg/wrstat/v5/dgut"
"github.com/wtsi-ssg/wrstat/v5/fs"
"github.com/wtsi-ssg/wrstat/v5/internal/encode"
"github.com/wtsi-ssg/wrstat/v5/summary"
)

Expand Down Expand Up @@ -116,13 +116,14 @@ func buildDGUTFiles(t *testing.T) ([]string, string, string, string) {
// /lustre 1313 13912 0 1 0 1668768807
// /lustre/scratch123 1313 13912 0 1 0 1668768807.
func buildDGUTContent(directory, gid, uid string, filetype, nestedFiles,
fileSize, oldestAtime, newestAtime int) string {
fileSize, oldestAtime, newestAtime int,
) string {
var DGUTContents string

splitDir := recursivePath(directory)

for _, split := range splitDir {
DGUTContents += encode.Base64Encode(split) + fmt.Sprintf("\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
DGUTContents += strconv.Quote(split) + fmt.Sprintf("\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
gid, uid, filetype, nestedFiles, fileSize, oldestAtime, newestAtime)
}

Expand All @@ -137,7 +138,7 @@ func recursivePath(path string) []string {
count := strings.Count(path, "/")
newPath := path

var DGUTContents = make([]string, count+1)
DGUTContents := make([]string, count+1)
DGUTContents[count] = path

for i := count - 1; i >= 0; i-- {
Expand Down
7 changes: 3 additions & 4 deletions combine/stat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ import (
"fmt"
"os"
"path/filepath"
"strconv"
"testing"

. "github.com/smartystreets/goconvey/convey"
"github.com/wtsi-ssg/wrstat/v5/fs"
"github.com/wtsi-ssg/wrstat/v5/internal/encode"
)

// TestStatFiles tests that the stat files concatenate and compress properly.
Expand All @@ -53,7 +53,7 @@ func TestStatFiles(t *testing.T) {
actualContent, err := fs.ReadCompressedFile(outputPath)
So(err, ShouldBeNil)

encodedDir := encode.Base64Encode(dir)
encodedDir := strconv.Quote(dir)

expectedOutput := fmt.Sprintf(
"%s\t5\t345\t152\t217434\t82183\t147\t'f'\t3\t7\t28472\t\n"+
Expand Down Expand Up @@ -82,7 +82,7 @@ func buildStatFiles(t *testing.T) (string, []*os.File, *os.File, string) {

_, err = f.WriteString(fmt.Sprintf(
"%s\t%d\t%d\t%d\t%d\t%d\t%d\t%q\t%d\t%d\t%d\t\n",
encode.Base64Encode(dir),
strconv.Quote(dir),
5+i,
345,
152,
Expand All @@ -93,7 +93,6 @@ func buildStatFiles(t *testing.T) (string, []*os.File, *os.File, string) {
3+i,
7,
28472))

if err != nil {
t.Fatal(err)
}
Expand Down
41 changes: 23 additions & 18 deletions dgut/dgut_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,48 +28,48 @@ package dgut
import (
"math"
"os"
"strconv"
"strings"
"testing"

. "github.com/smartystreets/goconvey/convey"
"github.com/ugorji/go/codec"
internaldata "github.com/wtsi-ssg/wrstat/v5/internal/data"
"github.com/wtsi-ssg/wrstat/v5/internal/encode"
"github.com/wtsi-ssg/wrstat/v5/summary"
bolt "go.etcd.io/bbolt"
)

func TestDGUT(t *testing.T) {
Convey("You can parse a single line of dgut data", t, func() {
line := encode.Base64Encode("/") + "\t1\t101\t0\t3\t30\t50\t50\n"
line := strconv.Quote("/") + "\t1\t101\t0\t3\t30\t50\t50\n"
dir, gut, err := parseDGUTLine(line)
So(err, ShouldBeNil)
So(dir, ShouldEqual, "/")
So(gut, ShouldResemble, &GUT{GID: 1, UID: 101, FT: 0, Count: 3, Size: 30, Atime: 50, Mtime: 50})

Convey("But invalid data won't parse", func() {
_, _, err = parseDGUTLine(encode.Base64Encode("/") + "\t1\t101\t0\t3\t50\t50\n")
_, _, err = parseDGUTLine(strconv.Quote("/") + "\t1\t101\t0\t3\t50\t50\n")
So(err, ShouldEqual, ErrInvalidFormat)

_, _, err = parseDGUTLine(encode.Base64Encode("/") + "\tfoo\t101\t0\t3\t30\t50\t50\n")
_, _, err = parseDGUTLine(strconv.Quote("/") + "\tfoo\t101\t0\t3\t30\t50\t50\n")
So(err, ShouldEqual, ErrInvalidFormat)

_, _, err = parseDGUTLine(encode.Base64Encode("/") + "\t1\tfoo\t0\t3\t30\t50\t50\n")
_, _, err = parseDGUTLine(strconv.Quote("/") + "\t1\tfoo\t0\t3\t30\t50\t50\n")
So(err, ShouldEqual, ErrInvalidFormat)

_, _, err = parseDGUTLine(encode.Base64Encode("/") + "\t1\t101\tfoo\t3\t30\t50\t50\n")
_, _, err = parseDGUTLine(strconv.Quote("/") + "\t1\t101\tfoo\t3\t30\t50\t50\n")
So(err, ShouldEqual, ErrInvalidFormat)

_, _, err = parseDGUTLine(encode.Base64Encode("/") + "\t1\t101\t0\tfoo\t30\t50\t50\n")
_, _, err = parseDGUTLine(strconv.Quote("/") + "\t1\t101\t0\tfoo\t30\t50\t50\n")
So(err, ShouldEqual, ErrInvalidFormat)

_, _, err = parseDGUTLine(encode.Base64Encode("/") + "\t1\t101\t0\t3\tfoo\t50\t50\n")
_, _, err = parseDGUTLine(strconv.Quote("/") + "\t1\t101\t0\t3\tfoo\t50\t50\n")
So(err, ShouldEqual, ErrInvalidFormat)

_, _, err = parseDGUTLine(encode.Base64Encode("/") + "\t1\t101\t0\t3\t30\tfoo\t50\n")
_, _, err = parseDGUTLine(strconv.Quote("/") + "\t1\t101\t0\t3\t30\tfoo\t50\n")
So(err, ShouldEqual, ErrInvalidFormat)

_, _, err = parseDGUTLine(encode.Base64Encode("/") + "\t1\t101\t0\t3\t30\t50\tfoo\n")
_, _, err = parseDGUTLine(strconv.Quote("/") + "\t1\t101\t0\t3\t30\t50\tfoo\n")
So(err, ShouldEqual, ErrInvalidFormat)

So(err.Error(), ShouldEqual, "the provided data was not in dgut format")
Expand Down Expand Up @@ -147,8 +147,10 @@ func TestDGUT(t *testing.T) {

expectedUIDs := []uint32{101, 102}
expectedGIDs := []uint32{1, 2}
expectedFTs := []summary.DirGUTFileType{summary.DGUTFileTypeTemp,
summary.DGUTFileTypeBam, summary.DGUTFileTypeCram, summary.DGUTFileTypeDir}
expectedFTs := []summary.DirGUTFileType{
summary.DGUTFileTypeTemp,
summary.DGUTFileTypeBam, summary.DGUTFileTypeCram, summary.DGUTFileTypeDir,
}

const numDirectories = 10

Expand Down Expand Up @@ -321,7 +323,8 @@ func TestDGUT(t *testing.T) {
c, s, a, m, u, g, t, _, errd = db.DirInfo("/", &Filter{
GIDs: []uint32{1},
UIDs: []uint32{102},
FTs: []summary.DirGUTFileType{summary.DGUTFileTypeTemp}})
FTs: []summary.DirGUTFileType{summary.DGUTFileTypeTemp},
})
So(errd, ShouldBeNil)
So(c, ShouldEqual, 0)
So(s, ShouldEqual, 0)
Expand Down Expand Up @@ -381,9 +384,9 @@ func TestDGUT(t *testing.T) {
})

Convey("Store()ing multiple times", func() {
data = strings.NewReader(encode.Base64Encode("/") + "\t3\t103\t7\t2\t2\t25\t25\n" +
encode.Base64Encode("/a/i") + "\t3\t103\t7\t1\t1\t25\t25\n" +
encode.Base64Encode("/i") + "\t3\t103\t7\t1\t1\t30\t30\n")
data = strings.NewReader(strconv.Quote("/") + "\t3\t103\t7\t2\t2\t25\t25\n" +
strconv.Quote("/a/i") + "\t3\t103\t7\t1\t1\t25\t25\n" +
strconv.Quote("/i") + "\t3\t103\t7\t1\t1\t30\t30\n")

Convey("to the same db file doesn't work", func() {
err = db.Store(data, 4)
Expand Down Expand Up @@ -618,8 +621,10 @@ func testData(t *testing.T) (dgutData string, expectedRootGUTs GUTs, expected []
},
}

expectedKeys = []string{"/", "/a", "/a/b", "/a/b/d", "/a/b/d/f",
"/a/b/d/g", "/a/b/e", "/a/b/e/h", "/a/b/e/h/tmp", "/a/c", "/a/c/d"}
expectedKeys = []string{
"/", "/a", "/a/b", "/a/b/d", "/a/b/d/f",
"/a/b/d/g", "/a/b/e", "/a/b/e/h", "/a/b/e/h/tmp", "/a/c", "/a/c/d",
}

return dgutData, expectedRootGUTs, expected, expectedKeys
}
Expand Down
9 changes: 5 additions & 4 deletions dgut/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,17 @@ import (
"strconv"
"strings"

"github.com/wtsi-ssg/wrstat/v5/internal/encode"
"github.com/wtsi-ssg/wrstat/v5/summary"
)

type Error string

func (e Error) Error() string { return string(e) }

const ErrInvalidFormat = Error("the provided data was not in dgut format")
const ErrBlankLine = Error("the provided line had no information")
const (
ErrInvalidFormat = Error("the provided data was not in dgut format")
ErrBlankLine = Error("the provided line had no information")
)

const (
gutDataCols = 8
Expand Down Expand Up @@ -114,7 +115,7 @@ func parseDGUTLine(line string) (string, *GUT, error) {
return "", nil, ErrBlankLine
}

path, err := encode.Base64Decode(parts[0])
path, err := strconv.Unquote(parts[0])
if err != nil {
return "", nil, err
}
Expand Down
Loading

0 comments on commit 725969f

Please sign in to comment.