From bb3cf93479773cb3660695c74746fef739714b49 Mon Sep 17 00:00:00 2001
From: Trenton
Date: Wed, 6 Dec 2023 01:46:41 -0500
Subject: [PATCH] extract

---
 bwt/bwt.go      | 57 ++++++++++++++++++++++++++--
 bwt/bwt_test.go | 96 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 125 insertions(+), 28 deletions(-)

diff --git a/bwt/bwt.go b/bwt/bwt.go
index 45875b05..c17716d6 100644
--- a/bwt/bwt.go
+++ b/bwt/bwt.go
@@ -39,6 +39,48 @@ func (bwt BWT) Locate(pattern string) []int {
 	return offsets
 }
 
+// Extract returns the bytes of the original sequence in the half-open range
+// [start, end). An empty string is returned when start >= end.
+//
+// It panics if end is greater than the length reported by
+// getLenOfOriginalString, or if a non-empty range begins before position 0.
+//
+// Each extracted byte currently costs a linear scan of the suffix array (see
+// reverseCharacterLookup).
+//
+// TODO: do we want to ignore the $?
+func (bwt BWT) Extract(start, end int) string {
+	if end > bwt.getLenOfOriginalString() {
+		panic("bwt: Extract end is past the end of the original sequence")
+	}
+	if start >= end {
+		return ""
+	}
+	if start < 0 {
+		panic("bwt: Extract start must not be negative")
+	}
+
+	var strB strings.Builder
+	strB.Grow(end - start)
+	for i := start; i < end; i++ {
+		fPos := bwt.reverseCharacterLookup(i)
+		skip := bwt.lookupSkipByOffset(fPos)
+		strB.WriteByte(skip.char)
+	}
+	return strB.String()
+}
+
+// reverseCharacterLookup returns the index in the suffix array whose entry
+// equals originalPos. It panics if no entry matches.
+func (bwt BWT) reverseCharacterLookup(originalPos int) int {
+	for i := range bwt.suffixArray {
+		if bwt.suffixArray[i] == originalPos {
+			return i
+		}
+	}
+	panic("bwt: position not found in suffix array")
+}
+
 func (bwt BWT) lfSearch(pattern string) interval {
 	searchRange := interval{start: 0, end: bwt.getLenOfOriginalString()}
 	for i := 0; i < len(pattern); i++ {
@@ -47,7 +89,7 @@ func (bwt BWT) lfSearch(pattern string) interval {
 		}
 
 		c := pattern[len(pattern)-1-i]
-		skip, ok := bwt.lookupSkip(c)
+		skip, ok := bwt.lookupSkipByChar(c)
 		if !ok {
 			return interval{}
 		}
@@ -57,7 +99,7 @@ func (bwt BWT) lfSearch(pattern string) interval {
 	return searchRange
 }
 
-func (bwt BWT) lookupSkip(c byte) (entry skipEntry, ok bool) {
+func (bwt BWT) lookupSkipByChar(c byte) (entry skipEntry, ok bool) {
 	for i := range bwt.skipList {
 		if bwt.skipList[i].char == c {
 			return bwt.skipList[i], true
@@ -66,6 +108,17 @@ func (bwt BWT) lookupSkip(c byte) (entry skipEntry, ok bool) {
 	return skipEntry{}, false
 }
 
+// lookupSkipByOffset returns the skip list entry whose open-ended interval
+// contains offset. It panics if no entry's interval contains it.
+func (bwt BWT) lookupSkipByOffset(offset int) skipEntry {
+	for i := range bwt.skipList {
+		if bwt.skipList[i].openEndedInterval.start <= offset && offset < bwt.skipList[i].openEndedInterval.end {
+			return bwt.skipList[i]
+		}
+	}
+	panic("bwt: offset is not covered by any skip list entry")
+}
+
 func (bwt BWT) getLenOfOriginalString() int {
 	return bwt.skipList[len(bwt.skipList)-1].openEndedInterval.end
 }
diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go
index fd3d2cda..ca58beca 100644
--- a/bwt/bwt_test.go
+++ b/bwt/bwt_test.go
@@ -77,36 +77,80 @@ func TestBWT_Locate(t *testing.T) {
 	}
 }
 
-func BenchmarkBWTBuildPower12(b *testing.B) {
-	base := "!BANANA!"
-	BaseBenchmarkBWTBuild(base, 12, b)
+type BWTExtractTestCase struct {
+	start    int
+	end      int
+	expected string
 }
 
-//go:noinline
-func BaseBenchmarkBWTBuild(base string, power int, b *testing.B) {
-	for n := 0; n < b.N; n++ {
-		buildBWTForBench(base, power)
-	}
-}
-
-func buildBWTForBench(base string, power int) BWT {
-	test := base
-	for i := 0; i < power; i++ {
-		test += test
-	}
+func TestBWT_Extract(t *testing.T) {
+	baseTestStr := "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" // len == 112
+	testStr := strings.Join([]string{baseTestStr, baseTestStr, baseTestStr}, "")
 
-	return New(test)
-}
+	bwt := New(testStr)
 
-func BenchmarkBWTQueryPower12(b *testing.B) {
-	base := "!BANANA!"
-	bwt := buildBWTForBench(base, 12)
-	BaseBenchmarkBWTQuery(bwt, "ANANABANANA", b)
-}
+	testTable := []BWTExtractTestCase{
+		{4, 8, "uick"},
+		{117, 121, "uick"},
+		{230, 234, "uick"},
+		{0, 3, "the"},
+		{25, 28, "the"},
+		{113, 116, "the"},
+		{138, 141, "the"},
+		{226, 229, "the"},
+		{251, 254, "the"},
+		{21, 25, "over"},
+		{41, 45, "over"},
+		{134, 138, "over"},
+		{154, 158, "over"},
+		{247, 251, "over"},
+		{267, 271, "over"},
+		{10, 13, "own"},
+		{48, 51, "own"},
+		{106, 109, "own"},
+		{123, 126, "own"},
+		{161, 164, "own"},
+		{219, 222, "own"},
+		{223, 226, "own"},
+		{236, 239, "own"},
+		{274, 277, "own"},
+		{332, 335, "own"},
+		{336, 339, "own"},
+		{87, 90, "ana"},
+		{89, 92, "ana"},
+		{200, 203, "ana"},
+		{202, 205, "ana"},
+		{313, 316, "ana"},
+		{315, 318, "ana"},
+		{39, 41, "an"},
+		{87, 89, "an"},
+		{152, 154, "an"},
+		{200, 202, "an"},
+		{202, 204, "an"},
+		{265, 267, "an"},
+		{313, 315, "an"},
+		{50, 52, "na"},
+		{88, 90, "na"},
+		{163, 165, "na"},
+		{201, 203, "na"},
+		{203, 205, "na"},
+		{276, 278, "na"},
+		{314, 316, "na"},
+		{316, 318, "na"},
+		{9, 13, "rown"},
+		{47, 51, "rown"},
+		{122, 126, "rown"},
+		{160, 164, "rown"},
+		{235, 239, "rown"},
+		{273, 277, "rown"},
+		{109, 116, "townthe"},
+		{222, 229, "townthe"},
+	}
 
-//go:noinline
-func BaseBenchmarkBWTQuery(bwt BWT, seq string, b *testing.B) {
-	for n := 0; n < b.N; n++ {
-		bwt.Count(seq)
+	for _, v := range testTable {
+		str := bwt.Extract(v.start, v.end)
+		if str != v.expected {
+			t.Fatalf("extractRange=(%d, %d) expected=%s actual=%s", v.start, v.end, v.expected, str)
+		}
 	}
 }