Skip to content

Commit

Permalink
kyle-wip
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaost committed Jul 12, 2024
1 parent a73b0c9 commit 53467ce
Show file tree
Hide file tree
Showing 2 changed files with 169 additions and 0 deletions.
119 changes: 119 additions & 0 deletions container/strset/strset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package strmap

import (
"fmt"
"sort"
"strings"
)

// StrSet represents a GC friendly, read-only string set implementation.
//
// All strings are stored back to back in a single byte slice and are indexed
// by a slice of pointer-free items sorted by hash. The ID of a string is the
// byte offset at which it is stored, which matches this pseudo code:
//
//	off := 0
//	m := make(map[string]int)
//	for _, s := range ss {
//		if _, ok := m[s]; ok {
//			continue
//		}
//		m[s] = off
//		off += len(s)
//	}
//
// It only supports lookups through Get after the StrSet is created.
// It is useful for scenarios like `map[string]YourStruct` where YourStruct
// contains no pointer: change it to `map[int]YourStruct` and use a StrSet for
// the keys for better performance.
// You can also use it to replace `map[string]struct{}`.
type StrSet struct {
// data holds the bytes of every distinct string, concatenated in input order.
data []byte
// items describes each string stored in data; it is kept sorted by hash.
items []strItem
}

// strItem locates one string inside StrSet.data.
type strItem struct {
// off is the byte offset of the string in StrSet.data; Get returns it as the ID.
off int
// sz is the length of the string in bytes.
sz int
// hash is the fnvhashstr value of the string; StrSet.items is sorted by it.
hash uint32
}

// New creates a string set from ss.
//
// Duplicate strings are stored only once. The ID that Get later returns for a
// string is the byte offset of that string in the concatenation of the
// distinct strings, taken in input order. An empty string occupies no bytes,
// so it receives the same ID as the next distinct string, if there is one.
func New(ss []string) *StrSet {
	// Upper bound for the data size: duplicates are counted here but are not
	// stored below.
	sz := 0
	for _, s := range ss {
		sz += len(s)
	}
	b := make([]byte, 0, sz)
	// Append instead of indexing by input position: a skipped duplicate must
	// not leave a zero-valued item (off 0, sz 0, hash 0) behind in the table.
	items := make([]strItem, 0, len(ss))
	dup := make(map[string]struct{}, len(ss))
	for _, s := range ss {
		if _, ok := dup[s]; ok {
			continue
		}
		dup[s] = struct{}{}
		items = append(items, strItem{
			off:  len(b),
			sz:   len(s),
			hash: fnvhashstr(s),
		})
		b = append(b, s...)
	}
	ret := &StrSet{data: b, items: items}
	ret.sort()
	return ret
}

// sort orders the items by ascending hash.
func (m *StrSet) sort() {
	items := m.items
	byHash := func(a, b int) bool {
		return items[a].hash < items[b].hash
	}
	sort.Slice(items, byHash)
}

// Get looks up s and returns its ID, or -1 when s is not found.
// The ID is the byte offset at which s is stored in the set's data.
func (m *StrSet) Get(s string) int {
	want := fnvhashstr(s)

	// Lower-bound binary search: locate the first item whose hash is >= want.
	lo, hi := 0, len(m.items)
	for lo < hi {
		mid := int(uint(lo+hi) >> 1) // (lo+hi)/2, so lo <= mid < hi
		if m.items[mid].hash < want {
			lo = mid + 1
			continue
		}
		hi = mid
	}

	// Items with the same hash sit next to each other; walk that run and
	// compare the stored bytes to tell real matches from hash conflicts.
	for ; lo < len(m.items) && m.items[lo].hash == want; lo++ {
		it := &m.items[lo]
		if string(m.data[it.off:it.off+it.sz]) == s {
			return it.off
		}
	}
	return -1
}

// String renders every item in stored order as "{off, hash, str}" entries
// inside square brackets.
func (m *StrSet) String() string {
	var sb strings.Builder
	sb.WriteByte('[')
	for i := range m.items {
		if i > 0 {
			sb.WriteString(", ")
		}
		it := &m.items[i]
		str := string(m.data[it.off : it.off+it.sz])
		fmt.Fprintf(&sb, "{off:%d, hash:%x, str:%q}", it.off, it.hash, str)
	}
	sb.WriteByte(']')
	return sb.String()
}

// Offset basis and prime used by fnvhashstr.
const (
	fnvHashOffset32 = uint32(2166136261)
	fnvHashPrime32  = uint32(16777619)
)

// fnvhashstr returns a 32-bit hash of s. Starting from the offset basis, each
// byte is folded in by multiplying the running hash by the prime and then
// XORing the byte.
func fnvhashstr(s string) uint32 {
	sum := fnvHashOffset32
	for _, c := range []byte(s) {
		sum = (sum * fnvHashPrime32) ^ uint32(c)
	}
	return sum
}
50 changes: 50 additions & 0 deletions container/strset/strset_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package strmap

import (
"math/rand"
"testing"
"time"

"github.com/stretchr/testify/require"
)

// letters is the alphabet randString draws from.
var letters = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

// randString returns a string of n letters picked with the global math/rand
// source.
func randString(n int) string {
	out := make([]rune, 0, n)
	for len(out) < n {
		out = append(out, letters[rand.Intn(len(letters))])
	}
	return string(out)
}

// randStrings returns n random byte strings, each between 1 and m bytes long.
//
// The generator is seeded from the current time and the seed is printed, so a
// failing run can be reproduced by temporarily hard-coding the printed value.
// m must be positive, otherwise r.Intn panics.
func randStrings(m, n int) []string {
	seed := time.Now().UnixNano()
	println(seed)
	r := rand.New(rand.NewSource(seed))
	b := make([]byte, m)
	ret := make([]string, 0, n)
	for i := 0; i < n; i++ {
		_, _ = r.Read(b) // (*rand.Rand).Read always returns a nil error
		// string(...) copies the prefix, so reusing b on the next iteration is safe.
		ret = append(ret, string(b[:1+r.Intn(m)]))
	}
	return ret
}

// TestStrMap checks that Get returns, for every input string, the same ID as
// a plain map built by accumulating the lengths of the distinct strings.
func TestStrMap(t *testing.T) {
	ss := randStrings(5, 100)

	// Reference model: the ID of a string is the total length of the distinct
	// strings seen before its first occurrence.
	want := make(map[string]int)
	next := 0
	for _, s := range ss {
		if _, seen := want[s]; seen {
			continue
		}
		want[s] = next
		next += len(s)
	}

	set := New(ss)
	for i, s := range ss {
		require.Equal(t, want[s], set.Get(s), i)
	}
}

0 comments on commit 53467ce

Please sign in to comment.