From 8c98c3933c5a6c807f4d60faece945ea75c4dfe6 Mon Sep 17 00:00:00 2001 From: Kyle Xiao Date: Fri, 12 Jul 2024 18:43:01 +0800 Subject: [PATCH] feat: new strmap --- container/strmap/strmap.go | 180 ++++++++++++++++++++++++++++++++ container/strmap/strmap_test.go | 115 ++++++++++++++++++++ 2 files changed, 295 insertions(+) create mode 100644 container/strmap/strmap.go create mode 100644 container/strmap/strmap_test.go diff --git a/container/strmap/strmap.go b/container/strmap/strmap.go new file mode 100644 index 0000000..d9bf5da --- /dev/null +++ b/container/strmap/strmap.go @@ -0,0 +1,180 @@ +package strmap + +import ( + "fmt" + "sort" + "strings" +) + +// StrMap represents GC friendly string map implementation. +// it's readonly after it's created +type StrMap struct { + data []byte + items []mapItem + + hashtable []int +} + +type mapItem struct { + off int + sz int + slot int + v uintptr +} + +// New creates StrMap from map[string]uintptr +// uintptr can be any value and it will be returned by Get. +func New(m map[string]uintptr) *StrMap { + sz := 0 + for k, _ := range m { + sz += len(k) + } + b := make([]byte, 0, sz) + items := make([]mapItem, 0, len(m)) + for k, v := range m { + items = append(items, mapItem{off: len(b), sz: len(k), slot: int(fnvhashstr(k)), v: v}) + b = append(b, k...) + } + ret := &StrMap{data: b, items: items} + ret.makeHashtable() + return ret +} + +// Len returns the size of map +func (m *StrMap) Len() int { + return len(m.items) +} + +func (m *StrMap) makeHashtable() { + slots := getHashtableSlots(len(m.items)) + m.hashtable = make([]int, slots) + + for i := range m.items { + m.items[i].slot = m.items[i].slot % slots + } + + // make sure items with the same slot stored together + // good for cpu cache + sort.Slice(m.items, func(i, j int) bool { + return m.items[i].slot < m.items[j].slot + }) + + for i := 0; i < len(m.hashtable); i++ { + m.hashtable[i] = -1 + } + for i := range m.items { + e := &m.items[i] + if m.hashtable[e.slot] < 0 { + // we only need to store the 1st item if hash conflict + // since they're already stored together + // will check the next item when Get + m.hashtable[e.slot] = i + } + } +} + +// Get ... +func (m *StrMap) Get(s string) (uintptr, bool) { + slot := int(fnvhashstr(s)) % len(m.hashtable) + i := m.hashtable[slot] + if i < 0 { + return 0, false + } + e := &m.items[i] + for { + if string(m.data[e.off:e.off+e.sz]) == s { // double check + return e.v, true + } + i++ + if i >= len(m.items) { + break + } + e = &m.items[i] + if e.slot != slot { + break + } + } + return 0, false +} + +func (m *StrMap) String() string { + b := &strings.Builder{} + b.WriteByte('[') + for i, e := range m.items { + if i != 0 { + b.WriteString(", ") + } + fmt.Fprintf(b, "{off:%d, slot:%x, str:%q}", e.off, e.slot, string(m.data[e.off:e.off+e.sz])) + } + b.WriteByte(']') + return b.String() +} + +const ( + fnvHashOffset32 = uint32(2166136261) + fnvHashPrime32 = uint32(16777619) +) + +func fnvhashstr(s string) uint32 { + h := fnvHashOffset32 + for i := 0; i < len(s); i++ { + h *= fnvHashPrime32 + h ^= uint32(s[i]) + } + return h +} + +var bits2primes = []int{ + 0: 17, // 1 + 1: 17, // 2 + 2: 17, // 4 + 3: 17, // 8 + 4: 17, // at least 17 for <= 16 + 5: 31, // 32 + 6: 61, // 64 + 7: 127, // 128 + 8: 251, // 256 + 9: 509, // 512 + 10: 1021, // 1024 + 11: 2039, // 2048 + 12: 4093, // 4096 + 13: 8191, // 8192 + 14: 16381, // 16384 + 15: 32749, // 32768 + 16: 65521, // 65536 + 17: 131071, // 131072 + 18: 262139, // 262144 + 19: 524287, // 524288 + 20: 1048573, // 1048576 + 21: 2097143, // 2097152 + 22: 4194301, // 4194304 + 23: 8388593, // 8388608 + 24: 16777213, // 16777216 + 25: 33554393, // 33554432 + 26: 67108859, // 67108864 + 27: 134217689, // 134217728 + 28: 268435399, // 268435456 + 29: 536870909, // 536870912 + 30: 1073741789, // 1073741824 +} + +func getHashtableSlots(n int) int { + // load factor + n = int(float32(n) / 0.75) + + // count bits to decide which prime number to use + bits := 0 + for v := uint64(n); v > 0; v = v >> 1 { + bits++ + } + + // add one more bit, + // so if n=1500, than returns 2039 instead of 1021 + bits++ + + if bits > len(bits2primes) { + // ???? are you sure we need to hold so many items? ~ 1B items for 30 bits + return n + } + return bits2primes[bits] // a prime bigger than n +} diff --git a/container/strmap/strmap_test.go b/container/strmap/strmap_test.go new file mode 100644 index 0000000..db5ebf0 --- /dev/null +++ b/container/strmap/strmap_test.go @@ -0,0 +1,115 @@ +package strmap + +import ( + "runtime" + "testing" + "time" + "unsafe" + + "github.com/stretchr/testify/require" +) + +var randseed = uint64(time.Now().UnixNano()) + +func rand64() uint64 { // Xorshift64 without * + randseed ^= randseed >> 12 + randseed ^= randseed << 25 + randseed ^= randseed >> 27 + return randseed +} + +func randRead(b []byte) { + i := 0 + for ; i+8 <= len(b); i += 8 { + *(*uint64)(unsafe.Pointer(&b[i])) = rand64() + } + v := rand64() + for ; i < len(b); i++ { + b[i] = byte(v) + v = v >> 8 + } +} + +func randIntn(n int) int { + return int(rand64()>>1) % n +} + +func randStrings(m, n int) []string { + b := make([]byte, m) + ret := make([]string, 0, n) + for i := 0; i < n; i++ { + randRead(b) + ret = append(ret, string(b[:1+randIntn(m)])) + } + return ret +} + +func newStdStrMap(ss []string) map[string]uintptr { + v := uintptr(1) + m := make(map[string]uintptr) + for _, s := range ss { + _, ok := m[s] + if !ok { + m[s] = v + v++ + } + } + return m +} + +func TestStrMap(t *testing.T) { + ss := randStrings(20, 100000) + m := newStdStrMap(ss) + strset := New(m) + require.Equal(t, len(m), strset.Len()) + for i, s := range ss { + v0 := m[s] + v1, _ := strset.Get(s) + require.Equal(t, v0, v1, i) + } +} + +func Benchmark_StrMap(b *testing.B) { + ss := randStrings(20, 100000) + m := newStdStrMap(ss) + strset := New(m) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = strset.Get(ss[i%len(ss)]) + } +} + +func Benchmark_StdMap(b *testing.B) { + ss := randStrings(20, 100000) + m := newStdStrMap(ss) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = m[ss[i%len(ss)]] + } +} + +func Benchmark_StrMap_GC(b *testing.B) { + ss := randStrings(50, 1000000) + m := newStdStrMap(ss) + strset := New(m) + ss = nil + m = nil + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + runtime.GC() + } + runtime.KeepAlive(strset) +} + +func Benchmark_StdMap_GC(b *testing.B) { + ss := randStrings(50, 1000000) + m := newStdStrMap(ss) + ss = nil + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + runtime.GC() + } + runtime.KeepAlive(m) +}