Skip to content

Commit

Permalink
feat(strmap): new pkg for optimizing GC issues
Browse files Browse the repository at this point in the history
Usage:
```
m := make(map[string]bool)
// populate m
// ...

sm := strmap.New(m)
m = nil // the original map is no longer needed

v, ok := sm.Get(key) // it replaces v, ok := m[key]
```

Benchmark:
```
BenchmarkGC/std-keysize_20_n_100000-12          	     126	    950226 ns/op
BenchmarkGC/new-keysize_20_n_100000-12          	    1069	    110980 ns/op
BenchmarkGC/std-keysize_100_n_100000-12         	     145	    888562 ns/op
BenchmarkGC/new-keysize_100_n_100000-12         	    1023	    112419 ns/op
BenchmarkGC/std-keysize_20_n_400000-12          	      87	   2935875 ns/op
BenchmarkGC/new-keysize_20_n_400000-12          	    1032	    112177 ns/op
BenchmarkGC/std-keysize_100_n_400000-12         	      46	   2813522 ns/op
BenchmarkGC/new-keysize_100_n_400000-12         	    1054	    110864 ns/op
```

This change also deprecates xfnv, which is an experimental implementation
whose hash distribution is not good enough.
  • Loading branch information
xiaost committed Sep 16, 2024
1 parent c2f7170 commit b2fc661
Show file tree
Hide file tree
Showing 8 changed files with 487 additions and 153 deletions.
167 changes: 167 additions & 0 deletions container/strmap/strmap.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/*
* Copyright 2024 CloudWeGo Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package strmap

import (
"fmt"
"math"
"sort"
"strings"

"github.com/cloudwego/gopkg/internal/hack"
"github.com/cloudwego/gopkg/internal/hash/maphash"
)

// StrMap represents GC friendly readonly string map implementation.
// type V must NOT contain pointer for performance concern.
//
// A StrMap is built once by New from a map[string]V. The bytes of all keys
// are appended into one []byte, and each entry is described by an offset
// and a size into that buffer plus its value.
type StrMap[V any] struct {
// `data` holds bytes of keys
// (keys are appended back to back; see mapItem.off and mapItem.sz)
data []byte

// `items` holds key meta
// (sorted by slot in makeHashtable, so items sharing a slot are adjacent)
items []mapItem[V]

// max hashtable ~ 2 billions which means len(items) < the num as well.
// hashtable[slot] is the index in `items` of the first item with that slot,
// or -1 when no item uses the slot.
hashtable []int32 // using int32 for mem efficiency

// for maphash
// (New and Get hash keys with this same seed)
seed maphash.Seed
}

// mapItem is the metadata of one key/value pair stored in a StrMap.
type mapItem[V any] struct {
off int // offset of the first byte of the key in StrMap.data
sz uint32 // 4GB, big enough for key
slot uint32 // truncated hash set by New, reduced modulo len(hashtable) in makeHashtable
v V // the value stored for the key
}

// New creates StrMap from map[string]V.
//
// The bytes of every key and every value are copied out of m, so the
// returned StrMap does not reference m.
//
// It panics with "key too large" if a key is longer than math.MaxUint32
// bytes. calcHashtableSlots (called through makeHashtable) panics if m
// holds more items than the hashtable can address.
func New[V any](m map[string]V) *StrMap[V] {
	// Pre-compute the total key size so the key buffer is allocated once.
	sz := 0
	for k := range m {
		sz += len(k)
	}
	b := make([]byte, 0, sz)

	seed := maphash.MakeSeed()
	items := make([]mapItem[V], 0, len(m))
	for k, v := range m {
		// Compare as uint64: the untyped constant math.MaxUint32 does not
		// fit into int on 32-bit platforms, where `len(k) > math.MaxUint32`
		// fails to compile with "constant overflows int".
		if uint64(len(k)) > math.MaxUint32 {
			// it doesn't make sense ...
			panic("key too large")
		}
		items = append(items, mapItem[V]{
			off: len(b),
			sz:  uint32(len(k)),
			// the hash is truncated to 32 bits here and reduced to a
			// hashtable slot later in makeHashtable
			slot: uint32(maphash.String(seed, k)),
			v:    v,
		})
		b = append(b, k...)
	}

	ret := &StrMap[V]{data: b, items: items, seed: seed}
	ret.makeHashtable()
	return ret
}

// Len reports how many key/value pairs the map holds.
func (m *StrMap[V]) Len() int {
	n := len(m.items)
	return n
}

// Item returns the key and the value stored at position i.
// Positions follow the internal storage order of the items.
// It panics if i is outside the range [0, Len()).
func (m *StrMap[V]) Item(i int) (string, V) {
	it := &m.items[i]
	start := it.off
	end := start + int(it.sz)
	key := hack.ByteSliceToString(m.data[start:end])
	return key, it.v
}

// itemsBySlot implements sort.Interface and orders items by ascending slot.
type itemsBySlot[V any] []mapItem[V]

// Len is the number of items to sort.
func (x itemsBySlot[V]) Len() int {
	return len(x)
}

// Less reports whether item i has a smaller slot than item j.
func (x itemsBySlot[V]) Less(i, j int) bool {
	a, b := x[i].slot, x[j].slot
	return a < b
}

// Swap exchanges the items at positions i and j.
func (x itemsBySlot[V]) Swap(i, j int) {
	tmp := x[i]
	x[i] = x[j]
	x[j] = tmp
}

// makeHashtable builds m.hashtable from m.items.
//
// It reduces the stored hash of every item to a slot index, sorts the items
// by slot, and records for each used slot the index of its first item.
// Unused slots hold -1.
func (m *StrMap[V]) makeHashtable() {
	nslots := calcHashtableSlots(len(m.items))

	// turn the raw hash kept in `slot` into an index of the hashtable
	mod := uint32(nslots)
	for i := range m.items {
		m.items[i].slot %= mod
	}

	// items sharing a slot end up next to each other,
	// which is friendly to the cpu cache
	sort.Sort(itemsBySlot[V](m.items))

	table := make([]int32, nslots)
	for i := range table {
		table[i] = -1 // empty slot
	}
	for i := range m.items {
		// only the first item of a slot is recorded: the items are sorted,
		// so Get can walk forward from it to reach the colliding ones
		if s := m.items[i].slot; table[s] < 0 {
			table[s] = int32(i)
		}
	}
	m.hashtable = table
}

// Get returns the value stored for the key s.
//
// ok is false and t is the zero value of V when s is not in the map.
// A zero-value StrMap (one that was not created by New) is treated as an
// empty map instead of panicking.
func (m *StrMap[V]) Get(s string) (t V, ok bool) {
	if len(m.hashtable) == 0 {
		// no hashtable was built: the modulo below would panic with
		// an integer divide by zero
		return t, false
	}
	slot := uint32(maphash.String(m.seed, s)) % uint32(len(m.hashtable))
	i := m.hashtable[slot]
	if i < 0 {
		return t, false
	}

	// hashtable points to the 1st item of the slot and items with the same
	// slot are stored together, so scan till m.items ends or e.slot != slot.
	// More than one iteration means hash collision, worst O(n).
	for j := int(i); j < len(m.items); j++ {
		e := &m.items[j]
		if e.slot != slot {
			break
		}
		if string(m.data[e.off:e.off+int(e.sz)]) == s {
			return e.v, true
		}
	}
	return t, false
}

// String renders the map as "{\n", then one `"key": value,` line per item
// in internal storage order, then a closing "}".
func (m *StrMap[V]) String() string {
	var sb strings.Builder
	sb.WriteString("{\n")
	for i := range m.items {
		it := m.items[i]
		key := string(m.data[it.off : it.off+int(it.sz)])
		fmt.Fprintf(&sb, "%q: %v,\n", key, it.v)
	}
	sb.WriteString("}")
	return sb.String()
}

// debugString is like String but also prints the offset and the slot of
// every item, followed by the number of hashtable slots and of items.
func (m *StrMap[V]) debugString() string {
	var sb strings.Builder
	sb.WriteString("{\n")
	for i := range m.items {
		it := m.items[i]
		key := string(m.data[it.off : it.off+int(it.sz)])
		fmt.Fprintf(&sb, "{off:%d, slot:%x, str:%q, v:%v},\n", it.off, it.slot, key, it.v)
	}
	nslots, nitems := len(m.hashtable), len(m.items)
	fmt.Fprintf(&sb, "}(slots=%d, items=%d)", nslots, nitems)
	return sb.String()
}
140 changes: 140 additions & 0 deletions container/strmap/strmap_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/*
* Copyright 2024 CloudWeGo Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package strmap

import (
"crypto/rand"
"fmt"
"runtime"
"testing"

"github.com/cloudwego/gopkg/internal/hack"
"github.com/stretchr/testify/require"
)

// randStrings returns n strings of m bytes each.
// Every string is made from its own m-byte window of one buffer
// filled by crypto/rand.
func randStrings(m, n int) []string {
	buf := make([]byte, m*n)
	rand.Read(buf)
	out := make([]string, 0, n)
	for start := 0; len(out) < n; start += m {
		window := buf[start : start+m]
		out = append(out, hack.ByteSliceToString(window))
	}
	return out
}

// newStdStrMap builds a std map keyed by the distinct strings of ss.
// The first occurrence of each string takes the next number of a counter
// that starts at 1, so every value in the map is unique.
func newStdStrMap(ss []string) map[string]uint {
	out := make(map[string]uint)
	next := uint(1)
	for _, key := range ss {
		if _, seen := out[key]; seen {
			continue
		}
		out[key] = next
		next++
	}
	return out
}

// TestStrMap checks that a StrMap built from a std map agrees with that map
// for Len, Get and Item.
func TestStrMap(t *testing.T) {
	keys := randStrings(20, 100000)
	std := newStdStrMap(keys)
	sm := New(std)
	require.Equal(t, len(std), sm.Len())

	// keys that were inserted: values must match
	for i, k := range keys {
		want := std[k]
		got, _ := sm.Get(k)
		require.Equal(t, want, got, i)
	}

	// fresh random keys: both the found flag and the value must match
	for i, k := range randStrings(20, 100000) {
		want, wantOK := std[k]
		got, gotOK := sm.Get(k)
		require.Equal(t, wantOK, gotOK, i)
		require.Equal(t, want, got, i)
	}

	// a map rebuilt through Item must equal the original map
	rebuilt := make(map[string]uint)
	for i, n := 0, sm.Len(); i < n; i++ {
		k, v := sm.Item(i)
		rebuilt[k] = v
	}
	require.Equal(t, std, rebuilt)
}

func TestStrMapString(t *testing.T) {
ss := []string{"a", "b", "c"}
m := newStdStrMap(ss)
sm := New(m)
t.Log(sm.String())
t.Log(sm.debugString())
}

// BenchmarkGet compares key lookup in a std map ("std-..." sub-benchmarks)
// with StrMap.Get ("new-..." sub-benchmarks) for several key sizes and
// numbers of keys.
func BenchmarkGet(b *testing.B) {
	keySizes := []int{20, 50, 100}
	counts := []int{100000, 200000}

	for _, n := range counts {
		for _, sz := range keySizes {
			keys := randStrings(sz, n)
			std := newStdStrMap(keys)

			stdName := fmt.Sprintf("std-keysize_%d_n_%d", sz, n)
			b.Run(stdName, func(b *testing.B) {
				for i := 0; i < b.N; i++ {
					_ = std[keys[i%len(keys)]]
				}
			})

			newName := fmt.Sprintf("new-keysize_%d_n_%d", sz, n)
			b.Run(newName, func(b *testing.B) {
				// building the StrMap is setup, keep it out of the timing
				sm := New(std)
				b.ResetTimer()
				for i := 0; i < b.N; i++ {
					sm.Get(keys[i%len(keys)])
				}
			})
		}
	}
}

// BenchmarkGC times runtime.GC() in two situations for every key size and
// number of keys: in the "std-..." sub-benchmarks, which run before the std
// map is dropped, and in the "new-..." sub-benchmarks, which run after the
// map variable was set to nil and a StrMap built from it is kept alive.
func BenchmarkGC(b *testing.B) {
sizes := []int{20, 100}
nn := []int{100000, 400000}

for _, n := range nn {
for _, sz := range sizes {
ss := randStrings(sz, n)
m := newStdStrMap(ss)
// m is still used by New(m) below, so it is presumably reachable
// while this sub-benchmark runs
b.Run(fmt.Sprintf("std-keysize_%d_n_%d", sz, n), func(b *testing.B) {
for i := 0; i < b.N; i++ {
runtime.GC()
}
})

// switch to the StrMap: drop the reference to the std map and run
// one collection before the timed loop starts
sm := New(m)
m = nil
runtime.GC()

b.Run(fmt.Sprintf("new-keysize_%d_n_%d", sz, n), func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
runtime.GC()
}
})

_ = m // fix lint ineffassign of m = nil
// sm is not read anywhere else; KeepAlive keeps it reachable up to
// this point, i.e. during the sub-benchmark above
runtime.KeepAlive(sm)
}
}
}
66 changes: 66 additions & 0 deletions container/strmap/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright 2024 CloudWeGo Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package strmap

import "math/bits"

// bits2primes is the lookup table used by calcHashtableSlots: the index is
// a bit length and the value is the number of hashtable slots to use for it.
// The trailing comment of each line is 2^index.
// From index 5 on, the value is a prime smaller than 2^index; the entries
// for indexes 1 to 4 are larger than 2^index, and index 0 maps to 1.
var bits2primes = []int32{
0: 1, // 1
1: 7, // 2
2: 7, // 4
3: 17, // 8
4: 17, // 16
5: 31, // 32
6: 61, // 64
7: 127, // 128
8: 251, // 256
9: 509, // 512
10: 1021, // 1024
11: 2039, // 2048
12: 4093, // 4096
13: 8191, // 8192
14: 16381, // 16384
15: 32749, // 32768
16: 65521, // 65536
17: 131071, // 131072
18: 262139, // 262144
19: 524287, // 524288
20: 1048573, // 1048576
21: 2097143, // 2097152
22: 4194301, // 4194304
23: 8388593, // 8388608
24: 16777213, // 16777216
25: 33554393, // 33554432
26: 67108859, // 67108864
27: 134217689, // 134217728
28: 268435399, // 268435456
29: 536870909, // 536870912
30: 1073741789, // 1073741824
31: 2147483647, // 2147483648
}

// loadfactor is the ratio of items to hashtable slots that
// calcHashtableSlots aims for.
// It is always < 1, then len(hashtable) > n.
const loadfactor = float64(0.75)

// calcHashtableSlots returns the number of hashtable slots to use for n items.
// It divides n by loadfactor and looks up bits2primes by the bit length of
// the result, so the returned number is bigger than n.
// It panics with "too many items" when the bit length is not covered by
// bits2primes.
func calcHashtableSlots(n int) int32 {
	want := uint64(float64(n) / loadfactor)
	// the bit length of the wanted size decides which table entry to use
	idx := bits.Len64(want)
	if idx >= len(bits2primes) {
		// ???? are you sure we need to hold so many items? ~ 2B items for 31 bits
		panic("too many items")
	}
	return bits2primes[idx]
}
Loading

0 comments on commit b2fc661

Please sign in to comment.