From 67b726fe1c21b5e5cf218fb7af54695005199c37 Mon Sep 17 00:00:00 2001
From: Kyle Xiao
Date: Mon, 16 Sep 2024 14:29:39 +0800
Subject: [PATCH] feat(strmap): new pkg for optimizing GC issues

Usage:

```
m := make(map[string]bool)
// populate m
// ...
sm := strmap.New(m)
m = nil // the original map is no longer needed
v, ok := sm.Get(key) // replaces v, ok := m[key]
```

Benchmark:

```
BenchmarkGC/std-keysize_20_n_100000-12     126    950226 ns/op
BenchmarkGC/new-keysize_20_n_100000-12    1069    110980 ns/op
BenchmarkGC/std-keysize_100_n_100000-12    145    888562 ns/op
BenchmarkGC/new-keysize_100_n_100000-12   1023    112419 ns/op
BenchmarkGC/std-keysize_20_n_400000-12      87   2935875 ns/op
BenchmarkGC/new-keysize_20_n_400000-12    1032    112177 ns/op
BenchmarkGC/std-keysize_100_n_400000-12     46   2813522 ns/op
BenchmarkGC/new-keysize_100_n_400000-12   1054    110864 ns/op
```

This change also removes xfnv, an experimental implementation whose
hash distribution is not good enough.
---
 container/strmap/strmap.go             | 168 +++++++++++++++++++++++++
 container/strmap/strmap_test.go        | 145 +++++++++++++++++++++
 container/strmap/utils.go              |  66 ++++++++++
 container/strmap/utils_test.go         |  30 +++++
 hash/xfnv/xfnv.go                      |  73 -----------
 hash/xfnv/xfnv_test.go                 |  80 ------------
 internal/hash/maphash/maphash.go       |  39 ++++++
 internal/hash/maphash/maphash_go118.go |  45 +++++++
 8 files changed, 493 insertions(+), 153 deletions(-)
 create mode 100644 container/strmap/strmap.go
 create mode 100644 container/strmap/strmap_test.go
 create mode 100644 container/strmap/utils.go
 create mode 100644 container/strmap/utils_test.go
 delete mode 100644 hash/xfnv/xfnv.go
 delete mode 100644 hash/xfnv/xfnv_test.go
 create mode 100644 internal/hash/maphash/maphash.go
 create mode 100644 internal/hash/maphash/maphash_go118.go

diff --git a/container/strmap/strmap.go b/container/strmap/strmap.go
new file mode 100644
index 0000000..044f845
--- /dev/null
+++ b/container/strmap/strmap.go
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2024 CloudWeGo Authors
+ *
+ * Licensed
under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package strmap
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"strings"
+
+	"github.com/cloudwego/gopkg/internal/hack"
+	"github.com/cloudwego/gopkg/internal/hash/maphash"
+)
+
+// StrMap represents GC friendly readonly string map implementation.
+// type V must NOT contain pointer for performance concern.
+//
+// The bytes of all keys are stored in one []byte and all values in one
+// slice, a lookup goes through `hashtable` to find the index in `items`.
+type StrMap[V any] struct {
+	// `data` holds the bytes of all keys, one after another
+	data []byte
+
+	// `items` holds key meta (offset and size in `data`, slot) and the value
+	items []mapItem[V]
+
+	// `hashtable` maps a slot to an index of `items`, -1 means an empty slot.
+	// max hashtable ~ 2 billions which means len(items) < the num as well.
+	hashtable []int32 // using int32 for mem efficiency
+
+	// for maphash
+	seed maphash.Seed
+}
+
+// mapItem is the meta of one key together with its value.
+type mapItem[V any] struct {
+	off  int    // offset of the key in StrMap.data
+	sz   uint32 // size of the key. 4GB, big enough for key
+	slot uint32 // low 32 bits of the key hash, makeHashtable reduces it to a hashtable index
+	v    V
+}
+
+// New creates StrMap from map[string]V.
+//
+// The bytes of the keys are copied and the values are stored by value,
+// m can be dropped after New returns.
+// It panics if a key is larger than 4GB, or if m holds too many items
+// for the hashtable.
+func New[V any](m map[string]V) *StrMap[V] {
+	// 1st pass: total size of keys, then `data` is allocated only once
+	sz := 0
+	for k := range m {
+		sz += len(k)
+	}
+	b := make([]byte, 0, sz)
+
+	seed := maphash.MakeSeed()
+	items := make([]mapItem[V], 0, len(m))
+	for k, v := range m {
+		// compare as uint64: math.MaxUint32 overflows int on 32-bit platforms
+		// and `len(k) > math.MaxUint32` fails to compile there.
+		if uint64(len(k)) > math.MaxUint32 {
+			// it doesn't make sense ...
+			panic("key too large")
+		}
+		items = append(items, mapItem[V]{off: len(b), sz: uint32(len(k)), slot: uint32(maphash.String(seed, k)), v: v})
+		b = append(b, k...)
+	}
+
+	ret := &StrMap[V]{data: b, items: items, seed: seed}
+	ret.makeHashtable()
+	return ret
+}
+
+// Len returns the number of items in the map.
+func (m *StrMap[V]) Len() int {
+	return len(m.items)
+}
+
+// Item returns the i'th item in map.
+// It panics if i is not in the range [0, Len()).
+func (m *StrMap[V]) Item(i int) (string, V) {
+	e := &m.items[i]
+	// NOTE(review): hack.ByteSliceToString presumably converts without copying,
+	// then the returned key shares memory with m.data -- confirm in internal/hack.
+	return hack.ByteSliceToString(m.data[e.off : e.off+int(e.sz)]), e.v
+}
+
+// itemsBySlot implements sort.Interface, it orders items by `slot`.
+type itemsBySlot[V any] []mapItem[V]
+
+func (x itemsBySlot[V]) Len() int           { return len(x) }
+func (x itemsBySlot[V]) Less(i, j int) bool { return x[i].slot < x[j].slot }
+func (x itemsBySlot[V]) Swap(i, j int)      { x[i], x[j] = x[j], x[i] }
+
+// makeHashtable builds `hashtable` for the items of m.
+//
+// It reduces `slot` of each item to an index of the hashtable, sorts the
+// items by `slot`, and stores in each slot the index of its 1st item.
+// Slots without items are set to -1.
+func (m *StrMap[V]) makeHashtable() {
+	slots := calcHashtableSlots(len(m.items))
+	m.hashtable = make([]int32, slots)
+
+	// update `slot` of mapItem to fit the size of hashtable
+	for i := range m.items {
+		m.items[i].slot = m.items[i].slot % uint32(slots)
+	}
+
+	// make sure items with the same slot stored together
+	// good for cpu cache
+	sort.Sort(itemsBySlot[V](m.items))
+
+	for i := range m.hashtable {
+		m.hashtable[i] = -1
+	}
+	for i := range m.items {
+		e := &m.items[i]
+		if m.hashtable[e.slot] < 0 {
+			// we only need to store the 1st item if hash conflict
+			// since they're already stored together
+			// will check the next item when Get
+			m.hashtable[e.slot] = int32(i)
+		}
+	}
+}
+
+// Get returns the value of key s, and whether s is in the map.
+//
+// It returns the zero value of V and false if s is not found,
+// or if m is a zero-value StrMap which was not created by New.
+func (m *StrMap[V]) Get(s string) (t V, ok bool) {
+	if len(m.hashtable) == 0 {
+		// zero-value StrMap: nothing to look up, and `% 0` below would panic
+		return t, false
+	}
+	slot := uint32(maphash.String(m.seed, s)) % uint32(len(m.hashtable))
+	i := m.hashtable[slot]
+	if i < 0 {
+		return t, false
+	}
+
+	// i always points to the 1st item of the slot, and items with the same
+	// slot are stored together: scan till m.items ends or e.slot != slot.
+	// worst O(n) if all keys collide.
+	for j := int(i); j < len(m.items); j++ {
+		e := &m.items[j]
+		if e.slot != slot {
+			break
+		}
+		if string(m.data[e.off:e.off+int(e.sz)]) == s {
+			return e.v, true
+		}
+	}
+	return t, false
+}
+
+// String returns a human-readable, multi-line dump of all items.
+func (m *StrMap[V]) String() string {
+	// output: `{`, one `"key": value,` line per item, then `}`
+	var sb strings.Builder
+	sb.WriteString("{\n")
+	for i := range m.items {
+		it := &m.items[i]
+		key := m.data[it.off : it.off+int(it.sz)]
+		fmt.Fprintf(&sb, "%q: %v,\n", string(key), it.v)
+	}
+	sb.WriteString("}")
+	return sb.String()
+}
+
+// debugString is like String, it also prints `off` and `slot` of each item
+// and the sizes of hashtable and items.
+func (m *StrMap[V]) debugString() string {
+	var sb strings.Builder
+	sb.WriteString("{\n")
+	for i := range m.items {
+		it := &m.items[i]
+		key := m.data[it.off : it.off+int(it.sz)]
+		fmt.Fprintf(&sb, "{off:%d, slot:%x, str:%q, v:%v},\n", it.off, it.slot, string(key), it.v)
+	}
+	fmt.Fprintf(&sb, "}(slots=%d, items=%d)", len(m.hashtable), len(m.items))
+	return sb.String()
+}
diff --git a/container/strmap/strmap_test.go b/container/strmap/strmap_test.go
new file mode 100644
index 0000000..7bfc836
--- /dev/null
+++ b/container/strmap/strmap_test.go
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2024 CloudWeGo Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package strmap
+
+import (
+	"crypto/rand"
+	"fmt"
+	"runtime"
+	"testing"
+
+	"github.com/cloudwego/gopkg/internal/hack"
+	"github.com/stretchr/testify/require"
+)
+
+// randString returns a random string of m bytes.
+// It panics if reading random bytes fails.
+func randString(m int) string {
+	b := make([]byte, m)
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+	return string(b)
+}
+
+// randStrings returns n random strings of m bytes each.
+// It panics if reading random bytes fails.
+func randStrings(m, n int) []string {
+	b := make([]byte, m*n)
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+	ret := make([]string, 0, n)
+	for i := 0; i < n; i++ {
+		// the i'th string is b[m*i : m*(i+1)]
+		s := b[m*i:]
+		s = s[:m]
+		ret = append(ret, hack.ByteSliceToString(s))
+	}
+	return ret
+}
+
+// newStdStrMap generates a map with uniq values
+func newStdStrMap(ss []string) map[string]uint {
+	v := uint(1)
+	m := make(map[string]uint)
+	for _, s := range ss {
+		_, ok := m[s]
+		if !ok {
+			m[s] = v
+			v++
+		}
+	}
+	return m
+}
+
+func TestStrMap(t *testing.T) {
+	ss := randStrings(20, 100000)
+	m := newStdStrMap(ss)
+	sm := New(m)
+	require.Equal(t, len(m), sm.Len())
+	// all keys of m must be found, with the same value
+	for i, s := range ss {
+		v0 := m[s]
+		v1, ok := sm.Get(s)
+		require.True(t, ok, i)
+		require.Equal(t, v0, v1, i)
+	}
+	// new random keys: Get must agree with the std map, found or not
+	for i, s := range randStrings(20, 100000) {
+		v0, ok0 := m[s]
+		v1, ok1 := sm.Get(s)
+		require.Equal(t, ok0, ok1, i)
+		require.Equal(t, v0, v1, i)
+	}
+	// rebuild a std map by Item, it must be equal to the original one
+	m0 := make(map[string]uint)
+	for i := 0; i < sm.Len(); i++ {
+		s, v := sm.Item(i)
+		m0[s] = v
+	}
+	require.Equal(t, m, m0)
+}
+
+func TestStrMapString(t *testing.T) {
+	ss := []string{"a", "b", "c"}
+	m := newStdStrMap(ss)
+	sm := New(m)
+	t.Log(sm.String())
+	t.Log(sm.debugString())
+}
+
+// BenchmarkGet compares lookups of the std map (std-*) and StrMap (new-*).
+func BenchmarkGet(b *testing.B) {
+	sizes := []int{20, 50, 100}
+	nn := []int{100000, 200000}
+
+	for _, n := range nn {
+		for _, sz := range sizes {
+			ss := randStrings(sz, n)
+			m := newStdStrMap(ss)
+			b.Run(fmt.Sprintf("std-keysize_%d_n_%d", sz, n), func(b *testing.B) {
+				for i := 0; i < b.N; i++ {
+					_ = m[ss[i%len(ss)]]
+				}
+			})
+			b.Run(fmt.Sprintf("new-keysize_%d_n_%d", sz, n), func(b *testing.B) {
+				sm := New(m)
+				b.ResetTimer() // New is not part of the measurement
+				for i := 0; i < b.N; i++ {
+					sm.Get(ss[i%len(ss)])
+				}
+			})
+		}
+	}
+}
+
+// BenchmarkGC measures runtime.GC() while a std map is alive (std-*),
+// and while only the StrMap created from it is alive (new-*).
+func BenchmarkGC(b *testing.B) {
+	sizes := []int{20, 100}
+	nn := []int{100000, 400000}
+
+	for _, n := range nn {
+		for _, sz := range sizes {
+			ss := randStrings(sz, n)
+			m := newStdStrMap(ss)
+			b.Run(fmt.Sprintf("std-keysize_%d_n_%d", sz, n), func(b *testing.B) {
+				for i := 0; i < b.N; i++ {
+					runtime.GC()
+				}
+			})
+
+			// drop the std map before measuring the new one
+			sm := New(m)
+			m = nil
+			runtime.GC()
+
+			b.Run(fmt.Sprintf("new-keysize_%d_n_%d", sz, n), func(b *testing.B) {
+				b.ResetTimer()
+				for i := 0; i < b.N; i++ {
+					runtime.GC()
+				}
+			})
+
+			runtime.KeepAlive(sm)
+		}
+	}
+}
diff --git a/container/strmap/utils.go b/container/strmap/utils.go
new file mode 100644
index 0000000..90f1921
--- /dev/null
+++ b/container/strmap/utils.go
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2024 CloudWeGo Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package strmap
+
+import "math/bits"
+
+// bits2primes maps a bit length to the number of hashtable slots.
+// The trailing comment of each entry is 2^index.
+var bits2primes = []int32{
+	0:  1,          // 1
+	1:  7,          // 2
+	2:  7,          // 4
+	3:  17,         // 8
+	4:  17,         // 16
+	5:  31,         // 32
+	6:  61,         // 64
+	7:  127,        // 128
+	8:  251,        // 256
+	9:  509,        // 512
+	10: 1021,       // 1024
+	11: 2039,       // 2048
+	12: 4093,       // 4096
+	13: 8191,       // 8192
+	14: 16381,      // 16384
+	15: 32749,      // 32768
+	16: 65521,      // 65536
+	17: 131071,     // 131072
+	18: 262139,     // 262144
+	19: 524287,     // 524288
+	20: 1048573,    // 1048576
+	21: 2097143,    // 2097152
+	22: 4194301,    // 4194304
+	23: 8388593,    // 8388608
+	24: 16777213,   // 16777216
+	25: 33554393,   // 33554432
+	26: 67108859,   // 67108864
+	27: 134217689,  // 134217728
+	28: 268435399,  // 268435456
+	29: 536870909,  // 536870912
+	30: 1073741789, // 1073741824
+	31: 2147483647, // 2147483648
+}
+
+const loadfactor = float64(0.75) // always < 1, then len(hashtable) > n
+
+// calcHashtableSlots returns the number of hashtable slots for n items.
+// It panics if n is too large for bits2primes.
+func calcHashtableSlots(n int) int32 {
+	// scale n by the load factor, the bit length of the scaled number
+	// decides which prime number to use
+	scaled := uint64(float64(n) / loadfactor)
+	idx := bits.Len64(scaled)
+	if idx < len(bits2primes) {
+		return bits2primes[idx] // a prime bigger than n
+	}
+	// ???? are you sure we need to hold so many items? ~ 2B items for 31 bits
+	panic("too many items")
+}
diff --git a/container/strmap/utils_test.go b/container/strmap/utils_test.go
new file mode 100644
index 0000000..5347726
--- /dev/null
+++ b/container/strmap/utils_test.go
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2024 CloudWeGo Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package strmap
+
+import (
+	"testing"
+)
+
+// TestCalcHashtableSlots walks through the sizes of hashtable:
+// each round starts with one more item than the slots of the last round,
+// and checks the hashtable is always bigger than the number of items.
+func TestCalcHashtableSlots(t *testing.T) {
+	const maxSlots = 2147483647 // the last size of hashtable
+	n := 0
+	for {
+		m := calcHashtableSlots(n)
+		t.Log(n, m)
+		if int(m) <= n {
+			t.Fatalf("calcHashtableSlots(%d) = %d, want > %d", n, m, n)
+		}
+		if m == maxSlots {
+			// done. stop here since int(m)+1 overflows if int is 32 bits
+			break
+		}
+		n = int(m) + 1
+	}
+}
diff --git a/hash/xfnv/xfnv.go b/hash/xfnv/xfnv.go
deleted file mode 100644
index f3d1d33..0000000
--- a/hash/xfnv/xfnv.go
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright 2024 CloudWeGo Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Package xfnv is modified and non-cross-platform compatible version of FNV-1a.
-//
-// It computes 8 bytes per round by converting bytes to uint64 directly
-// as a result it doesn't generate the same result for diff cpu arch.
-package xfnv
-
-import (
-	"unsafe"
-)
-
-const (
-	fnvHashOffset64 = uint64(14695981039346656037) // fnv hash offset64
-	fnvHashPrime64  = uint64(1099511628211)
-)
-
-func strDataPtr(s string) unsafe.Pointer {
-	// for str, the Data ptr is always the 1st field
-	return *(*unsafe.Pointer)(unsafe.Pointer(&s))
-}
-
-func bytesDataPtr(b []byte) unsafe.Pointer {
-	// for []byte, the Data ptr is always the 1st field
-	return *(*unsafe.Pointer)(unsafe.Pointer(&b))
-}
-
-// Hash returns the hash of the given bytes
-//
-// DO NOT STORE the return value since it's NOT cross-platform compatible.
-// It's designed for in-memory use.
-func Hash(b []byte) uint64 { - return doHash(bytesDataPtr(b), len(b)) -} - -// HashStr returns the hash of the given string -// -// DO NOT STORE the return value since it's NOT cross-platform compatible. -// It's designed for in-memory use. -func HashStr(s string) uint64 { - return doHash(strDataPtr(s), len(s)) -} - -func doHash(p unsafe.Pointer, n int) uint64 { - h := fnvHashOffset64 - i := 0 - // 8 byte per round - for m := n >> 3; i < m; i++ { - h ^= *(*uint64)(unsafe.Add(p, i<<3)) // p[i*8] - h *= fnvHashPrime64 - } - // left 0-7 bytes - i = i << 3 - for ; i < n; i++ { - h ^= uint64(*(*byte)(unsafe.Add(p, i))) - h *= fnvHashPrime64 - } - return h -} diff --git a/hash/xfnv/xfnv_test.go b/hash/xfnv/xfnv_test.go deleted file mode 100644 index 31cb176..0000000 --- a/hash/xfnv/xfnv_test.go +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2024 CloudWeGo Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package xfnv - -import ( - "crypto/rand" - "fmt" - "hash/maphash" - "testing" - - "github.com/bytedance/gopkg/util/xxhash3" - "github.com/stretchr/testify/require" -) - -func TestHashStr(t *testing.T) { - require.Equal(t, HashStr("1234"), HashStr("1234")) - require.NotEqual(t, HashStr("12345"), HashStr("12346")) - require.Equal(t, HashStr("12345678"), HashStr("12345678")) - require.NotEqual(t, HashStr("123456789"), HashStr("123456788")) -} - -func BenchmarkHash(b *testing.B) { - sizes := []int{8, 16, 32, 64, 128, 512} - bb := make([][]byte, len(sizes)) - for i := range bb { - b := make([]byte, sizes[i]) - rand.Read(b) - bb[i] = b - } - b.ResetTimer() - for _, data := range bb { - b.Run(fmt.Sprintf("size-%d-xfnv", len(data)), func(b *testing.B) { - b.SetBytes(int64(len(data))) - for i := 0; i < b.N; i++ { - _ = Hash(data) - } - }) - } - - println("") - - for _, data := range bb { - b.Run(fmt.Sprintf("size-%d-xxhash3", len(data)), func(b *testing.B) { - b.SetBytes(int64(len(data))) - for i := 0; i < b.N; i++ { - _ = xxhash3.Hash(data) - } - }) - } - - println("") - - for _, data := range bb { - b.Run(fmt.Sprintf("size-%d-maphash", len(data)), func(b *testing.B) { - s := maphash.MakeSeed() - h := &maphash.Hash{} - h.SetSeed(s) - b.SetBytes(int64(len(data))) - for i := 0; i < b.N; i++ { - // use maphash.Bytes which is more fair to benchmark after go1.19 - // maphash.Bytes(s, data) - _, _ = h.Write(data) - } - }) - } -} diff --git a/internal/hash/maphash/maphash.go b/internal/hash/maphash/maphash.go new file mode 100644 index 0000000..b9957e2 --- /dev/null +++ b/internal/hash/maphash/maphash.go @@ -0,0 +1,39 @@ +/* + * Copyright 2024 CloudWeGo Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//go:build go1.19
+
+// Package maphash wraps std hash/maphash for working with go1.18 which doesn't have the Bytes and String functions.
+// TODO: use hash/maphash directly if we no longer support go1.18
+package maphash
+
+import "hash/maphash"
+
+// Seed is an alias of the std maphash.Seed.
+type Seed = maphash.Seed
+
+// MakeSeed returns a new seed, it calls the std maphash.MakeSeed.
+func MakeSeed() maphash.Seed { return maphash.MakeSeed() }
+
+// Bytes returns the hash of b with the given seed, it calls the std maphash.Bytes.
+func Bytes(seed maphash.Seed, b []byte) uint64 {
+	return maphash.Bytes(seed, b)
+}
+
+// String returns the hash of s with the given seed, it calls the std maphash.String.
+func String(seed maphash.Seed, s string) uint64 {
+	return maphash.String(seed, s)
+}
diff --git a/internal/hash/maphash/maphash_go118.go b/internal/hash/maphash/maphash_go118.go
new file mode 100644
index 0000000..9b721ff
--- /dev/null
+++ b/internal/hash/maphash/maphash_go118.go
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2024 CloudWeGo Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//go:build !go1.19
+
+// Package maphash wraps std hash/maphash for working with go1.18 which doesn't have the Bytes and String functions.
+// TODO: use hash/maphash directly if we no longer support go1.18
+package maphash
+
+import (
+	"hash/maphash"
+
+	"github.com/bytedance/gopkg/util/xxhash3"
+)
+
+// Seed is an alias of the std maphash.Seed.
+type Seed = maphash.Seed
+
+// MakeSeed returns a new seed, it calls the std maphash.MakeSeed.
+func MakeSeed() maphash.Seed { return maphash.MakeSeed() }
+
+// Bytes returns the xxhash3 hash of b.
+// The seed is ignored in this build, the result doesn't depend on it.
+func Bytes(_ maphash.Seed, b []byte) uint64 {
+	// use xxhash3 since maphash.Bytes is not available
+	return xxhash3.Hash(b)
+}
+
+// String returns the xxhash3 hash of s.
+// The seed is ignored in this build, the result doesn't depend on it.
+func String(_ maphash.Seed, s string) uint64 {
+	// use xxhash3 since maphash.String is not available
+	return xxhash3.HashString(s)
+}