From 871de6e9629f5bb4e71ca4f1c7073f26b07da78a Mon Sep 17 00:00:00 2001 From: "qiheng.zhou" Date: Mon, 16 Sep 2024 15:58:49 +0800 Subject: [PATCH] feat: add NewStrStoreFromSlice --- container/strstore/strstore.go | 60 ++++++++++++++++++++++------- container/strstore/strstore_test.go | 58 +++++++++++++++++++++------- 2 files changed, 92 insertions(+), 26 deletions(-) diff --git a/container/strstore/strstore.go b/container/strstore/strstore.go index b374637..1ab53eb 100644 --- a/container/strstore/strstore.go +++ b/container/strstore/strstore.go @@ -16,43 +16,76 @@ package strstore import ( "encoding/binary" + "math" "unsafe" ) const ( - pageSize = 1 << 20 - strlenSize = 4 + defaultPageSize = 1 << 20 + strlenSize = 4 // size of uint32, maximum 4GB for each string ) // StrStore is used to store string with less GC overhead. // The string stored here should not be longer than `pageSize` and does not need to be deleted. type StrStore struct { - pages [][]byte - offset int // offset of the latest page + pages [][]byte + pageSize int // size of each page + offset int // offset of the latest page } -// NewStrStore returns a StrStore. +// NewStrStore returns a StrStore with default page size. func NewStrStore() *StrStore { return &StrStore{ - pages: [][]byte{make([]byte, pageSize)}, + pages: [][]byte{make([]byte, defaultPageSize)}, + pageSize: defaultPageSize, } } +// NewStrStoreFromSlice constructs a StrStore with the input string slice and returns the StrStore and indexes for the following reads. +// It panics if any string in the slice is longer than math.MaxUint32. 
+func NewStrStoreFromSlice(ss []string) (*StrStore, []int) { + n := len(ss) + totalLen := strlenSize * n + for i := 0; i < n; i++ { + if uint64(len(ss[i])) > math.MaxUint32 { + panic("string too long") + } + totalLen += len(ss[i]) + } + idxes := make([]int, n) + st := &StrStore{ + pages: make([][]byte, 1), + pageSize: totalLen, + } + + page := make([]byte, totalLen) + offset := 0 + for i := 0; i < n; i++ { + idxes[i] = offset + binary.BigEndian.PutUint32(page[offset:offset+strlenSize], uint32(len(ss[i]))) + copy(page[offset+strlenSize:offset+strlenSize+len(ss[i])], ss[i]) + offset += strlenSize + len(ss[i]) + } + st.pages[0], st.offset = page, offset // the page is full: a later Set must open a new page instead of overwriting + return st, idxes +} + // Set sets a string into the store and return an index. +// It returns -1 if the string with its length prefix does not fit in a page or is longer than math.MaxUint32. func (s *StrStore) Set(str string) int { l := len(str) - if l > pageSize { + if l+strlenSize > s.pageSize || uint64(l) > math.MaxUint32 { return -1 } totalLen := l + strlenSize - if len(s.pages) == 0 || s.offset+totalLen > pageSize { + if len(s.pages) == 0 || s.offset+totalLen > s.pageSize { // grow pages - s.pages = append(s.pages, make([]byte, pageSize)) + s.pages = append(s.pages, make([]byte, s.pageSize)) s.offset = 0 } offset := s.offset page := s.pages[len(s.pages)-1] - idx := (len(s.pages)-1)*pageSize + offset + idx := (len(s.pages)-1)*s.pageSize + offset // write length + string binary.BigEndian.PutUint32(page[offset:offset+strlenSize], uint32(l)) @@ -63,12 +96,13 @@ func (s *StrStore) Set(str string) int { } // Get gets the string with the idx. +// It returns an empty string if no string can be found with the input idx. func (s *StrStore) Get(idx int) string { if idx < 0 { return "" } - pageIdx := idx / pageSize - offset := idx % pageSize + pageIdx := idx / s.pageSize + offset := idx % s.pageSize if pageIdx > len(s.pages)-1 || offset > len(s.pages[pageIdx]) { return "" } @@ -81,5 +115,5 @@ func (s *StrStore) Get(idx int) string { // Len returns the total length of bytes.
func (s *StrStore) Len() int { - return len(s.pages) * pageSize + return len(s.pages) * s.pageSize } diff --git a/container/strstore/strstore_test.go b/container/strstore/strstore_test.go index 1d3f298..149385c 100644 --- a/container/strstore/strstore_test.go +++ b/container/strstore/strstore_test.go @@ -28,23 +28,57 @@ import ( func TestStrStore(t *testing.T) { // test when the pages grow strNum := 1000000 - strstore := NewStrStore() + strStore := NewStrStore() + for i := 0; i < strNum; i++ { + idx := strStore.Set(strconv.Itoa(i)) + assert.Equal(t, strconv.Itoa(i), strStore.Get(idx), fmt.Sprintf("i=%d, idx=%d", i, idx)) + } + s := strStore.Get(-1) + assert.Equal(t, "", s) + s = strStore.Get(strStore.Len() * 2) + assert.Equal(t, "", s) + + // batch construct + ss := make([]string, strNum) + for i := 0; i < strNum; i++ { + ss[i] = strconv.Itoa(i) + } + bs, idxes := NewStrStoreFromSlice(ss) for i := 0; i < strNum; i++ { - idx := strstore.Set(strconv.Itoa(i)) - assert.Equal(t, strconv.Itoa(i), strstore.Get(idx), fmt.Sprintf("i=%d, idx=%d", i, idx)) + assert.Equal(t, strconv.Itoa(i), bs.Get(idxes[i])) } - s := strstore.Get(-1) + s = bs.Get(-1) assert.Equal(t, "", s) - s = strstore.Get(strstore.Len() * 2) + s = bs.Get(strStore.Len() * 2) assert.Equal(t, "", s) + // set more strings to this StrStore + for i := 0; i < strNum; i++ { + idx := bs.Set(strconv.Itoa(i)) + assert.Equal(t, strconv.Itoa(i), bs.Get(idx), fmt.Sprintf("i=%d, idx=%d", i, idx)) + } } func BenchmarkStrStoreGetSet(b *testing.B) { mockStr := "0123456789" avgStrlen := len(mockStr) - pageLen := pageSize / avgStrlen + pageLen := defaultPageSize / avgStrlen strSlice := newStrSliceStore(pageLen) - strstore := &StrStore{} + strstore := NewStrStore() + + b.Run("strbuf-set", func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + strstore.Set(mockStr) + } + }) + b.Run("strbuf-get", func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + 
strstore.Get(0) + } + }) b.Run("strbuf-set", func(b *testing.B) { b.ReportAllocs() @@ -79,10 +113,7 @@ func BenchmarkStrStoreGetSet(b *testing.B) { func BenchmarkStrStoreGC(b *testing.B) { ss := randStrings(50, 1000000) - strstore := NewStrStore() - for i := 0; i < len(ss); i++ { - strstore.Set(ss[i]) - } + strStore, idxes := NewStrStoreFromSlice(ss) _ = ss runtime.GC() b.ResetTimer() @@ -90,7 +121,8 @@ func BenchmarkStrStoreGC(b *testing.B) { for i := 0; i < b.N; i++ { runtime.GC() } - runtime.KeepAlive(strstore) + runtime.KeepAlive(strStore) + runtime.KeepAlive(idxes) } func BenchmarkStdStrSliceGC(b *testing.B) { @@ -122,7 +154,7 @@ type strSliceStore struct { } func (s *strSliceStore) Set(str string) int { - if len(s.ss) == 0 || len(s.ss[len(s.ss)-1]) == pageSize-1 { + if len(s.ss) == 0 || len(s.ss[len(s.ss)-1]) == defaultPageSize-1 { s.ss = append(s.ss, make([]string, 0, s.pageLen)) } pageIdx := len(s.ss) - 1