对于已知的数据集,我们总是有办法找到一个Hash函数,将其映射到容量不少于数据集元素个数的表中。这种情况称为完美Hash。
BDZ算法的核心思想在于通过尝试不同的hash种子,找到一个满足多路Hash表的无冲突填充方式。
func (h *bdz) init(keys []string) bool {
//...
for trys := 0; trys < 8; trys++ {
h.seed = rand32()
if !graph.init(keys, h.hash) { //根据随机种子生成超图
return false
}
array.SetAll(bitmap[:ebmSize], 0)
free = graph.tear(free, bitmap) //拆解超图获得边序列
if len(free) != len(graph.edges) {
continue
}
array.SetAll(bitmap[:nbmSize], 0)
h.mapping(graph.edges, free, bitmap)
return true
}
return false
}
具体实现中通常采用三路Hash表,以表中槽位为点,元素为边,可以构成一种超图。
type vertex struct { //链表体
slot uint32
next uint32
}
type list struct { //链表头
size uint8
head uint32
}
type hypergraph struct {
edges [][3]vertex //超图的边,每条边三个顶点
nodes []list //超图的点,表现为链表
}
func (g *hypergraph) init(keys []string, hash func(string) [3]uint32) bool {
array.SetAll(g.nodes, list{0, endIdx})
for i := 0; i < len(keys); i++ {
slots := hash(keys[i])
for j := 0; j < 3; j++ {
v := &g.edges[i][j]
v.slot = slots[j]
n := &g.nodes[v.slot]
v.next = n.head
n.head = uint32(i)
n.size++
if n.size > 50 { //扎堆是不正常的
return false
} } }
return true
}
在拆解超图的过程中,保证每拆一条边能够切实解放至少一个点,也就意味着多路Hash表中某个元素可以获得槽位。
func (g *hypergraph) tear(free []uint32, bitmap []uint32) []uint32 {
free, head := free[:0], 0
for i := 0; i < len(g.edges); i++ {
for j := 0; j < 3; j++ {
n := g.nodes[g.edges[i][j].slot]
if n.size == 1 && testAndSetBit(bitmap, n.head) {
free = append(free, n.head)
break
} } }
for head < len(free) {
curr := free[head]
head++
for j := 0; j < 3; j++ {
v := &g.edges[curr][j]
n := &g.nodes[v.slot]
p := &n.head
for *p != curr {
p = &g.edges[*p][j].next
}
*p = v.next
v.next = endIdx
n.size--
if n.size == 1 && testAndSetBit(bitmap, n.head) {
free = append(free, n.head)
} } }
return free
}