Skip to content

Commit

Permalink
feat: add from_file API
Browse files Browse the repository at this point in the history
  • Loading branch information
yankun1992 committed Dec 4, 2023
1 parent 8c773ed commit 3d57d80
Show file tree
Hide file tree
Showing 10 changed files with 265 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fastbloom_rs"
version = "0.5.8"
version = "0.5.9"
edition = "2021"
authors = ["Yan Kun <[email protected]>"]
description = "Some fast bloom filter implemented by Rust for Python and Rust! 10x faster than pybloom!"
Expand Down
2 changes: 1 addition & 1 deletion build.sc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ object ProjectInfo {

def author = Seq("Yan Kun <[email protected]>")

def version = "0.5.8"
def version = "0.5.9"

def buildTool = "mill"

Expand Down
2 changes: 1 addition & 1 deletion fastbloom-rs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fastbloom-rs"
version = "0.5.8"
version = "0.5.9"
edition = "2021"
authors = ["Yan Kun <[email protected]>"]
description = "Some fast bloom filter implemented by Rust for Python and Rust!"
Expand Down
113 changes: 111 additions & 2 deletions fastbloom-rs/src/bloom.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::clone;
use std::cmp::min;
use std::fs::{File, OpenOptions};
use std::io::{Write, Read};
use std::ptr::slice_from_raw_parts;

use fastmurmur3::murmur3_x64_128;
use xxhash_rust::xxh3::xxh3_64_with_seed;

use crate::{Deletable, Hashes, Membership};
Expand Down Expand Up @@ -161,6 +161,37 @@ impl BloomFilter {
self.config.hashes as u64)
}

/// Builds a Bloom filter from a file laid out the way `save_to_file_with_hashes` writes it.
///
/// The first four bytes of the file hold the number of hashes as a big-endian `u32`;
/// the remaining bytes are the underlying byte vector of the Bloom filter.
///
/// # Panics
///
/// Panics if the file cannot be opened or read, or if it is shorter than the
/// four-byte header.
pub fn from_file_with_hashes(path: &str) -> Self {
    let mut f = File::open(path).expect("failed to open bloom filter file");

    // The payload is everything after the 4-byte header. A plain `len() - 4`
    // would underflow (and wrap in release builds) for a truncated file.
    let len = f
        .metadata()
        .expect("failed to read bloom filter file metadata")
        .len()
        .checked_sub(4)
        .expect("bloom filter file is shorter than the 4-byte hashes header");

    let mut hash = [0; 4];
    f.read_exact(&mut hash)
        .expect("failed to read the hashes header");
    let hashes = u32::from_be_bytes(hash);

    // The filter size is expressed in bits, the file length in bytes.
    let mut config = FilterBuilder::from_size_and_hashes(len * 8, hashes);
    config.complete();

    // Payload starts right after the header, at byte offset 4.
    let bit_set = BloomBitVec::from_file(&mut f, 4, len);

    BloomFilter { config, bit_set }
}

/// Builds a Bloom filter from a file whose entire content is the underlying
/// byte vector of the Bloom filter. `hashes` is forwarded to
/// `FilterBuilder::from_size_and_hashes` together with the file size in bits.
pub fn from_file(path: &str, hashes: u32) -> Self {
    let mut file = File::open(path).unwrap();
    let byte_len = file.metadata().unwrap().len();

    // Filter size in bits = file length in bytes * 8.
    let mut config = FilterBuilder::from_size_and_hashes((byte_len * 8) as u64, hashes);
    config.complete();

    // No header in this format, so the payload starts at offset 0.
    BloomFilter {
        bit_set: BloomBitVec::from_file(&mut file, 0, byte_len),
        config,
    }
}

/// Build a Bloom filter from `&[u8]`.
///
/// # Examples
Expand Down Expand Up @@ -292,6 +323,26 @@ impl BloomFilter {
self.config.clone()
}

/// Save the bloom filter to file, and the first four bytes is hashes with
/// big-endian, and the remaining bytes is underlying byte vector of the Bloom filter.
pub fn save_to_file_with_hashes(&mut self, path: &str) {
let mut file = File::create(path).unwrap();
let hash = self.hashes().to_be_bytes();
file.write_all(&hash).unwrap();

let bytes = self.get_u8_array();
let mut file = OpenOptions::new().append(true).open(path).unwrap();
file.write_all(bytes).unwrap();
}

/// Writes the underlying byte vector of the Bloom filter to the file at
/// `path`; the file contains nothing else.
pub fn save_to_file(&mut self, path: &str) {
    File::create(path)
        .unwrap()
        .write_all(self.get_u8_array())
        .unwrap();
}

/// Return the underlying byte vector of the Bloom filter.
pub fn get_u8_array(&self) -> &[u8] {
let storage = &self.bit_set.storage;
Expand Down Expand Up @@ -761,6 +812,64 @@ fn bloom_hash_indices_test() {
assert_eq!(bloom.contains_hash_indices(&bloom.get_hash_indices(b"world")), false);
}

#[test]
fn bloom_large() {
    let mut builder = FilterBuilder::new(1_000_000_000, 0.0001);
    let mut first = builder.build_bloom_filter();

    first.add(b"hello");
    assert_eq!(first.contains(b"hello"), true);

    // Rebuild a second filter from the raw bytes and hash count of the first
    // and check that the inserted element is still reported as present.
    let bytes = first.get_u8_array();
    let hashes = first.hashes();
    let second = BloomFilter::from_u8_array(bytes, hashes);

    assert_eq!(second.contains(b"hello"), true);
}

#[test]
fn bloom_save_and_load_file_hashes() {
    let path = "hello.bloom";

    // The inner scope drops the original filter before it is loaded back.
    {
        let mut builder = FilterBuilder::new(1_000_000_000, 0.0001);
        let mut original = builder.build_bloom_filter();

        original.add(b"hello");
        assert_eq!(original.contains(b"hello"), true);
        original.save_to_file_with_hashes(path);
    }

    let restored = BloomFilter::from_file_with_hashes(path);
    // The file is removed right after loading, before the assertions run.
    std::fs::remove_file(path).unwrap();

    assert_eq!(restored.contains(b"hello"), true);
    assert_eq!(restored.contains(b"world"), false);
}

#[test]
fn bloom_save_and_load_file() {
    // The header-less format does not store the hash count, so it is carried
    // out of the scope that owns the original filter. Producing it as the
    // block's value avoids a dummy `0` initialiser that is never read.
    let hashes = {
        let mut builder = FilterBuilder::new(1_000_000_000, 0.0001);
        let mut bloom = builder.build_bloom_filter();

        bloom.add(b"hello");
        assert!(bloom.contains(b"hello"));

        let hashes = bloom.hashes();
        bloom.save_to_file("no_hashes.bloom");
        hashes
    };

    let bloom2 = BloomFilter::from_file("no_hashes.bloom", hashes);
    std::fs::remove_file("no_hashes.bloom").unwrap();

    assert!(bloom2.contains(b"hello"));
    assert!(!bloom2.contains(b"world"));
}


#[test]
fn counting_bloom_test() {
Expand Down
35 changes: 34 additions & 1 deletion fastbloom-rs/src/vec.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use core::mem::size_of;
use core::slice;
// `FileExt` is a Windows-only extension trait; importing it unconditionally
// stops the crate from building on every other platform, so it is gated here.
#[cfg(windows)]
#[allow(unused_imports)]
use std::{fs::File, os::windows::fs::FileExt};
#[cfg(not(windows))]
use std::fs::File;

use crate::builder::SUFFIX;

Expand All @@ -25,12 +26,44 @@ impl BloomBitVec {
nbits: (slots * get_usize_len()) as u64,
}
}

/// Creates a bit vector of `slots` storage words, each with every bit set
/// when `bit` is true and every bit cleared otherwise.
pub fn from_elem(slots: usize, bit: bool) -> Self {
    // `!0` is the all-ones word.
    let fill: usize = if bit { !0 } else { 0 };
    let nbits = (slots * get_usize_len()) as u64;
    BloomBitVec {
        storage: vec![fill; slots],
        nbits,
    }
}

pub fn from_file(file: &mut File, seek: u64, bytes_len: u64) -> Self {
#[cfg(target_pointer_width = "64")]
let length = bytes_len / 8;
#[cfg(target_pointer_width = "32")]
let length = bytes_len / 4;

let nbits = bytes_len * 8;

let mut storage = vec![0usize; length.try_into().unwrap()];
let ptr = storage.as_mut_ptr();
let buf = ptr as *mut u8;
let buf = unsafe {
slice::from_raw_parts_mut(buf, bytes_len.try_into().unwrap())
};

let mut cursor = seek;
let mut read = 0u64;

while read < bytes_len {
let size = file.seek_read(&mut buf[read as usize ..], cursor).unwrap() as u64;
read = read + size;
cursor += size;
}


BloomBitVec {
storage,
nbits: nbits.try_into().unwrap()
}
}

#[inline]
pub fn set(&mut self, index: usize) {
Expand Down
14 changes: 14 additions & 0 deletions fastbloom_rs/fastbloom_rs.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,12 @@ class PyBloomFilter(object):
def get_int_array(self) -> Sequence[int]:
...

# Saves the filter to the file at `path`.
# NOTE(review): presumably backed by the Rust `BloomFilter::save_to_file_with_hashes`
# (4-byte big-endian hashes header followed by the byte vector) — confirm in the binding.
def save_to_file_with_hashes(self, path: str):
...

# Saves the filter to the file at `path`.
# NOTE(review): presumably backed by the Rust `BloomFilter::save_to_file`
# (file content is only the byte vector, no header) — confirm in the binding.
def save_to_file(self, path: str):
...

def clear(self):
...

Expand Down Expand Up @@ -128,6 +134,14 @@ class PyBloomFilter(object):
def from_int_array(array: Sequence[int], hashes: int) -> PyBloomFilter:
...

# Loads a filter from the file at `path`.
# NOTE(review): presumably expects the layout written by `save_to_file_with_hashes`
# (hashes header first) — confirm in the binding.
@staticmethod
def from_file_with_hashes(path: str) -> PyBloomFilter:
...

# Loads a filter from the file at `path`, using the caller-supplied `hashes`.
# NOTE(review): presumably expects the header-less layout written by `save_to_file`
# — confirm in the binding.
@staticmethod
def from_file(path: str, hashes: int) -> PyBloomFilter:
...


class PyCountingBloomFilter(object):
def add(self, element: Union[str, int, bytes]):
Expand Down
30 changes: 30 additions & 0 deletions fastbloom_rs/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,20 @@ def get_int_array(self) -> Sequence[int]:
"""
return self._py_bloom.get_int_array()

def save_to_file_with_hashes(self, path: str):
    """
    Write this Bloom filter to the file at ``path``.

    The file begins with four bytes holding the hashes in big-endian order,
    followed by the underlying byte vector of the Bloom filter.
    """
    backend = self._py_bloom
    return backend.save_to_file_with_hashes(path)

def save_to_file(self, path: str):
    """
    Write this Bloom filter to the file at ``path``.

    The file content is exactly the underlying byte vector of the Bloom
    filter.
    """
    backend = self._py_bloom
    return backend.save_to_file(path)

def clear(self):
"""
Removes all elements from the filter (i.e. resets all bits to zero).
Expand Down Expand Up @@ -388,6 +402,22 @@ def from_int_array(array: Sequence[int], hashes: int) -> "BloomFilter":
py_bloom = PyBloomFilter.from_int_array(array, hashes)
return BloomFilter(py_bloom)

@staticmethod
def from_file_with_hashes(path: str) -> "BloomFilter":
    """
    Build a Bloom filter from a file whose first four bytes are the hashes,
    encoded in big-endian order. The remaining bytes are the underlying byte
    vector of the Bloom filter.

    :param path: path of the file to load.
    :return: a ``BloomFilter`` wrapping the loaded native filter.
    """
    py_bloom = PyBloomFilter.from_file_with_hashes(path)
    # The native object is wrapped, so the result is a BloomFilter rather
    # than a PyBloomFilter.
    return BloomFilter(py_bloom)

@staticmethod
def from_file(path: str, hashes: int) -> "BloomFilter":
    """
    Build a Bloom filter from a file whose content is the underlying byte
    vector of the Bloom filter.

    :param path: path of the file to load.
    :param hashes: hashes value handed to the native loader; the file itself
        carries no header.
    :return: a ``BloomFilter`` wrapping the loaded native filter.
    """
    py_bloom = PyBloomFilter.from_file(path, hashes)
    # The native object is wrapped, so the result is a BloomFilter rather
    # than a PyBloomFilter.
    return BloomFilter(py_bloom)

class CountingBloomFilter(object):
"""
Expand Down
2 changes: 1 addition & 1 deletion fastbloomjvm/native/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fastbloom" # generated by nativeInit with defaultNativeName
version = "0.5.8"
version = "0.5.9"
authors = ["Yan Kun <[email protected]>"]
edition = "2021"

Expand Down
60 changes: 54 additions & 6 deletions py_tests/test_bloom.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-

from fastbloom_rs import BloomFilter, FilterBuilder
import os


def test_bloom_builder():
Expand Down Expand Up @@ -66,8 +67,9 @@ def test_bloom_estimate_set_cardinality():
bloom = BloomFilter(100_000_000, 0.01)
for data in range(0, 10_000_000):
bloom.add_int(data)

assert (bloom.estimate_set_cardinality() < 10_100_000) and (bloom.estimate_set_cardinality() > 9_900_000)

assert (bloom.estimate_set_cardinality() < 10_100_000) and (
bloom.estimate_set_cardinality() > 9_900_000)


def test_bloom_op():
Expand Down Expand Up @@ -125,7 +127,8 @@ def test_hash_indices():
bloom2 = BloomFilter(100_000_000, 0.01)
bloom2.add_str("Yan Kun")

assert bloom.get_hash_indices(b'hello') == bloom2.get_hash_indices(b'hello')
assert bloom.get_hash_indices(
b'hello') == bloom2.get_hash_indices(b'hello')

assert bloom.contains_hash_indices(bloom.get_hash_indices(b'hello'))
assert bloom.contains_hash_indices(bloom.get_hash_indices(87))
Expand All @@ -142,14 +145,59 @@ def test_batch_check():
bloom = BloomFilter(100_000_000, 0.01)
inserts = [1, 2, 3, 4, 5, 6, 7, 9, 18, 68, 90, 100]
checks = [1, 2, 3, 4, 5, 6, 7, 9, 18, 68, 90, 100, 190, 290, 390]
results = [True, True, True, True, True, True, True, True, True, True, True, True, False, False, False]
results = [True, True, True, True, True, True, True,
True, True, True, True, True, False, False, False]

bloom.add_int_batch(inserts)
contains = bloom.contains_int_batch(checks)
assert contains == results

bloom.add_str_batch(list(map(lambda x: str(x), inserts)))
assert bloom.contains_str_batch(list(map(lambda x: str(x), checks))) == results
assert bloom.contains_str_batch(
list(map(lambda x: str(x), checks))) == results

bloom.add_bytes_batch(list(map(lambda x: bytes(x), inserts)))
assert bloom.contains_bytes_batch(list(map(lambda x: bytes(x), checks))) == results
assert bloom.contains_bytes_batch(
list(map(lambda x: bytes(x), checks))) == results


def test_save_load_hashes_file():
    """Round-trip a filter through the file format that carries a hashes header."""
    path = 'fst.bloom'

    bloom = BloomFilter(1_000_000_000, 0.0001)
    bloom.add_bytes(b'hello')
    bloom.add(87)

    assert bloom.contains_bytes(b'hello')
    assert bloom.contains_int(87)
    assert not bloom.contains('world')

    try:
        bloom.save_to_file_with_hashes(path)
        del bloom  # gc

        bloom = BloomFilter.from_file_with_hashes(path)
        assert bloom.contains_bytes(b'hello')
        assert bloom.contains_int(87)
        assert not bloom.contains('world')
    finally:
        # Remove the file even when an assertion fails, so a failed run does
        # not leave it behind in the working directory.
        if os.path.exists(path):
            os.remove(path)

def test_save_load_file():
    """Round-trip a filter through the header-less file format."""
    # Deliberately not 'fst.bloom', which test_save_load_hashes_file uses, so
    # the two tests cannot overwrite each other's file when run in parallel.
    path = 'no_hashes.bloom'

    bloom = BloomFilter(1_000_000_000, 0.01)
    bloom.add_bytes(b'hello')
    bloom.add(87)

    # The header-less format does not record the hashes, so keep them here.
    hashes = bloom.hashes()

    assert bloom.contains_bytes(b'hello')
    assert bloom.contains_int(87)
    assert not bloom.contains('world')

    try:
        bloom.save_to_file(path)
        del bloom  # gc

        bloom = BloomFilter.from_file(path, hashes)
        assert bloom.contains_bytes(b'hello')
        assert bloom.contains_int(87)
        assert not bloom.contains('world')
    finally:
        # Remove the file even when an assertion fails, so a failed run does
        # not leave it behind in the working directory.
        if os.path.exists(path):
            os.remove(path)
Loading

0 comments on commit 3d57d80

Please sign in to comment.