Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code movement, documentation, heap size #10

Merged
merged 4 commits into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ use flatcontainer::impls::deduplicate::{CollapseSequence, ConsecutiveOffsetPairs
use flatcontainer::impls::offsets::OffsetOptimized;
use flatcontainer::impls::tuple::{TupleABCRegion, TupleABRegion};
use flatcontainer::{
Containerized, CopyOnto, CopyRegion, FlatStack, MirrorRegion, Region, ReserveItems,
SliceRegion, StringRegion,
ColumnsRegion, Containerized, CopyOnto, CopyRegion, FlatStack, MirrorRegion, Region,
ReserveItems, SliceRegion, StringRegion,
};
use test::Bencher;

Expand Down Expand Up @@ -115,6 +115,25 @@ fn vec_u_vn_s_copy_region(bencher: &mut Bencher) {
vec![vec![(0u64, vec![(); 1 << 40], "grawwwwrr!".to_string()); 32]; 32],
);
}
/// Benchmarks copying a nested `Vec<Vec<(u64, Vec<()>, String)>>` record into a
/// `SliceRegion` whose items are stored in a `ColumnsRegion` over a tuple region.
#[bench]
fn vec_u_vn_s_copy_region_column(bencher: &mut Bencher) {
    // Build the record up front; it is moved into the benchmark driver below.
    let record = vec![vec![(0u64, vec![(); 1 << 40], "grawwwwrr!".to_string()); 32]; 32];
    _bench_copy_region::<
        SliceRegion<
            ColumnsRegion<
                TupleABCRegion<
                    MirrorRegion<_>,
                    CollapseSequence<CopyRegion<_>>,
                    CollapseSequence<StringRegion>,
                >,
                _,
            >,
        >,
        _,
    >(bencher, record);
}

#[bench]
fn empty_clone(bencher: &mut Bencher) {
Expand Down Expand Up @@ -258,6 +277,12 @@ where
arena.copy(&record);
}
});
let (mut siz, mut cap) = (0, 0);
arena.heap_size(|this_siz, this_cap| {
siz += this_siz;
cap += this_cap
});
println!("{siz} {cap}");
}

fn _bench_copy_region<R: Region, T>(bencher: &mut Bencher, record: T)
Expand All @@ -273,6 +298,12 @@ where
arena.copy(&record);
}
});
let (mut siz, mut cap) = (0, 0);
arena.heap_size(|this_siz, this_cap| {
siz += this_siz;
cap += this_cap
});
println!("{siz} {cap}");
}

fn _bench_clone<T: Containerized + Eq + Clone>(bencher: &mut Bencher, record: T) {
Expand Down
51 changes: 32 additions & 19 deletions src/codec.rs → src/impls/codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ pub use dictionary::DictionaryCodec;
/// This method will sort `vec` and then consolidate runs of more than one entry with
/// identical first elements by accumulating the second elements of the pairs. Should the final
/// accumulation be zero, the element is discarded.
pub fn consolidate<T: Ord>(vec: &mut Vec<(T, usize)>) {
fn consolidate<T: Ord>(vec: &mut Vec<(T, usize)>) {
consolidate_from(vec, 0);
}

Expand All @@ -21,13 +21,13 @@ pub fn consolidate<T: Ord>(vec: &mut Vec<(T, usize)>) {
/// This method will sort `vec[offset..]` and then consolidate runs of more than one entry with
/// identical first elements by accumulating the second elements of the pairs. Should the final
/// accumulation be zero, the element is discarded.
pub fn consolidate_from<T: Ord>(vec: &mut Vec<(T, usize)>, offset: usize) {
fn consolidate_from<T: Ord>(vec: &mut Vec<(T, usize)>, offset: usize) {
let length = consolidate_slice(&mut vec[offset..]);
vec.truncate(offset + length);
}

/// Sorts and consolidates a slice, returning the valid prefix length.
pub fn consolidate_slice<T: Ord>(slice: &mut [(T, usize)]) -> usize {
fn consolidate_slice<T: Ord>(slice: &mut [(T, usize)]) -> usize {
// We could do an insertion-sort like initial scan which builds up sorted, consolidated runs.
// In a world where there are not many results, we may never even need to call in to merge sort.
slice.sort_by(|x, y| x.0.cmp(&y.0));
Expand Down Expand Up @@ -75,7 +75,6 @@ pub fn consolidate_slice<T: Ord>(slice: &mut [(T, usize)]) -> usize {
pub struct CodecRegion<C: Codec, R = CopyRegion<u8>> {
inner: R,
codec: C,
staging: Vec<u8>,
}

impl<C: Codec, R> Region for CodecRegion<C, R>
Expand All @@ -98,7 +97,6 @@ where
Self {
inner: R::merge_regions(regions.map(|r| &r.inner)),
codec,
staging: vec![],
}
}

Expand All @@ -118,6 +116,11 @@ where
/// Resets the codec to its `Default` state.
///
/// NOTE(review): only `self.codec` is replaced here; `self.inner` is not
/// touched by this method — confirm that leaving the inner region as-is is intended.
fn clear(&mut self) {
self.codec = Default::default();
}

/// Reports heap usage by forwarding `callback` to the inner region and then to the codec.
///
/// The two `usize` arguments passed to `callback` are presumably (size, capacity).
fn heap_size<F: FnMut(usize, usize)>(&self, mut callback: F) {
// Lend the callback to the inner region so it can still be handed to the codec afterwards.
self.inner.heap_size(&mut callback);
self.codec.heap_size(callback);
}
}

impl<C: Codec, R> CopyOnto<CodecRegion<C, R>> for &[u8]
Expand All @@ -126,9 +129,7 @@ where
for<'a> &'a [u8]: CopyOnto<R>,
{
fn copy_onto(self, target: &mut CodecRegion<C, R>) -> <CodecRegion<C, R> as Region>::Index {
target.staging.clear();
target.codec.encode(self, &mut target.staging);
target.staging.as_slice().copy_onto(&mut target.inner)
target.codec.encode(self, &mut target.inner)
}
}

Expand All @@ -137,16 +138,22 @@ pub trait Codec: Default + 'static {
/// Decodes an input byte slice into a sequence of byte slices.
fn decode<'a>(&'a self, bytes: &'a [u8]) -> &'a [u8];
/// Encodes a sequence of byte slices into an output byte slice.
fn encode(&mut self, bytes: &[u8], output: &mut Vec<u8>);
fn encode<R: Region>(&mut self, bytes: &[u8], output: &mut R) -> R::Index
where
for<'a> &'a [u8]: CopyOnto<R>;
/// Constructs a new instance of `Self` from accumulated statistics.
/// These statistics should cover the data the output expects to see.
fn new_from<'a, I: Iterator<Item = &'a Self> + Clone>(stats: I) -> Self;
/// Diagnostic information about the state of the codec.
fn report(&self) {}

/// Reports the codec's heap usage by invoking `callback` with `(size, capacity)` pairs.
fn heap_size<F: FnMut(usize, usize)>(&self, callback: F);
}

mod dictionary {

use crate::{CopyOnto, Region};
use std::collections::BTreeMap;

pub use super::{BytesMap, Codec, MisraGries};
Expand Down Expand Up @@ -174,23 +181,26 @@ mod dictionary {
/// Encode a sequence of byte slices.
///
/// Encoding also records statistics about the structure of the input.
fn encode(&mut self, bytes: &[u8], output: &mut Vec<u8>) {
let pre_len = output.len();

fn encode<R: Region>(&mut self, bytes: &[u8], output: &mut R) -> R::Index
where
for<'a> &'a [u8]: CopyOnto<R>,
{
self.total += bytes.len();
// If we have an index referencing `bytes`, use the index key.
if let Some(b) = self.encode.get(bytes) {
output.push(*b);
let index = if let Some(b) = self.encode.get(bytes) {
self.bytes += 1;
[*b].as_slice().copy_onto(output)
} else {
output.extend_from_slice(bytes);
}
self.bytes += bytes.len();
bytes.copy_onto(output)
};
// Stats stuff.
self.stats.0.insert(bytes.to_owned());
let tag = bytes[0];
let tag_idx: usize = (tag % 4).into();
self.stats.1[tag_idx] |= 1 << (tag >> 2);

self.bytes += output.len() - pre_len;
index
}

/// Construct a new encoder from supplied statistics.
Expand Down Expand Up @@ -250,6 +260,10 @@ mod dictionary {
)
// }
}

/// Heap-size reporting for this codec; currently a no-op.
///
/// NOTE(review): `_callback` is never invoked, so any heap memory held by the
/// codec's internal state is not reported — confirm this is acceptable.
fn heap_size<F: FnMut(usize, usize)>(&self, _callback: F) {
// Lazy
}
}
}

Expand Down Expand Up @@ -301,8 +315,7 @@ mod misra_gries {
/// Maintains a summary of "heavy hitters" in a presented collection of items.
#[derive(Clone, Debug)]
pub struct MisraGries<T> {
/// TODO
pub inner: Vec<(T, usize)>,
inner: Vec<(T, usize)>,
}

impl<T> Default for MisraGries<T> {
Expand Down
9 changes: 8 additions & 1 deletion src/impls/columns.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use crate::{CopyOnto, CopyRegion, Index, Region};
/// vec![],
/// ];
///
/// let mut r = ColumnsRegion::<ConsecutiveOffsetPairs<StringRegion>, _>::default();
/// let mut r = <ColumnsRegion<ConsecutiveOffsetPairs<StringRegion>, _>>::default();
///
/// let mut indices = Vec::with_capacity(data.len());
///
Expand Down Expand Up @@ -113,6 +113,13 @@ where
}
self.indices.clear();
}

/// Reports heap usage of every region in `self.inner`, followed by `self.indices`.
///
/// `callback` is invoked by the nested `heap_size` implementations; this method
/// itself only forwards it.
fn heap_size<F: FnMut(usize, usize)>(&self, mut callback: F) {
    // Lend the callback to each inner region in turn so it stays usable after the loop.
    for region in &self.inner {
        region.heap_size(&mut callback);
    }
    // The index storage is reported last and takes the callback by value.
    self.indices.heap_size(callback);
}
}

impl<R, Idx> Default for ColumnsRegion<R, Idx>
Expand Down
50 changes: 38 additions & 12 deletions src/impls/deduplicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@ use crate::impls::offsets::OffsetContainer;
use crate::{CopyOnto, Region};

/// A region to deduplicate consecutive equal items.
///
/// # Examples
///
/// The following example shows that two inserts can result in the same index.
/// ```
/// use flatcontainer::impls::deduplicate::CollapseSequence;
/// use flatcontainer::{CopyOnto, StringRegion};
/// let mut r = <CollapseSequence<StringRegion>>::default();
///
/// assert_eq!("abc".copy_onto(&mut r), "abc".copy_onto(&mut r));
/// ```
#[derive(Debug, Clone)]
pub struct CollapseSequence<R: Region> {
/// Inner region.
Expand Down Expand Up @@ -54,6 +65,10 @@ where
self.inner.clear();
self.last_index = None;
}

/// Reports heap usage by forwarding `callback` unchanged to the inner region.
fn heap_size<F: FnMut(usize, usize)>(&self, callback: F) {
self.inner.heap_size(callback);
}
}

impl<R: Region, T: CopyOnto<R>> CopyOnto<CollapseSequence<R>> for T
Expand All @@ -77,7 +92,19 @@ where
/// be dense, i.e., `(i, j)` is followed by `(j, k)`.
///
/// Defers to region `R` for storing items, and uses offset container `O` to
/// remember indices. By default, `O` is `Vec<usize>`.
/// remember indices. By default, `O` is `Vec<usize>`.
///
/// # Examples
///
/// The following example shows that two inserts into a copy region have a collapsible index:
/// ```
/// use flatcontainer::impls::deduplicate::{CollapseSequence, ConsecutiveOffsetPairs};
/// use flatcontainer::{CopyOnto, CopyRegion, Region, StringRegion};
/// let mut r = <ConsecutiveOffsetPairs<CopyRegion<u8>>>::default();
///
/// let index: usize = b"abc"[..].copy_onto(&mut r);
/// assert_eq!(b"abc", r.index(index));
/// ```
#[derive(Debug, Clone)]
pub struct ConsecutiveOffsetPairs<R, O = Vec<usize>>
where
Expand Down Expand Up @@ -147,6 +174,11 @@ impl<R: Region<Index = (usize, usize)>, O: OffsetContainer<usize>> Region
self.offsets.clear();
self.offsets.push(0);
}

/// Reports heap usage of the offset container first, then of the inner region.
fn heap_size<F: FnMut(usize, usize)>(&self, mut callback: F) {
// Lend the callback to the offsets so it can still be passed by value to the inner region.
self.offsets.heap_size(&mut callback);
self.inner.heap_size(callback);
}
}

impl<R: Region<Index = (usize, usize)>, O: OffsetContainer<usize>, T: CopyOnto<R>>
Expand All @@ -169,7 +201,7 @@ impl<R: Region<Index = (usize, usize)>, O: OffsetContainer<usize>, T: CopyOnto<R
mod tests {
use crate::impls::deduplicate::{CollapseSequence, ConsecutiveOffsetPairs};
use crate::impls::offsets::OffsetOptimized;
use crate::{CopyOnto, FlatStack, Region, StringRegion};
use crate::{CopyOnto, FlatStack, StringRegion};

#[test]
fn test_dedup_flatstack() {
Expand All @@ -178,18 +210,16 @@ mod tests {
fs.copy("abc");
fs.copy("abc");

assert_eq!(2, fs.len());

println!("{fs:?}");
}

#[test]
fn test_dedup_region() {
let mut r = CollapseSequence::<StringRegion>::default();

fn copy<R: Region>(r: &mut R, item: impl CopyOnto<R>) -> R::Index {
item.copy_onto(r)
}

assert_eq!(copy(&mut r, "abc"), copy(&mut r, "abc"));
assert_eq!("abc".copy_onto(&mut r), "abc".copy_onto(&mut r));

println!("{r:?}");
}
Expand All @@ -199,12 +229,8 @@ mod tests {
let mut r =
CollapseSequence::<ConsecutiveOffsetPairs<StringRegion, OffsetOptimized>>::default();

fn copy<R: Region>(r: &mut R, item: impl CopyOnto<R>) -> R::Index {
item.copy_onto(r)
}

for _ in 0..1000 {
copy(&mut r, "abc");
"abc".copy_onto(&mut r);
}

println!("{r:?}");
Expand Down
6 changes: 5 additions & 1 deletion src/impls/mirror.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use crate::{Containerized, CopyOnto, Index, Region, ReserveItems};
/// For [`MirrorRegion`]s, we can index with a copy type:
/// ```
/// # use flatcontainer::{MirrorRegion, Region};
/// let r = MirrorRegion::<u8>::default();
/// let r = <MirrorRegion<u8>>::default();
/// let output: u8 = r.index(42);
/// assert_eq!(output, 42);
/// ```
Expand Down Expand Up @@ -69,6 +69,10 @@ impl<T: Index + CopyOnto<Self>> Region for MirrorRegion<T> {
fn clear(&mut self) {
// No storage
}

/// Heap-size reporting for `MirrorRegion`; `_callback` is never invoked.
fn heap_size<F: FnMut(usize, usize)>(&self, _callback: F) {
// No storage
}
}

impl<T: Index> CopyOnto<MirrorRegion<Self>> for T {
Expand Down
1 change: 1 addition & 0 deletions src/impls/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Various region implementations.

pub mod codec;
pub mod columns;
pub mod deduplicate;
pub mod mirror;
Expand Down
Loading
Loading