From 9f0afc012909fcb290a3103bd837f2a1fc29803d Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Wed, 21 Feb 2024 22:19:38 -0500 Subject: [PATCH] Fixed columns Signed-off-by: Moritz Hoffmann --- src/impls/fixed_columns.rs | 485 +++++++++++++++++++++++++++++++++++++ src/impls/mod.rs | 1 + 2 files changed, 486 insertions(+) create mode 100644 src/impls/fixed_columns.rs diff --git a/src/impls/fixed_columns.rs b/src/impls/fixed_columns.rs new file mode 100644 index 0000000..83d9bcf --- /dev/null +++ b/src/impls/fixed_columns.rs @@ -0,0 +1,485 @@ +//! A region to contain a fixed number of columns. + +use std::fmt::Debug; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::impls::offsets::OffsetContainer; +use crate::CopyIter; +use crate::{CopyOnto, Region}; + +/// A region that can store a fixed number of elements per row. +/// +/// The region is backed by a number of columns, where the number depends on +/// the length of the first row encountered. For each pushed row, the region +/// remembers the indices into each column that it populated. All rows must have the +/// same length. +/// +/// All columns have the same type `R`; indexes into `R` are stored in an `O`: [`OffsetContainer`]. 
+/// +/// # Examples +/// +/// Copy a table-like structure: +/// ``` +/// # use flatcontainer::impls::deduplicate::ConsecutiveOffsetPairs; +/// # use flatcontainer::{ColumnsRegion, CopyOnto, Region, StringRegion}; +/// # use flatcontainer::impls::fixed_columns::FixedColumnsRegion; +/// # use flatcontainer::impls::offsets::OffsetOptimized; +/// let data = [ +/// vec!["1", "2", "3"], +/// vec!["4", "5", "6"], +/// vec!["7", "8", "9"], +/// ]; +/// +/// let mut r = , OffsetOptimized>>::default(); +/// +/// let mut indices = Vec::with_capacity(data.len()); +/// +/// for row in &data { +/// let index = row.copy_onto(&mut r); +/// indices.push(index); +/// } +/// +/// # for (&index, row) in indices.iter().zip(&data) { +/// # assert!(row.iter().copied().eq(r.index(index).iter())); +/// # } +/// ``` +#[derive(Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct FixedColumnsRegion { + /// Offsets into individual columns. + offsets: Vec, + /// Storage for columns. + inner: Vec, +} + +impl Region for FixedColumnsRegion +where + R: Region, + O: OffsetContainer, +{ + type ReadItem<'a> = ReadColumns<'a, R, O> where Self: 'a; + type Index = usize; + + fn merge_regions<'a>(regions: impl Iterator + Clone) -> Self + where + Self: 'a, + { + let cols = regions.clone().map(|r| r.inner.len()).max().unwrap_or(0); + let len_iter = regions.clone().map(|r| r.inner.len()).filter(|&l| l > 0); + debug_assert_eq!(len_iter.clone().min(), len_iter.max()); + + let mut inner = Vec::with_capacity(cols); + let mut offsets = Vec::with_capacity(cols); + for col in 0..cols { + inner.push(R::merge_regions( + regions.clone().flat_map(|r| r.inner.get(col)), + )); + offsets.push(O::default()); + } + + Self { inner, offsets } + } + + fn index(&self, index: Self::Index) -> Self::ReadItem<'_> { + ReadColumns { + columns: self, + index, + } + } + + fn reserve_regions<'a, I>(&mut self, regions: I) + where + Self: 'a, + I: Iterator + Clone, + { + for region in regions.clone() { + 
while self.inner.len() < region.inner.len() { + self.inner.push(R::default()); + } + } + for (index, inner) in self.inner.iter_mut().enumerate() { + inner.reserve_regions(regions.clone().flat_map(|r| r.inner.get(index))) + } + } + + fn clear(&mut self) { + for inner in &mut self.inner { + inner.clear(); + } + for offset in &mut self.offsets { + offset.clear(); + } + } + + fn heap_size(&self, mut callback: F) { + for inner in &self.inner { + inner.heap_size(&mut callback); + } + for offset in &self.offsets { + offset.heap_size(&mut callback); + } + } +} + +impl Default for FixedColumnsRegion +where + R: Region, + O: OffsetContainer, +{ + fn default() -> Self { + Self { + inner: Vec::default(), + offsets: Vec::default(), + } + } +} + +/// Read the values of a row. +pub struct ReadColumns<'a, R, O> { + /// Storage for columns. + columns: &'a FixedColumnsRegion, + /// Row index. + index: usize, +} + +impl<'a, R, O> Clone for ReadColumns<'a, R, O> { + fn clone(&self) -> Self { + *self + } +} + +impl<'a, R, O> Copy for ReadColumns<'a, R, O> {} + +impl<'a, R, O> Debug for ReadColumns<'a, R, O> +where + R: Region, + R::ReadItem<'a>: Debug, + O: OffsetContainer, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_list().entries(self).finish() + } +} + +impl<'a, R, O> ReadColumns<'a, R, O> +where + R: Region, + O: OffsetContainer, +{ + /// Iterate the individual values of a row. + pub fn iter(&'a self) -> ReadColumnsIter<'a, R, O> { + self.into_iter() + } + + /// Get the element at `offset`. + pub fn get(&self, offset: usize) -> R::ReadItem<'a> { + self.columns.inner[offset].index(self.columns.offsets[offset].index(self.index)) + } + + /// Returns the length of this row. + pub fn len(&self) -> usize { + self.columns.inner.len() + } + + /// Returns `true` if this row is empty. 
+ pub fn is_empty(&self) -> bool { + self.columns.inner.is_empty() + } +} + +impl<'a, R, O> IntoIterator for &ReadColumns<'a, R, O> +where + R: Region, + O: OffsetContainer, +{ + type Item = R::ReadItem<'a>; + type IntoIter = ReadColumnsIter<'a, R, O>; + + fn into_iter(self) -> Self::IntoIter { + ReadColumnsIter { + iter: self.columns.inner.iter().zip(self.columns.offsets.iter()), + index: self.index, + } + } +} + +/// An iterator over the elements of a row. +pub struct ReadColumnsIter<'a, R, O> { + iter: std::iter::Zip, std::slice::Iter<'a, O>>, + index: usize, +} + +impl<'a, R, O> Iterator for ReadColumnsIter<'a, R, O> +where + R: Region, + O: OffsetContainer, +{ + type Item = R::ReadItem<'a>; + + fn next(&mut self) -> Option { + self.iter + .next() + .map(|(region, offsets)| region.index(offsets.index(self.index))) + } +} + +impl<'a, R, O> CopyOnto> for ReadColumns<'a, R, O> +where + R: Region, + O: OffsetContainer, +{ + fn copy_onto( + self, + target: &mut FixedColumnsRegion, + ) -> as Region>::Index { + // Ensure all required regions exist. + debug_assert!( + target.inner.is_empty() || self.len() == target.inner.len(), + "All elements in a fixed columns region must have equal length." + ); + while target.inner.len() < self.len() { + target.inner.push(R::default()); + target.offsets.push(O::default()); + } + for ((item, region), offsets) in self + .iter() + .zip(target.inner.iter_mut()) + .zip(target.offsets.iter_mut()) + { + let index = item.copy_onto(region); + offsets.push(index); + } + target.offsets.first().map(|o| o.len() - 1).unwrap_or(0) + } +} + +impl<'a, R, O, T> CopyOnto> for &'a [T] +where + R: Region, + O: OffsetContainer, + &'a T: CopyOnto, +{ + fn copy_onto( + self, + target: &mut FixedColumnsRegion, + ) -> as Region>::Index { + // Ensure all required regions exist. + debug_assert!( + target.inner.is_empty() || self.len() == target.inner.len(), + "All elements in a fixed columns region must have equal length." 
+ ); + while target.inner.len() < self.len() { + target.inner.push(R::default()); + target.offsets.push(O::default()); + } + for ((item, region), offsets) in self + .iter() + .zip(target.inner.iter_mut()) + .zip(target.offsets.iter_mut()) + { + let index = item.copy_onto(region); + offsets.push(index); + } + target.offsets.first().map(|o| o.len() - 1).unwrap_or(0) + } +} + +impl CopyOnto> for Vec +where + R: Region, + O: OffsetContainer, + T: CopyOnto, +{ + fn copy_onto( + self, + target: &mut FixedColumnsRegion, + ) -> as Region>::Index { + // Ensure all required regions exist. + debug_assert!( + target.inner.is_empty() || self.len() == target.inner.len(), + "All elements in a fixed columns region must have equal length." + ); + while target.inner.len() < self.len() { + target.inner.push(R::default()); + target.offsets.push(O::default()); + } + for ((item, region), offsets) in self + .into_iter() + .zip(target.inner.iter_mut()) + .zip(target.offsets.iter_mut()) + { + let index = item.copy_onto(region); + offsets.push(index); + } + target.offsets.first().map(|o| o.len() - 1).unwrap_or(0) + } +} + +impl<'a, R, O, T> CopyOnto> for &'a Vec +where + R: Region, + O: OffsetContainer, + &'a T: CopyOnto, +{ + fn copy_onto( + self, + target: &mut FixedColumnsRegion, + ) -> as Region>::Index { + // Ensure all required regions exist. + debug_assert!( + target.inner.is_empty() || self.len() == target.inner.len(), + "All elements in a fixed columns region must have equal length." 
+ ); + while target.inner.len() < self.len() { + target.inner.push(R::default()); + target.offsets.push(O::default()); + } + for (index, offsets) in self + .iter() + .zip(target.inner.iter_mut()) + .map(|(item, region)| item.copy_onto(region)) + .zip(target.offsets.iter_mut()) + { + offsets.push(index); + } + target.offsets.first().map(|o| o.len() - 1).unwrap_or(0) + } +} + +impl CopyOnto> for CopyIter +where + R: Region, + O: OffsetContainer, + T: CopyOnto, + I: IntoIterator, +{ + #[inline] + fn copy_onto( + self, + target: &mut FixedColumnsRegion, + ) -> as Region>::Index { + let check_length = !target.inner.is_empty(); + let mut len = 0; + for (column, value) in self.0.into_iter().enumerate() { + // Ensure all required regions exist. + if check_length { + debug_assert!(column < target.inner.len()) + } else { + debug_assert!(target.inner.len() <= column); + target.inner.push(R::default()); + target.offsets.push(O::default()); + } + let index = value.copy_onto(&mut target.inner[column]); + target.offsets[column].push(index); + len += 1; + } + debug_assert_eq!(len, target.inner.len()); + target.offsets.first().map(|o| o.len() - 1).unwrap_or(0) + } +} + +#[cfg(test)] +mod tests { + use crate::impls::deduplicate::{CollapseSequence, ConsecutiveOffsetPairs}; + use crate::impls::fixed_columns::FixedColumnsRegion; + use crate::impls::offsets::OffsetOptimized; + use crate::CopyIter; + use crate::{CopyOnto, MirrorRegion, Region, StringRegion}; + + #[test] + fn test_matrix() { + let data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]; + + let mut r = FixedColumnsRegion::, OffsetOptimized>::default(); + + let mut indices = Vec::with_capacity(data.len()); + + for row in &data { + let index = row.as_slice().copy_onto(&mut r); + indices.push(index); + } + + for (&index, row) in indices.iter().zip(&data) { + assert!(row.iter().copied().eq(r.index(index).iter())); + } + } + + #[test] + fn test_string_vec() { + let data = vec![ + vec!["1".to_string(), "2".to_string(), "3".to_string()], + 
vec!["4".to_string(), "5".to_string(), "6".to_string()], + vec!["7".to_string(), "8".to_string(), "9".to_string()], + ]; + + let mut r = FixedColumnsRegion::< + CollapseSequence>, + OffsetOptimized, + >::default(); + + let mut indices = Vec::with_capacity(data.len()); + + for row in &data { + let index = row.copy_onto(&mut r); + indices.push(index); + } + + for (&index, row) in indices.iter().zip(&data) { + assert!(row.iter().eq(r.index(index).iter())); + } + + println!("{r:?}"); + } + + #[test] + fn test_str_vec() { + let data = [ + vec!["1", "2", "3"], + vec!["4", "5", "6"], + vec!["7", "8", "9"], + ]; + + let mut r = + FixedColumnsRegion::, OffsetOptimized>::default(); + + let mut indices = Vec::with_capacity(data.len()); + + for row in &data { + let index = row.copy_onto(&mut r); + indices.push(index); + } + + for (&index, row) in indices.iter().zip(&data) { + assert!(row.iter().copied().eq(r.index(index).iter())); + } + + println!("{r:?}"); + } + + #[test] + fn test_str_iter() { + let data = [ + vec!["1", "2", "3"], + vec!["4", "5", "6"], + vec!["7", "8", "9"], + ]; + + let mut r = + FixedColumnsRegion::, OffsetOptimized>::default(); + + let mut indices = Vec::with_capacity(data.len()); + + for row in &data { + let index = CopyIter(row.iter()).copy_onto(&mut r); + indices.push(index); + } + + for (&index, row) in indices.iter().zip(&data) { + assert!(row.iter().copied().eq(r.index(index).iter())); + } + + println!("{r:?}"); + } +} diff --git a/src/impls/mod.rs b/src/impls/mod.rs index be5a6a1..7109da0 100644 --- a/src/impls/mod.rs +++ b/src/impls/mod.rs @@ -3,6 +3,7 @@ pub mod codec; pub mod columns; pub mod deduplicate; +pub mod fixed_columns; pub mod mirror; pub mod offsets; pub mod option;