Skip to content

Commit

Permalink
no more encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
teh-cmc committed Oct 14, 2024
1 parent ca6ad40 commit b5a4456
Show file tree
Hide file tree
Showing 5 changed files with 5 additions and 121 deletions.
85 changes: 1 addition & 84 deletions crates/store/re_chunk_store/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,47 +15,6 @@ use crate::{ChunkStore, ColumnMetadata};

// --- Descriptors ---

/// When selecting secondary component columns, specify how the joined data should be encoded.
///
/// Because range-queries often involve repeating the same joined-in data multiple times,
/// the strategy we choose for joining can have a significant impact on the size and memory
/// overhead of the `RecordBatch`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub enum JoinEncoding {
/// Slice the `RecordBatch` to minimal overlapping sub-ranges.
///
/// This is the default, and should always be used for the POV component which defines
/// the optimal size for `RecordBatch`.
///
/// This minimizes the need for allocation, but at the cost of `RecordBatch`es that are
/// almost always smaller than the optimal size. In the common worst-case, this will result
/// in single-row `RecordBatch`es.
#[default]
OverlappingSlice,

/// Dictionary-encode the joined column.
///
/// Using dictionary-encoding allows any repeated data to be shared between rows,
/// but comes with the cost of an extra dictionary-lookup indirection.
///
/// Note that this changes the physical type of the returned column.
///
/// Using this encoding for complex types is incompatible with some arrow libraries.
DictionaryEncode,
//
// TODO(jleibs):
// RepeatCopy,
//
// Repeat the joined column by physically copying the data.
//
// This will always allocate a new column in the `RecordBatch`, matching the size of the
// POV component.
//
// This is the most expensive option, but can make working with the data more efficient,
// especially when the copied column is small.
//
}

// TODO(#6889): At some point all these descriptors needs to be interned and have handles or
// something. And of course they need to be codegen. But we'll get there once we're back to
// natively tagged components.
Expand Down Expand Up @@ -184,11 +143,6 @@ pub struct ComponentColumnDescriptor {
/// we introduce mono-type optimization, this might be a native type instead.
pub store_datatype: ArrowDatatype,

/// How the data will be joined into the resulting `RecordBatch`.
//
// TODO(cmc): remove with the old re_dataframe.
pub join_encoding: JoinEncoding,

/// Whether this column represents static data.
pub is_static: bool,

Expand Down Expand Up @@ -219,7 +173,6 @@ impl Ord for ComponentColumnDescriptor {
archetype_name,
archetype_field_name,
component_name,
join_encoding: _,
store_datatype: _,
is_static: _,
is_indicator: _,
Expand All @@ -242,7 +195,6 @@ impl std::fmt::Display for ComponentColumnDescriptor {
archetype_name,
archetype_field_name,
component_name,
join_encoding: _,
store_datatype: _,
is_static,
is_indicator: _,
Expand Down Expand Up @@ -288,7 +240,6 @@ impl ComponentColumnDescriptor {
archetype_name,
archetype_field_name,
component_name,
join_encoding: _,
store_datatype: _,
is_static,
is_indicator,
Expand Down Expand Up @@ -324,14 +275,7 @@ impl ComponentColumnDescriptor {

#[inline]
pub fn returned_datatype(&self) -> ArrowDatatype {
match self.join_encoding {
JoinEncoding::OverlappingSlice => self.store_datatype.clone(),
JoinEncoding::DictionaryEncode => ArrowDatatype::Dictionary(
arrow2::datatypes::IntegerType::Int32,
std::sync::Arc::new(self.store_datatype.clone()),
true,
),
}
self.store_datatype.clone()
}

#[inline]
Expand All @@ -348,12 +292,6 @@ impl ComponentColumnDescriptor {
// TODO(#6889): This needs some proper sorbetization -- I just threw these names randomly.
.with_metadata(self.metadata())
}

#[inline]
pub fn with_join_encoding(mut self, join_encoding: JoinEncoding) -> Self {
self.join_encoding = join_encoding;
self
}
}

// --- Selectors ---
Expand Down Expand Up @@ -426,11 +364,6 @@ pub struct ComponentColumnSelector {
/// This string will be flexibly matched against the available component names.
/// Valid matches are case invariant matches of either the full name or the short name.
pub component_name: String,

/// How to join the data into the `RecordBatch`.
//
// TODO(cmc): remove once old `re_dataframe` is gone.
pub join_encoding: JoinEncoding,
}

impl From<ComponentColumnDescriptor> for ComponentColumnSelector {
Expand All @@ -439,7 +372,6 @@ impl From<ComponentColumnDescriptor> for ComponentColumnSelector {
Self {
entity_path: desc.entity_path.clone(),
component_name: desc.component_name.to_string(),
join_encoding: desc.join_encoding,
}
}
}
Expand All @@ -451,7 +383,6 @@ impl ComponentColumnSelector {
Self {
entity_path,
component_name: C::name().to_string(),
join_encoding: JoinEncoding::default(),
}
}

Expand All @@ -461,24 +392,15 @@ impl ComponentColumnSelector {
Self {
entity_path,
component_name: component_name.to_string(),
join_encoding: JoinEncoding::default(),
}
}

/// Specify how the data should be joined into the `RecordBatch`.
#[inline]
pub fn with_join_encoding(mut self, join_encoding: JoinEncoding) -> Self {
self.join_encoding = join_encoding;
self
}
}

impl std::fmt::Display for ComponentColumnSelector {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let Self {
entity_path,
component_name,
join_encoding: _,
} = self;

f.write_fmt(format_args!("{entity_path}:{component_name}"))
Expand All @@ -497,9 +419,6 @@ pub struct ArchetypeFieldColumnSelector {
/// The field within the `Archetype` associated with this data.
field: String,
/// How to join the data into the `RecordBatch`.
join_encoding: JoinEncoding,
}
*/

Expand Down Expand Up @@ -743,7 +662,6 @@ impl ChunkStore {
// It might be wrapped further in e.g. a dict, but at the very least
// it's a list.
store_datatype: ArrowListArray::<i32>::default_datatype(datatype.clone()),
join_encoding: JoinEncoding::default(),
is_static,
is_indicator,
is_tombstone,
Expand Down Expand Up @@ -827,7 +745,6 @@ impl ChunkStore {
archetype_field_name: None,
component_name,
store_datatype: ArrowListArray::<i32>::default_datatype(datatype.clone()),
join_encoding: selector.join_encoding,
is_static,
is_indicator,
is_tombstone,
Expand Down
4 changes: 2 additions & 2 deletions crates/store/re_chunk_store/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ mod writes;

pub use self::dataframe::{
ColumnDescriptor, ColumnSelector, ComponentColumnDescriptor, ComponentColumnSelector, Index,
IndexRange, IndexValue, JoinEncoding, QueryExpression, SparseFillStrategy,
TimeColumnDescriptor, TimeColumnSelector, ViewContentsSelector,
IndexRange, IndexValue, QueryExpression, SparseFillStrategy, TimeColumnDescriptor,
TimeColumnSelector, ViewContentsSelector,
};
pub use self::events::{ChunkStoreDiff, ChunkStoreDiffKind, ChunkStoreEvent};
pub use self::gc::{GarbageCollectionOptions, GarbageCollectionTarget};
Expand Down
17 changes: 1 addition & 16 deletions crates/store/re_dataframe/src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ use re_chunk::{
};
use re_chunk_store::{
ColumnDescriptor, ColumnSelector, ComponentColumnDescriptor, ComponentColumnSelector, Index,
IndexValue, JoinEncoding, QueryExpression, SparseFillStrategy, TimeColumnDescriptor,
TimeColumnSelector,
IndexValue, QueryExpression, SparseFillStrategy, TimeColumnDescriptor, TimeColumnSelector,
};
use re_log_types::ResolvedTimeRange;
use re_types_core::components::ClearIsRecursive;
Expand Down Expand Up @@ -399,7 +398,6 @@ impl QueryHandle<'_> {
let ComponentColumnSelector {
entity_path: selected_entity_path,
component_name: selected_component_name,
join_encoding: _,
} = selected_column;

view_contents
Expand All @@ -425,7 +423,6 @@ impl QueryHandle<'_> {
selected_component_name.clone(),
),
store_datatype: arrow2::datatypes::DataType::Null,
join_encoding: JoinEncoding::default(),
is_static: false,
is_indicator: false,
is_tombstone: false,
Expand Down Expand Up @@ -1575,7 +1572,6 @@ mod tests {
filtered_point_of_view: Some(ComponentColumnSelector {
entity_path: "no/such/entity".into(),
component_name: MyPoint::name().to_string(),
join_encoding: Default::default(),
}),
..Default::default()
};
Expand Down Expand Up @@ -1605,7 +1601,6 @@ mod tests {
filtered_point_of_view: Some(ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: "AComponentColumnThatDoesntExist".into(),
join_encoding: Default::default(),
}),
..Default::default()
};
Expand Down Expand Up @@ -1635,7 +1630,6 @@ mod tests {
filtered_point_of_view: Some(ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: MyPoint::name().to_string(),
join_encoding: Default::default(),
}),
..Default::default()
};
Expand Down Expand Up @@ -1675,7 +1669,6 @@ mod tests {
filtered_point_of_view: Some(ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: MyColor::name().to_string(),
join_encoding: Default::default(),
}),
..Default::default()
};
Expand Down Expand Up @@ -1901,22 +1894,18 @@ mod tests {
ColumnSelector::Component(ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: MyColor::name().to_string(),
join_encoding: Default::default(),
}),
ColumnSelector::Component(ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: MyColor::name().to_string(),
join_encoding: Default::default(),
}),
ColumnSelector::Component(ComponentColumnSelector {
entity_path: "non_existing_entity".into(),
component_name: MyColor::name().to_string(),
join_encoding: Default::default(),
}),
ColumnSelector::Component(ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: "AComponentColumnThatDoesntExist".into(),
join_encoding: Default::default(),
}),
]),
..Default::default()
Expand Down Expand Up @@ -1993,17 +1982,14 @@ mod tests {
ColumnSelector::Component(ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: MyPoint::name().to_string(),
join_encoding: Default::default(),
}),
ColumnSelector::Component(ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: MyColor::name().to_string(),
join_encoding: Default::default(),
}),
ColumnSelector::Component(ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: MyLabel::name().to_string(),
join_encoding: Default::default(),
}),
]),
..Default::default()
Expand Down Expand Up @@ -2197,7 +2183,6 @@ mod tests {
filtered_point_of_view: Some(ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: MyPoint::name().to_string(),
join_encoding: Default::default(),
}),
..Default::default()
};
Expand Down
2 changes: 1 addition & 1 deletion rerun_notebook/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 0 additions & 18 deletions rerun_py/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,6 @@ impl From<ComponentColumnDescriptor> for PyComponentColumnDescriptor {

#[pymethods]
impl PyComponentColumnDescriptor {
fn with_dictionary_encoding(&self) -> Self {
Self(
self.0
.clone()
.with_join_encoding(re_chunk_store::JoinEncoding::DictionaryEncode),
)
}

fn __repr__(&self) -> String {
format!(
"Component({}:{})",
Expand Down Expand Up @@ -134,18 +126,9 @@ impl PyComponentColumnSelector {
Self(ComponentColumnSelector {
entity_path: entity_path.into(),
component_name: component_name.0,
join_encoding: Default::default(),
})
}

fn with_dictionary_encoding(&self) -> Self {
Self(
self.0
.clone()
.with_join_encoding(re_chunk_store::JoinEncoding::DictionaryEncode),
)
}

fn __repr__(&self) -> String {
format!(
"Component({}:{})",
Expand Down Expand Up @@ -630,7 +613,6 @@ impl PyRecording {
let selector = ComponentColumnSelector {
entity_path: entity_path.clone(),
component_name: component_name.into(),
join_encoding: Default::default(),
};

self.store
Expand Down

0 comments on commit b5a4456

Please sign in to comment.