From 670a94f118d61462b720d51d62f164986d23cb23 Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Thu, 5 Sep 2024 13:29:27 +0800 Subject: [PATCH 01/32] feat: variable vnode count support in table distribution (#18373) Signed-off-by: Bugen Zhao --- .../executor/join/distributed_lookup_join.rs | 8 +- .../src/executor/join/local_lookup_join.rs | 9 +- src/batch/src/executor/log_row_seq_scan.rs | 6 +- src/batch/src/executor/row_seq_scan.rs | 5 +- src/common/src/hash/consistent_hash/bitmap.rs | 14 +++ src/common/src/hash/consistent_hash/vnode.rs | 5 + src/common/src/hash/table_distribution.rs | 119 ++++++++---------- src/common/src/util/scan_range.rs | 4 +- src/ctl/src/cmd_impl/table/scan.rs | 8 +- src/frontend/src/scheduler/plan_fragmenter.rs | 3 +- .../hummock_test/src/state_store_tests.rs | 5 +- .../src/hummock/iterator/change_log.rs | 5 +- .../log_store_impl/kv_log_store/serde.rs | 22 ++-- .../src/common/table/test_state_table.rs | 16 +-- src/stream/src/executor/watermark_filter.rs | 13 +- src/stream/src/from_proto/mview.rs | 2 +- src/stream/src/from_proto/watermark_filter.rs | 5 +- 17 files changed, 127 insertions(+), 122 deletions(-) diff --git a/src/batch/src/executor/join/distributed_lookup_join.rs b/src/batch/src/executor/join/distributed_lookup_join.rs index 1068ffd7f3349..74d7843013e4d 100644 --- a/src/batch/src/executor/join/distributed_lookup_join.rs +++ b/src/batch/src/executor/join/distributed_lookup_join.rs @@ -17,8 +17,9 @@ use std::mem::swap; use futures::pin_mut; use itertools::Itertools; +use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnDesc, ColumnId, Field, Schema}; -use risingwave_common::hash::{HashKey, HashKeyDispatcher}; +use risingwave_common::hash::{HashKey, HashKeyDispatcher, VirtualNode}; use risingwave_common::memory::MemoryContext; use risingwave_common::row::OwnedRow; use risingwave_common::types::{DataType, Datum}; @@ -30,7 +31,7 @@ use risingwave_pb::batch_plan::plan_node::NodeBody; use risingwave_pb::common::BatchQueryEpoch; use risingwave_storage::store::PrefetchOptions; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::{TableDistribution, TableIter}; +use risingwave_storage::table::TableIter; use risingwave_storage::{dispatch_state_store, StateStore}; use crate::error::Result; @@ -194,7 +195,8 @@ impl BoxedExecutorBuilder for DistributedLookupJoinExecutorBuilder { .collect(); // Lookup Join always contains distribution key, so we don't need vnode bitmap - let vnodes = Some(TableDistribution::all_vnodes()); + // TODO(var-vnode): use vnode count from table desc + let vnodes = Some(Bitmap::ones(VirtualNode::COUNT).into()); dispatch_state_store!(source.context().state_store(), state_store, { let table = StorageTable::new_partial(state_store, column_ids, vnodes, table_desc); let inner_side_builder = InnerSideExecutorBuilder::new( diff --git a/src/batch/src/executor/join/local_lookup_join.rs b/src/batch/src/executor/join/local_lookup_join.rs index a3be00fc39a22..7c7a08af5d873 100644 --- a/src/batch/src/executor/join/local_lookup_join.rs +++ b/src/batch/src/executor/join/local_lookup_join.rs @@ -17,7 +17,7 @@ use std::marker::PhantomData; use anyhow::Context; use itertools::Itertools; -use risingwave_common::bitmap::BitmapBuilder; +use risingwave_common::bitmap::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::{ColumnDesc, Field, Schema}; use risingwave_common::hash::table_distribution::TableDistribution; use risingwave_common::hash::{ @@ -408,12 +408,11 @@ impl 
BoxedExecutorBuilder for LocalLookupJoinExecutorBuilder { }) .collect(); + // TODO(var-vnode): use vnode count from table desc + let vnodes = Some(Bitmap::ones(VirtualNode::COUNT).into()); let inner_side_builder = InnerSideExecutorBuilder { table_desc: table_desc.clone(), - table_distribution: TableDistribution::new_from_storage_table_desc( - Some(TableDistribution::all_vnodes()), - table_desc, - ), + table_distribution: TableDistribution::new_from_storage_table_desc(vnodes, table_desc), vnode_mapping, outer_side_key_types, inner_side_schema, diff --git a/src/batch/src/executor/log_row_seq_scan.rs b/src/batch/src/executor/log_row_seq_scan.rs index 7106eaec1b760..be2a11b756946 100644 --- a/src/batch/src/executor/log_row_seq_scan.rs +++ b/src/batch/src/executor/log_row_seq_scan.rs @@ -22,13 +22,14 @@ use prometheus::Histogram; use risingwave_common::array::{DataChunk, Op}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnId, Field, Schema}; +use risingwave_common::hash::VirtualNode; use risingwave_common::row::{Row, RowExt}; use risingwave_common::types::ScalarImpl; use risingwave_pb::batch_plan::plan_node::NodeBody; use risingwave_pb::common::{batch_query_epoch, BatchQueryEpoch}; use risingwave_pb::plan_common::StorageTableDesc; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::{collect_data_chunk, TableDistribution}; +use risingwave_storage::table::collect_data_chunk; use risingwave_storage::{dispatch_state_store, StateStore}; use super::{BoxedDataChunkStream, BoxedExecutor, BoxedExecutorBuilder, Executor, ExecutorBuilder}; @@ -106,7 +107,8 @@ impl BoxedExecutorBuilder for LogStoreRowSeqScanExecutorBuilder { Some(vnodes) => Some(Bitmap::from(vnodes).into()), // This is possible for dml. vnode_bitmap is not filled by scheduler. // Or it's single distribution, e.g., distinct agg. We scan in a single executor. - None => Some(TableDistribution::all_vnodes()), + // TODO(var-vnode): use vnode count from table desc + None => Some(Bitmap::ones(VirtualNode::COUNT).into()), }; let chunk_size = source.context.get_config().developer.chunk_size as u32; diff --git a/src/batch/src/executor/row_seq_scan.rs b/src/batch/src/executor/row_seq_scan.rs index b897dbd813787..7c7244d954764 100644 --- a/src/batch/src/executor/row_seq_scan.rs +++ b/src/batch/src/executor/row_seq_scan.rs @@ -21,6 +21,7 @@ use prometheus::Histogram; use risingwave_common::array::DataChunk; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnId, Schema}; +use risingwave_common::hash::VirtualNode; use risingwave_common::row::{OwnedRow, Row}; use risingwave_common::types::{DataType, Datum}; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; @@ -32,7 +33,6 @@ use risingwave_pb::plan_common::as_of::AsOfType; use risingwave_pb::plan_common::{as_of, PbAsOf, StorageTableDesc}; use risingwave_storage::store::PrefetchOptions; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::TableDistribution; use risingwave_storage::{dispatch_state_store, StateStore}; use crate::error::{BatchError, Result}; @@ -210,7 +210,8 @@ impl BoxedExecutorBuilder for RowSeqScanExecutorBuilder { Some(vnodes) => Some(Bitmap::from(vnodes).into()), // This is possible for dml. vnode_bitmap is not filled by scheduler. // Or it's single distribution, e.g., distinct agg. We scan in a single executor. 
- None => Some(TableDistribution::all_vnodes()), + // TODO(var-vnode): use vnode count from table desc + None => Some(Bitmap::ones(VirtualNode::COUNT).into()), }; let scan_ranges = { diff --git a/src/common/src/hash/consistent_hash/bitmap.rs b/src/common/src/hash/consistent_hash/bitmap.rs index 773231ba36a89..eee6a64a2b42c 100644 --- a/src/common/src/hash/consistent_hash/bitmap.rs +++ b/src/common/src/hash/consistent_hash/bitmap.rs @@ -15,6 +15,7 @@ use std::ops::RangeInclusive; use crate::bitmap::Bitmap; +use crate::hash::table_distribution::SINGLETON_VNODE; use crate::hash::VirtualNode; /// An extension trait for `Bitmap` to support virtual node operations. @@ -36,4 +37,17 @@ impl Bitmap { self.high_ranges() .map(|r| (VirtualNode::from_index(*r.start())..=VirtualNode::from_index(*r.end()))) } + + /// Returns whether only the [`SINGLETON_VNODE`] is set in the bitmap. + /// + /// Note that this method returning `true` does not imply that the bitmap was created by + /// [`VnodeBitmapExt::singleton`], or that the bitmap has length 1. + pub fn is_singleton(&self) -> bool { + self.count_ones() == 1 && self.iter_vnodes().next().unwrap() == SINGLETON_VNODE + } + + /// Creates a bitmap with length 1 and the single bit set. + pub fn singleton() -> Self { + Self::ones(1) + } } diff --git a/src/common/src/hash/consistent_hash/vnode.rs b/src/common/src/hash/consistent_hash/vnode.rs index f528544689f31..dd4095535fdf3 100644 --- a/src/common/src/hash/consistent_hash/vnode.rs +++ b/src/common/src/hash/consistent_hash/vnode.rs @@ -114,6 +114,11 @@ impl VirtualNode { } } +impl VirtualNode { + pub const COUNT_FOR_TEST: usize = Self::COUNT; + pub const MAX_FOR_TEST: VirtualNode = Self::MAX; +} + impl VirtualNode { // `compute_chunk` is used to calculate the `VirtualNode` for the columns in the // chunk. When only one column is provided and its type is `Serial`, we consider the column to diff --git a/src/common/src/hash/table_distribution.rs b/src/common/src/hash/table_distribution.rs index 9be9cd2abafb2..480483bc96a5d 100644 --- a/src/common/src/hash/table_distribution.rs +++ b/src/common/src/hash/table_distribution.rs @@ -13,30 +13,34 @@ // limitations under the License. use std::mem::replace; -use std::ops::Deref; use std::sync::{Arc, LazyLock}; use itertools::Itertools; use risingwave_pb::plan_common::StorageTableDesc; -use tracing::warn; use crate::array::{Array, DataChunk, PrimitiveArray}; -use crate::bitmap::{Bitmap, BitmapBuilder}; +use crate::bitmap::Bitmap; use crate::hash::VirtualNode; use crate::row::Row; use crate::util::iter_util::ZipEqFast; -/// For tables without distribution (singleton), the `DEFAULT_VNODE` is encoded. -pub const DEFAULT_VNODE: VirtualNode = VirtualNode::ZERO; +/// For tables without distribution (singleton), the `SINGLETON_VNODE` is encoded. +pub const SINGLETON_VNODE: VirtualNode = VirtualNode::ZERO; + +use super::VnodeBitmapExt; #[derive(Debug, Clone)] enum ComputeVnode { Singleton, DistKeyIndices { + /// Virtual nodes that the table is partitioned into. + vnodes: Arc, /// Indices of distribution key for computing vnode, based on the pk columns of the table. dist_key_in_pk_indices: Vec, }, VnodeColumnIndex { + /// Virtual nodes that the table is partitioned into. + vnodes: Arc, /// Index of vnode column. vnode_col_idx_in_pk: usize, }, @@ -47,13 +51,8 @@ enum ComputeVnode { pub struct TableDistribution { /// The way to compute vnode provided primary key compute_vnode: ComputeVnode, - - /// Virtual nodes that the table is partitioned into. 
-    vnodes: Arc<Bitmap>,
 }
 
-pub const SINGLETON_VNODE: VirtualNode = DEFAULT_VNODE;
-
 impl TableDistribution {
     pub fn new_from_storage_table_desc(
         vnodes: Option<Arc<Bitmap>>,
@@ -75,69 +74,32 @@ impl TableDistribution {
     ) -> Self {
         let compute_vnode = if let Some(vnode_col_idx_in_pk) = vnode_col_idx_in_pk {
             ComputeVnode::VnodeColumnIndex {
+                vnodes: vnodes.unwrap_or_else(|| Bitmap::singleton().into()),
                 vnode_col_idx_in_pk,
             }
         } else if !dist_key_in_pk_indices.is_empty() {
             ComputeVnode::DistKeyIndices {
+                vnodes: vnodes.expect("vnodes must be `Some` as dist key indices are set"),
                 dist_key_in_pk_indices,
             }
         } else {
             ComputeVnode::Singleton
         };
 
-        let vnodes = vnodes.unwrap_or_else(Self::singleton_vnode_bitmap);
-        if let ComputeVnode::Singleton = &compute_vnode {
-            if &vnodes != Self::singleton_vnode_bitmap_ref() && &vnodes != Self::all_vnodes_ref() {
-                warn!(
-                    ?vnodes,
-                    "singleton distribution get non-singleton vnode bitmap"
-                );
-            }
-        }
-
-        Self {
-            compute_vnode,
-            vnodes,
-        }
+        Self { compute_vnode }
     }
 
     pub fn is_singleton(&self) -> bool {
         matches!(&self.compute_vnode, ComputeVnode::Singleton)
     }
 
-    pub fn singleton_vnode_bitmap_ref() -> &'static Arc<Bitmap> {
-        /// A bitmap that only the default vnode is set.
-        static SINGLETON_VNODES: LazyLock<Arc<Bitmap>> = LazyLock::new(|| {
-            let mut vnodes = BitmapBuilder::zeroed(VirtualNode::COUNT);
-            vnodes.set(SINGLETON_VNODE.to_index(), true);
-            vnodes.finish().into()
-        });
-
-        SINGLETON_VNODES.deref()
-    }
-
-    pub fn singleton_vnode_bitmap() -> Arc<Bitmap> {
-        Self::singleton_vnode_bitmap_ref().clone()
-    }
-
-    pub fn all_vnodes_ref() -> &'static Arc<Bitmap> {
-        /// A bitmap that all vnodes are set.
-        static ALL_VNODES: LazyLock<Arc<Bitmap>> =
-            LazyLock::new(|| Bitmap::ones(VirtualNode::COUNT).into());
-        &ALL_VNODES
-    }
-
-    pub fn all_vnodes() -> Arc<Bitmap> {
-        Self::all_vnodes_ref().clone()
-    }
-
     /// Distribution that accesses all vnodes, mainly used for tests.
-    pub fn all(dist_key_in_pk_indices: Vec<usize>) -> Self {
+    pub fn all(dist_key_in_pk_indices: Vec<usize>, vnode_count: usize) -> Self {
         Self {
             compute_vnode: ComputeVnode::DistKeyIndices {
+                vnodes: Bitmap::ones(vnode_count).into(),
                 dist_key_in_pk_indices,
             },
-            vnodes: Self::all_vnodes(),
         }
     }
 
@@ -145,20 +107,39 @@ impl TableDistribution {
     pub fn singleton() -> Self {
         Self {
             compute_vnode: ComputeVnode::Singleton,
-            vnodes: Self::singleton_vnode_bitmap(),
         }
     }
 
     pub fn update_vnode_bitmap(&mut self, new_vnodes: Arc<Bitmap>) -> Arc<Bitmap> {
-        if self.is_singleton() && &new_vnodes != Self::singleton_vnode_bitmap_ref() {
-            warn!(?new_vnodes, "update vnode on singleton distribution");
+        match &mut self.compute_vnode {
+            ComputeVnode::Singleton => {
+                if !new_vnodes.is_singleton() {
+                    panic!(
+                        "update vnode bitmap on singleton distribution to non-singleton: {:?}",
+                        new_vnodes
+                    );
+                }
+                self.vnodes().clone() // not updated
+            }
+
+            ComputeVnode::DistKeyIndices { vnodes, .. }
+            | ComputeVnode::VnodeColumnIndex { vnodes, .. } => {
+                assert_eq!(vnodes.len(), new_vnodes.len());
+                replace(vnodes, new_vnodes)
+            }
         }
-        assert_eq!(self.vnodes.len(), new_vnodes.len());
-        replace(&mut self.vnodes, new_vnodes)
     }
 
+    /// Get vnode bitmap if distributed, or a dummy [`Bitmap::singleton()`] if singleton.
     pub fn vnodes(&self) -> &Arc<Bitmap> {
-        &self.vnodes
+        static SINGLETON_VNODES: LazyLock<Arc<Bitmap>> =
+            LazyLock::new(|| Bitmap::singleton().into());
+
+        match &self.compute_vnode {
+            ComputeVnode::DistKeyIndices { vnodes, .. } => vnodes,
+            ComputeVnode::VnodeColumnIndex { vnodes, .. } => vnodes,
+            ComputeVnode::Singleton => &SINGLETON_VNODES,
+        }
     }
 
     /// Get vnode value with given primary key.
@@ -166,11 +147,13 @@ impl TableDistribution { match &self.compute_vnode { ComputeVnode::Singleton => SINGLETON_VNODE, ComputeVnode::DistKeyIndices { + vnodes, dist_key_in_pk_indices, - } => compute_vnode(pk, dist_key_in_pk_indices, &self.vnodes), + } => compute_vnode(pk, dist_key_in_pk_indices, vnodes), ComputeVnode::VnodeColumnIndex { + vnodes, vnode_col_idx_in_pk, - } => get_vnode_from_row(pk, *vnode_col_idx_in_pk, &self.vnodes), + } => get_vnode_from_row(pk, *vnode_col_idx_in_pk, vnodes), } } @@ -178,22 +161,20 @@ impl TableDistribution { match &self.compute_vnode { ComputeVnode::Singleton => Some(SINGLETON_VNODE), ComputeVnode::DistKeyIndices { + vnodes, dist_key_in_pk_indices, } => dist_key_in_pk_indices .iter() .all(|&d| d < pk_prefix.len()) - .then(|| compute_vnode(pk_prefix, dist_key_in_pk_indices, &self.vnodes)), + .then(|| compute_vnode(pk_prefix, dist_key_in_pk_indices, vnodes)), ComputeVnode::VnodeColumnIndex { + vnodes, vnode_col_idx_in_pk, } => { if *vnode_col_idx_in_pk >= pk_prefix.len() { None } else { - Some(get_vnode_from_row( - pk_prefix, - *vnode_col_idx_in_pk, - &self.vnodes, - )) + Some(get_vnode_from_row(pk_prefix, *vnode_col_idx_in_pk, vnodes)) } } } @@ -230,6 +211,7 @@ impl TableDistribution { vec![SINGLETON_VNODE; chunk.capacity()] } ComputeVnode::DistKeyIndices { + vnodes, dist_key_in_pk_indices, } => { let dist_key_indices = dist_key_in_pk_indices @@ -243,13 +225,14 @@ impl TableDistribution { .map(|(vnode, vis)| { // Ignore the invisible rows. if vis { - check_vnode_is_set(vnode, &self.vnodes); + check_vnode_is_set(vnode, vnodes); } vnode }) .collect() } ComputeVnode::VnodeColumnIndex { + vnodes, vnode_col_idx_in_pk, } => { let array: &PrimitiveArray = @@ -262,7 +245,7 @@ impl TableDistribution { let vnode = VirtualNode::from_scalar(vnode); if vis { assert!(exist); - check_vnode_is_set(vnode, &self.vnodes); + check_vnode_is_set(vnode, vnodes); } vnode }) diff --git a/src/common/src/util/scan_range.rs b/src/common/src/util/scan_range.rs index fd056f1790444..5d5e84ed32085 100644 --- a/src/common/src/util/scan_range.rs +++ b/src/common/src/util/scan_range.rs @@ -159,7 +159,7 @@ mod tests { let pk = vec![1, 3, 2]; let dist_key_idx_in_pk = crate::catalog::get_dist_key_in_pk_indices(&dist_key, &pk).unwrap(); - let dist = TableDistribution::all(dist_key_idx_in_pk); + let dist = TableDistribution::all(dist_key_idx_in_pk, VirtualNode::COUNT_FOR_TEST); let mut scan_range = ScanRange::full_table_scan(); assert!(scan_range.try_compute_vnode(&dist).is_none()); @@ -185,7 +185,7 @@ mod tests { let pk = vec![1, 3, 2]; let dist_key_idx_in_pk = crate::catalog::get_dist_key_in_pk_indices(&dist_key, &pk).unwrap(); - let dist = TableDistribution::all(dist_key_idx_in_pk); + let dist = TableDistribution::all(dist_key_idx_in_pk, VirtualNode::COUNT_FOR_TEST); let mut scan_range = ScanRange::full_table_scan(); assert!(scan_range.try_compute_vnode(&dist).is_none()); diff --git a/src/ctl/src/cmd_impl/table/scan.rs b/src/ctl/src/cmd_impl/table/scan.rs index e5bba170bf97a..f5cee710a40fc 100644 --- a/src/ctl/src/cmd_impl/table/scan.rs +++ b/src/ctl/src/cmd_impl/table/scan.rs @@ -14,6 +14,8 @@ use anyhow::{anyhow, Result}; use futures::{pin_mut, StreamExt}; +use risingwave_common::bitmap::Bitmap; +use risingwave_common::hash::VirtualNode; use risingwave_frontend::TableCatalog; use risingwave_hummock_sdk::HummockReadEpoch; use risingwave_rpc_client::MetaClient; @@ -63,7 +65,8 @@ pub async fn make_state_table(hummock: S, table: &TableCatalog) - .collect(), table.pk().iter().map(|x| 
x.order_type).collect(), table.pk().iter().map(|x| x.column_index).collect(), - TableDistribution::all(table.distribution_key().to_vec()), // scan all vnodes + // TODO(var-vnode): use vnode count from table desc + TableDistribution::all(table.distribution_key().to_vec(), VirtualNode::COUNT), // scan all vnodes Some(table.value_indices.clone()), ) .await @@ -81,7 +84,8 @@ pub fn make_storage_table( Ok(StorageTable::new_partial( hummock, output_columns_ids, - Some(TableDistribution::all_vnodes()), + // TODO(var-vnode): use vnode count from table desc + Some(Bitmap::ones(VirtualNode::COUNT).into()), &table.table_desc().try_to_protobuf()?, )) } diff --git a/src/frontend/src/scheduler/plan_fragmenter.rs b/src/frontend/src/scheduler/plan_fragmenter.rs index 09e4cbc0bfa03..2ecae1d7f7642 100644 --- a/src/frontend/src/scheduler/plan_fragmenter.rs +++ b/src/frontend/src/scheduler/plan_fragmenter.rs @@ -1250,7 +1250,8 @@ fn derive_partitions( } let table_distribution = TableDistribution::new_from_storage_table_desc( - Some(TableDistribution::all_vnodes()), + // TODO(var-vnode): use vnode count from table desc + Some(Bitmap::ones(VirtualNode::COUNT).into()), &table_desc.try_to_protobuf()?, ); diff --git a/src/storage/hummock_test/src/state_store_tests.rs b/src/storage/hummock_test/src/state_store_tests.rs index 35f3d08a9ed8a..67da2150735af 100644 --- a/src/storage/hummock_test/src/state_store_tests.rs +++ b/src/storage/hummock_test/src/state_store_tests.rs @@ -24,7 +24,6 @@ use futures::{pin_mut, StreamExt}; use itertools::Itertools; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{TableId, TableOption}; -use risingwave_common::hash::table_distribution::TableDistribution; use risingwave_common::hash::VirtualNode; use risingwave_common::util::epoch::{test_epoch, EpochExt, MAX_EPOCH}; use risingwave_hummock_sdk::key::{prefixed_range_with_vnode, TableKeyRange}; @@ -1565,7 +1564,7 @@ async fn test_iter_log() { }, table_option: Default::default(), is_replicated: false, - vnodes: TableDistribution::all_vnodes(), + vnodes: Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into(), }) .await; @@ -1580,7 +1579,7 @@ async fn test_iter_log() { }, table_option: Default::default(), is_replicated: false, - vnodes: TableDistribution::all_vnodes(), + vnodes: Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into(), }) .await; // flush for about 10 times per epoch diff --git a/src/storage/src/hummock/iterator/change_log.rs b/src/storage/src/hummock/iterator/change_log.rs index 6fc99f29a80f3..ae8061c37b07d 100644 --- a/src/storage/src/hummock/iterator/change_log.rs +++ b/src/storage/src/hummock/iterator/change_log.rs @@ -527,8 +527,9 @@ mod tests { use bytes::Bytes; use itertools::Itertools; + use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::TableId; - use risingwave_common::hash::table_distribution::TableDistribution; + use risingwave_common::hash::VirtualNode; use risingwave_common::util::epoch::test_epoch; use risingwave_hummock_sdk::key::{TableKey, UserKey}; use risingwave_hummock_sdk::EpochWithGap; @@ -699,7 +700,7 @@ mod tests { }, table_option: Default::default(), is_replicated: false, - vnodes: TableDistribution::all_vnodes(), + vnodes: Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into(), }) .await; let logs = gen_test_data(epoch_count, 10000, 0.05, 0.2); diff --git a/src/stream/src/common/log_store_impl/kv_log_store/serde.rs b/src/stream/src/common/log_store_impl/kv_log_store/serde.rs index 92a3caf4cd2e3..17ab103d758b4 100644 --- 
a/src/stream/src/common/log_store_impl/kv_log_store/serde.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/serde.rs @@ -25,7 +25,7 @@ use itertools::Itertools; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::ColumnDesc; -use risingwave_common::hash::VirtualNode; +use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_common::row::{OwnedRow, Row, RowExt}; use risingwave_common::types::{DataType, ScalarImpl}; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; @@ -42,7 +42,7 @@ use risingwave_storage::error::StorageResult; use risingwave_storage::row_serde::row_serde_util::{serialize_pk, serialize_pk_with_vnode}; use risingwave_storage::row_serde::value_serde::ValueRowSerdeNew; use risingwave_storage::store::{StateStoreIterExt, StateStoreReadIter}; -use risingwave_storage::table::{compute_vnode, TableDistribution, SINGLETON_VNODE}; +use risingwave_storage::table::{compute_vnode, SINGLETON_VNODE}; use rw_futures_util::select_all; use crate::common::log_store_impl::kv_log_store::{ @@ -201,8 +201,7 @@ impl LogStoreRowSerde { let vnodes = match vnodes { Some(vnodes) => vnodes, - - None => TableDistribution::singleton_vnode_bitmap(), + None => Bitmap::singleton().into(), }; // epoch and seq_id. The seq_id of barrier is set null, and therefore the second order type @@ -216,7 +215,7 @@ impl LogStoreRowSerde { ); let dist_key_indices = if dist_key_indices.is_empty() { - if &vnodes != TableDistribution::singleton_vnode_bitmap_ref() { + if !vnodes.is_singleton() { warn!( ?vnodes, "singleton log store gets non-singleton vnode bitmap" @@ -946,7 +945,7 @@ mod tests { use risingwave_storage::store::{ FromStreamStateStoreIter, StateStoreIterItem, StateStoreReadIter, }; - use risingwave_storage::table::DEFAULT_VNODE; + use risingwave_storage::table::SINGLETON_VNODE; use tokio::sync::oneshot; use tokio::sync::oneshot::Sender; @@ -1024,7 +1023,7 @@ mod tests { seq_id += 1; } - let (key, encoded_barrier) = serde.serialize_barrier(epoch, DEFAULT_VNODE, false); + let (key, encoded_barrier) = serde.serialize_barrier(epoch, SINGLETON_VNODE, false); let key = remove_vnode_prefix(&key.0); match serde.deserialize(&encoded_barrier).unwrap() { (decoded_epoch, LogStoreRowOp::Barrier { is_checkpoint }) => { @@ -1062,7 +1061,8 @@ mod tests { seq_id += 1; } - let (key, encoded_checkpoint_barrier) = serde.serialize_barrier(epoch, DEFAULT_VNODE, true); + let (key, encoded_checkpoint_barrier) = + serde.serialize_barrier(epoch, SINGLETON_VNODE, true); let key = remove_vnode_prefix(&key.0); match serde.deserialize(&encoded_checkpoint_barrier).unwrap() { (decoded_epoch, LogStoreRowOp::Barrier { is_checkpoint }) => { @@ -1200,7 +1200,7 @@ mod tests { ) { let (ops, rows) = gen_test_data(base); let first_barrier = { - let (key, value) = serde.serialize_barrier(EPOCH0, DEFAULT_VNODE, true); + let (key, value) = serde.serialize_barrier(EPOCH0, SINGLETON_VNODE, true); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH0), value)) }; let stream = stream::once(async move { first_barrier }); @@ -1210,7 +1210,7 @@ mod tests { let stream = stream.chain(stream::once({ let serde = serde.clone(); async move { - let (key, value) = serde.serialize_barrier(EPOCH1, DEFAULT_VNODE, false); + let (key, value) = serde.serialize_barrier(EPOCH1, SINGLETON_VNODE, false); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH1), value)) } })); @@ -1218,7 +1218,7 @@ mod tests { gen_row_stream(serde.clone(), ops.clone(), rows.clone(), EPOCH2, seq_id); let stream = 
stream.chain(row_stream).chain(stream::once({ async move { - let (key, value) = serde.serialize_barrier(EPOCH2, DEFAULT_VNODE, true); + let (key, value) = serde.serialize_barrier(EPOCH2, SINGLETON_VNODE, true); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH2), value)) } })); diff --git a/src/stream/src/common/table/test_state_table.rs b/src/stream/src/common/table/test_state_table.rs index 098548c21ac93..dde0d8a581406 100644 --- a/src/stream/src/common/table/test_state_table.rs +++ b/src/stream/src/common/table/test_state_table.rs @@ -27,7 +27,7 @@ use risingwave_common::util::value_encoding::BasicSerde; use risingwave_hummock_test::test_utils::prepare_hummock_test_env; use risingwave_storage::hummock::HummockStorage; use risingwave_storage::store::PrefetchOptions; -use risingwave_storage::table::DEFAULT_VNODE; +use risingwave_storage::table::SINGLETON_VNODE; use crate::common::table::state_table::{ ReplicatedStateTable, StateTable, WatermarkCacheStateTable, @@ -445,7 +445,7 @@ async fn test_state_table_iter_with_pk_range() { std::ops::Bound::Included(OwnedRow::new(vec![Some(4_i32.into())])), ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &pk_range, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &pk_range, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -470,7 +470,7 @@ async fn test_state_table_iter_with_pk_range() { std::ops::Bound::::Unbounded, ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &pk_range, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &pk_range, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -1976,11 +1976,11 @@ async fn test_replicated_state_table_replication() { std::ops::Bound::Included(OwnedRow::new(vec![Some(2_i32.into())])), ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); let replicated_iter = replicated_state_table - .iter_with_vnode_and_output_indices(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -2039,7 +2039,7 @@ async fn test_replicated_state_table_replication() { ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); @@ -2048,7 +2048,7 @@ async fn test_replicated_state_table_replication() { std::ops::Bound::Unbounded, ); let replicated_iter = replicated_state_table - .iter_with_vnode_and_output_indices(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -2079,7 +2079,7 @@ async fn test_replicated_state_table_replication() { let range_bounds: (Bound, Bound) = (std::ops::Bound::Unbounded, std::ops::Bound::Unbounded); let replicated_iter = replicated_state_table - .iter_with_vnode_and_output_indices(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); pin_mut!(replicated_iter); diff --git a/src/stream/src/executor/watermark_filter.rs b/src/stream/src/executor/watermark_filter.rs index 8f8b166626d21..01497c37fdab5 100644 --- a/src/stream/src/executor/watermark_filter.rs +++ b/src/stream/src/executor/watermark_filter.rs @@ -13,7 +13,6 @@ // limitations under the License. 
use std::cmp; -use std::ops::Deref; use futures::future::{try_join, try_join_all}; use risingwave_common::hash::VnodeBitmapExt; @@ -27,7 +26,6 @@ use risingwave_expr::Result as ExprResult; use risingwave_hummock_sdk::HummockReadEpoch; use risingwave_pb::expr::expr_node::Type; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::TableDistribution; use super::filter::FilterExecutor; use crate::executor::prelude::*; @@ -219,10 +217,7 @@ impl WatermarkFilterExecutor { let mut need_update_global_max_watermark = false; // Update the vnode bitmap for state tables of all agg calls if asked. if let Some(vnode_bitmap) = barrier.as_update_vnode_bitmap(ctx.id) { - let other_vnodes_bitmap = Arc::new( - (!(*vnode_bitmap).clone()) - & TableDistribution::all_vnodes_ref().deref(), - ); + let other_vnodes_bitmap = Arc::new(!(*vnode_bitmap).clone()); let _ = global_watermark_table.update_vnode_bitmap(other_vnodes_bitmap); let (previous_vnode_bitmap, _cache_may_stale) = table.update_vnode_bitmap(vnode_bitmap.clone()); @@ -373,7 +368,9 @@ impl WatermarkFilterExecutor { #[cfg(test)] mod tests { use itertools::Itertools; + use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnDesc, ColumnId, Field, TableDesc}; + use risingwave_common::hash::VirtualNode; use risingwave_common::test_prelude::StreamChunkTestExt; use risingwave_common::types::Date; use risingwave_common::util::epoch::test_epoch; @@ -431,7 +428,7 @@ mod tests { let state_table = StateTable::from_table_catalog_inconsistent_op( &table, mem_state.clone(), - Some(TableDistribution::all_vnodes()), + Some(Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into()), ) .await; @@ -440,7 +437,7 @@ mod tests { let storage_table = StorageTable::new_partial( mem_state, val_indices.iter().map(|i| ColumnId::new(*i as _)).collect(), - Some(TableDistribution::all_vnodes()), + Some(Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into()), &desc, ); (storage_table, state_table) diff --git a/src/stream/src/from_proto/mview.rs b/src/stream/src/from_proto/mview.rs index 41fc47609fba7..43fc929edf455 100644 --- a/src/stream/src/from_proto/mview.rs +++ b/src/stream/src/from_proto/mview.rs @@ -100,7 +100,7 @@ impl ExecutorBuilder for ArrangeExecutorBuilder { let table = node.get_table()?; // FIXME: Lookup is now implemented without cell-based table API and relies on all vnodes - // being `DEFAULT_VNODE`, so we need to make the Arrange a singleton. + // being `SINGLETON_VNODE`, so we need to make the Arrange a singleton. let vnodes = params.vnode_bitmap.map(Arc::new); let conflict_behavior = ConflictBehavior::from_protobuf(&table.handle_pk_conflict_behavior()); diff --git a/src/stream/src/from_proto/watermark_filter.rs b/src/stream/src/from_proto/watermark_filter.rs index 0081f00cc39e6..4e3147d10853e 100644 --- a/src/stream/src/from_proto/watermark_filter.rs +++ b/src/stream/src/from_proto/watermark_filter.rs @@ -12,14 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::ops::Deref; use std::sync::Arc; use risingwave_common::catalog::{ColumnId, TableDesc}; use risingwave_expr::expr::build_non_strict_from_prost; use risingwave_pb::stream_plan::WatermarkFilterNode; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::TableDistribution; use super::*; use crate::common::table::state_table::StateTable; @@ -57,8 +55,7 @@ impl ExecutorBuilder for WatermarkFilterBuilder { .iter() .map(|i| ColumnId::new(*i as _)) .collect_vec(); - let other_vnodes = - Arc::new((!(*vnodes).clone()) & TableDistribution::all_vnodes_ref().deref()); + let other_vnodes = Arc::new(!(*vnodes).clone()); let global_watermark_table = StorageTable::new_partial(store.clone(), column_ids, Some(other_vnodes), &desc); From 3b98b71f27525ad936820c264198c0f02334d252 Mon Sep 17 00:00:00 2001 From: xiangjinwu <17769960+xiangjinwu@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:28:27 +0800 Subject: [PATCH 02/32] fix(source): Protobuf `Any` as canonical JSON (#18380) --- Cargo.lock | 3 + src/connector/Cargo.toml | 2 +- src/connector/codec/src/decoder/mod.rs | 3 + src/connector/src/parser/protobuf/parser.rs | 310 ++++--------------- src/connector/src/parser/unified/mod.rs | 4 +- src/connector/src/parser/unified/protobuf.rs | 16 +- 6 files changed, 79 insertions(+), 259 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c1b1ec57fdece..4e648e08a3fea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9216,9 +9216,12 @@ version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55a6a9143ae25c25fa7b6a48d6cc08b10785372060009c25140a4e7c340e95af" dependencies = [ + "base64 0.22.0", "once_cell", "prost 0.13.1", "prost-types 0.13.1", + "serde", + "serde-value", ] [[package]] diff --git a/src/connector/Cargo.toml b/src/connector/Cargo.toml index d87e89c1cf65d..a77e9cb929d17 100644 --- a/src/connector/Cargo.toml +++ b/src/connector/Cargo.toml @@ -103,7 +103,7 @@ pg_bigdecimal = { git = "https://github.com/risingwavelabs/rust-pg_bigdecimal", postgres-openssl = "0.5.0" prometheus = { version = "0.13", features = ["process"] } prost = { workspace = true, features = ["no-recursion-limit"] } -prost-reflect = "0.14" +prost-reflect = { version = "0.14", features = ["serde"] } prost-types = "0.13" protobuf-native = "0.2.2" pulsar = { version = "6.3", default-features = false, features = [ diff --git a/src/connector/codec/src/decoder/mod.rs b/src/connector/codec/src/decoder/mod.rs index 814e06a166c6c..bbfdbf0a90d79 100644 --- a/src/connector/codec/src/decoder/mod.rs +++ b/src/connector/codec/src/decoder/mod.rs @@ -38,6 +38,9 @@ pub enum AccessError { #[error("Unsupported additional column `{name}`")] UnsupportedAdditionalColumn { name: String }, + #[error("Fail to convert protobuf Any into jsonb: {0}")] + ProtobufAnyToJson(#[source] serde_json::Error), + /// Errors that are not categorized into variants above. #[error("{message}")] Uncategorized { message: String }, diff --git a/src/connector/src/parser/protobuf/parser.rs b/src/connector/src/parser/protobuf/parser.rs index 8be25074f6295..ec8c747cafd5a 100644 --- a/src/connector/src/parser/protobuf/parser.rs +++ b/src/connector/src/parser/protobuf/parser.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::sync::Arc; - use anyhow::Context; use itertools::Itertools; use prost_reflect::{ @@ -22,8 +20,7 @@ use prost_reflect::{ }; use risingwave_common::array::{ListValue, StructValue}; use risingwave_common::types::{ - DataType, Datum, DatumCow, Decimal, JsonbRef, JsonbVal, ScalarImpl, ScalarRefImpl, ToDatumRef, - ToOwnedDatum, F32, F64, + DataType, DatumCow, Decimal, JsonbVal, ScalarImpl, ToOwnedDatum, F32, F64, }; use risingwave_common::{bail, try_match_expand}; use risingwave_pb::plan_common::{AdditionalColumn, ColumnDesc, ColumnDescVersion}; @@ -32,9 +29,7 @@ use thiserror_ext::{AsReport, Macro}; use crate::error::ConnectorResult; use crate::parser::unified::protobuf::ProtobufAccess; -use crate::parser::unified::{ - bail_uncategorized, uncategorized, AccessError, AccessImpl, AccessResult, -}; +use crate::parser::unified::{uncategorized, AccessError, AccessImpl, AccessResult}; use crate::parser::util::bytes_from_url; use crate::parser::{AccessBuilder, EncodingProperties}; use crate::schema::schema_registry::{extract_schema_id, handle_sr_list, Client, WireFormatError}; @@ -44,7 +39,6 @@ use crate::schema::SchemaLoader; pub struct ProtobufAccessBuilder { confluent_wire_type: bool, message_descriptor: MessageDescriptor, - descriptor_pool: Arc, } impl AccessBuilder for ProtobufAccessBuilder { @@ -59,10 +53,7 @@ impl AccessBuilder for ProtobufAccessBuilder { let message = DynamicMessage::decode(self.message_descriptor.clone(), payload) .context("failed to parse message")?; - Ok(AccessImpl::Protobuf(ProtobufAccess::new( - message, - Arc::clone(&self.descriptor_pool), - ))) + Ok(AccessImpl::Protobuf(ProtobufAccess::new(message))) } } @@ -71,13 +62,11 @@ impl ProtobufAccessBuilder { let ProtobufParserConfig { confluent_wire_type, message_descriptor, - descriptor_pool, } = config; Ok(Self { confluent_wire_type, message_descriptor, - descriptor_pool, }) } } @@ -86,8 +75,6 @@ impl ProtobufAccessBuilder { pub struct ProtobufParserConfig { confluent_wire_type: bool, pub(crate) message_descriptor: MessageDescriptor, - /// Note that the pub(crate) here is merely for testing - pub(crate) descriptor_pool: Arc, } impl ProtobufParserConfig { @@ -132,7 +119,6 @@ impl ProtobufParserConfig { Ok(Self { message_descriptor, confluent_wire_type: protobuf_config.use_schema_registry, - descriptor_pool: Arc::new(pool), }) } @@ -216,141 +202,9 @@ fn detect_loop_and_push( Ok(()) } -fn extract_any_info(dyn_msg: &DynamicMessage) -> (String, Value) { - debug_assert!( - dyn_msg.fields().count() == 2, - "Expected only two fields for Any Type MessageDescriptor" - ); - - let type_url = dyn_msg - .get_field_by_name("type_url") - .expect("Expect type_url in dyn_msg") - .to_string() - .split('/') - .nth(1) - .map(|part| part[..part.len() - 1].to_string()) - .unwrap_or_default(); - - let payload = dyn_msg - .get_field_by_name("value") - .expect("Expect value (payload) in dyn_msg") - .as_ref() - .clone(); - - (type_url, payload) -} - -/// TODO: Resolve the potential naming conflict in the map -/// i.e., If the two anonymous type shares the same key (e.g., "Int32"), -/// the latter will overwrite the former one in `serde_json::Map`. -/// Possible solution, maintaining a global id map, for the same types -/// In the same level of fields, add the unique id at the tail of the name. 
-/// e.g., "Int32.1" & "Int32.2" in the above example -fn recursive_parse_json( - fields: &[Datum], - full_name_vec: Option>, - full_name: Option, -) -> serde_json::Value { - // Note that the key is of no order - let mut ret: serde_json::Map = serde_json::Map::new(); - - // The hidden type hint for user's convenience - // i.e., `"_type": message.full_name()` - if let Some(full_name) = full_name { - ret.insert("_type".to_string(), serde_json::Value::String(full_name)); - } - - for (idx, field) in fields.iter().enumerate() { - let mut key; - if let Some(k) = full_name_vec.as_ref() { - key = k[idx].to_string(); - } else { - key = "".to_string(); - } - - match field.clone() { - Some(ScalarImpl::Int16(v)) => { - if key.is_empty() { - key = "Int16".to_string(); - } - ret.insert(key, serde_json::Value::Number(serde_json::Number::from(v))); - } - Some(ScalarImpl::Int32(v)) => { - if key.is_empty() { - key = "Int32".to_string(); - } - ret.insert(key, serde_json::Value::Number(serde_json::Number::from(v))); - } - Some(ScalarImpl::Int64(v)) => { - if key.is_empty() { - key = "Int64".to_string(); - } - ret.insert(key, serde_json::Value::Number(serde_json::Number::from(v))); - } - Some(ScalarImpl::Bool(v)) => { - if key.is_empty() { - key = "Bool".to_string(); - } - ret.insert(key, serde_json::Value::Bool(v)); - } - Some(ScalarImpl::Bytea(v)) => { - if key.is_empty() { - key = "Bytea".to_string(); - } - let s = String::from_utf8(v.to_vec()).unwrap(); - ret.insert(key, serde_json::Value::String(s)); - } - Some(ScalarImpl::Float32(v)) => { - if key.is_empty() { - key = "Int16".to_string(); - } - ret.insert( - key, - serde_json::Value::Number( - serde_json::Number::from_f64(v.into_inner() as f64).unwrap(), - ), - ); - } - Some(ScalarImpl::Float64(v)) => { - if key.is_empty() { - key = "Float64".to_string(); - } - ret.insert( - key, - serde_json::Value::Number( - serde_json::Number::from_f64(v.into_inner()).unwrap(), - ), - ); - } - Some(ScalarImpl::Utf8(v)) => { - if key.is_empty() { - key = "Utf8".to_string(); - } - ret.insert(key, serde_json::Value::String(v.to_string())); - } - Some(ScalarImpl::Struct(v)) => { - if key.is_empty() { - key = "Struct".to_string(); - } - ret.insert(key, recursive_parse_json(v.fields(), None, None)); - } - Some(ScalarImpl::Jsonb(v)) => { - if key.is_empty() { - key = "Jsonb".to_string(); - } - ret.insert(key, v.take()); - } - r#type => panic!("Not yet support ScalarImpl type: {:?}", r#type), - } - } - - serde_json::Value::Object(ret) -} - pub fn from_protobuf_value<'a>( field_desc: &FieldDescriptor, value: &'a Value, - descriptor_pool: &Arc, ) -> AccessResult> { let kind = field_desc.kind(); @@ -382,62 +236,9 @@ pub fn from_protobuf_value<'a>( } Value::Message(dyn_msg) => { if dyn_msg.descriptor().full_name() == "google.protobuf.Any" { - // If the fields are not presented, default value is an empty string - if !dyn_msg.has_field_by_name("type_url") || !dyn_msg.has_field_by_name("value") { - borrowed!(JsonbRef::empty_string()); - } - - // Sanity check - debug_assert!( - dyn_msg.has_field_by_name("type_url") && dyn_msg.has_field_by_name("value"), - "`type_url` & `value` must exist in fields of `dyn_msg`" - ); - - // The message is of type `Any` - let (type_url, payload) = extract_any_info(dyn_msg); - - let payload_field_desc = dyn_msg.descriptor().get_field_by_name("value").unwrap(); - - let payload = from_protobuf_value(&payload_field_desc, &payload, descriptor_pool)?; - let Some(ScalarRefImpl::Bytea(payload)) = payload.to_datum_ref() else { - bail_uncategorized!("expected 
bytes for dynamic message payload"); - }; - - // Get the corresponding schema from the descriptor pool - let msg_desc = descriptor_pool - .get_message_by_name(&type_url) - .ok_or_else(|| { - uncategorized!("message `{type_url}` not found in descriptor pool") - })?; - - let f = msg_desc - .clone() - .fields() - .map(|f| f.name().to_string()) - .collect::>(); - - let full_name = msg_desc.clone().full_name().to_string(); - - // Decode the payload based on the `msg_desc` - let decoded_value = DynamicMessage::decode(msg_desc, payload).unwrap(); - let decoded_value = from_protobuf_value( - field_desc, - &Value::Message(decoded_value), - descriptor_pool, - )? - .to_owned_datum() - .unwrap(); - - // Extract the struct value - let ScalarImpl::Struct(v) = decoded_value else { - panic!("Expect ScalarImpl::Struct"); - }; - - ScalarImpl::Jsonb(JsonbVal::from(serde_json::json!(recursive_parse_json( - v.fields(), - Some(f), - Some(full_name), - )))) + ScalarImpl::Jsonb(JsonbVal::from( + serde_json::to_value(dyn_msg).map_err(AccessError::ProtobufAnyToJson)?, + )) } else { let mut rw_values = Vec::with_capacity(dyn_msg.descriptor().fields().len()); // fields is a btree map in descriptor @@ -454,9 +255,7 @@ pub fn from_protobuf_value<'a>( } // use default value if dyn_msg doesn't has this field let value = dyn_msg.get_field(&field_desc); - rw_values.push( - from_protobuf_value(&field_desc, &value, descriptor_pool)?.to_owned_datum(), - ); + rw_values.push(from_protobuf_value(&field_desc, &value)?.to_owned_datum()); } ScalarImpl::Struct(StructValue::new(rw_values)) } @@ -466,7 +265,7 @@ pub fn from_protobuf_value<'a>( .map_err(|e| uncategorized!("{}", e.to_report_string()))?; let mut builder = data_type.as_list().create_array_builder(values.len()); for value in values { - builder.append(from_protobuf_value(field_desc, value, descriptor_pool)?); + builder.append(from_protobuf_value(field_desc, value)?); } ScalarImpl::List(ListValue::new(builder.finish())) } @@ -498,25 +297,18 @@ fn protobuf_type_mapping( } Kind::Uint64 | Kind::Fixed64 => DataType::Decimal, Kind::String => DataType::Varchar, - Kind::Message(m) => { - let fields = m - .fields() - .map(|f| protobuf_type_mapping(&f, parse_trace)) - .try_collect()?; - let field_names = m.fields().map(|f| f.name().to_string()).collect_vec(); - - // Note that this part is useful for actual parsing - // Since RisingWave will parse message to `ScalarImpl::Jsonb` - // Please do NOT modify it - if field_names.len() == 2 - && field_names.contains(&"value".to_string()) - && field_names.contains(&"type_url".to_string()) - { - DataType::Jsonb - } else { + Kind::Message(m) => match m.full_name() { + // Well-Known Types are identified by their full name + "google.protobuf.Any" => DataType::Jsonb, + _ => { + let fields = m + .fields() + .map(|f| protobuf_type_mapping(&f, parse_trace)) + .try_collect()?; + let field_names = m.fields().map(|f| f.name().to_string()).collect_vec(); DataType::new_struct(fields, field_names) } - } + }, Kind::Enum(_) => DataType::Varchar, Kind::Bytes => DataType::Bytea, }; @@ -973,10 +765,9 @@ mod test { // This is of no use let field = value.fields().next().unwrap().0; - if let Some(ret) = - from_protobuf_value(&field, &Value::Message(value), &conf.descriptor_pool) - .unwrap() - .to_owned_datum() + if let Some(ret) = from_protobuf_value(&field, &Value::Message(value)) + .unwrap() + .to_owned_datum() { println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); println!("---------------------------"); @@ -1000,7 +791,7 @@ mod test { assert_eq!( jv, 
JsonbVal::from(json!({ - "_type": "test.StringValue", + "@type": "type.googleapis.com/test.StringValue", "value": "John Doe" })) ); @@ -1036,10 +827,9 @@ mod test { // This is of no use let field = value.fields().next().unwrap().0; - if let Some(ret) = - from_protobuf_value(&field, &Value::Message(value), &conf.descriptor_pool) - .unwrap() - .to_owned_datum() + if let Some(ret) = from_protobuf_value(&field, &Value::Message(value)) + .unwrap() + .to_owned_datum() { println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); println!("---------------------------"); @@ -1063,7 +853,7 @@ mod test { assert_eq!( jv, JsonbVal::from(json!({ - "_type": "test.Int32Value", + "@type": "type.googleapis.com/test.Int32Value", "value": 114514 })) ); @@ -1110,10 +900,9 @@ mod test { // This is of no use let field = value.fields().next().unwrap().0; - if let Some(ret) = - from_protobuf_value(&field, &Value::Message(value), &conf.descriptor_pool) - .unwrap() - .to_owned_datum() + if let Some(ret) = from_protobuf_value(&field, &Value::Message(value)) + .unwrap() + .to_owned_datum() { println!("Decoded Value for ANY_RECURSIVE_GEN_PROTO_DATA: {:#?}", ret); println!("---------------------------"); @@ -1137,13 +926,13 @@ mod test { assert_eq!( jv, JsonbVal::from(json!({ - "_type": "test.AnyValue", - "any_value_1": { - "_type": "test.StringValue", + "@type": "type.googleapis.com/test.AnyValue", + "anyValue1": { + "@type": "type.googleapis.com/test.StringValue", "value": "114514", }, - "any_value_2": { - "_type": "test.Int32Value", + "anyValue2": { + "@type": "type.googleapis.com/test.Int32Value", "value": 114514, } })) @@ -1156,6 +945,37 @@ mod test { Ok(()) } + // id: 12345 + // any_value: { + // type_url: "type.googleapis.com/test.StringXalue" + // value: "\n\010John Doe" + // } + static ANY_GEN_PROTO_DATA_INVALID: &[u8] = b"\x08\xb9\x60\x12\x32\x0a\x24\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x53\x74\x72\x69\x6e\x67\x58\x61\x6c\x75\x65\x12\x0a\x0a\x08\x4a\x6f\x68\x6e\x20\x44\x6f\x65"; + + #[tokio::test] + async fn test_any_invalid() -> crate::error::ConnectorResult<()> { + let conf = create_recursive_pb_parser_config("/any-schema.pb", "test.TestAny").await; + + let value = + DynamicMessage::decode(conf.message_descriptor.clone(), ANY_GEN_PROTO_DATA_INVALID) + .unwrap(); + + // The top-level `Value` is not a proto field, but we need a dummy one. + let field = value.fields().next().unwrap().0; + + let err = from_protobuf_value(&field, &Value::Message(value)).unwrap_err(); + + let expected = expect_test::expect![[r#" + Fail to convert protobuf Any into jsonb + + Caused by: + message 'test.StringXalue' not found + "#]]; + expected.assert_eq(err.to_report_string_pretty().as_str()); + + Ok(()) + } + #[test] fn test_decode_varint_zigzag() { // 1. 
Positive number diff --git a/src/connector/src/parser/unified/mod.rs b/src/connector/src/parser/unified/mod.rs index 8045ce0132401..fdfe3aae6aaee 100644 --- a/src/connector/src/parser/unified/mod.rs +++ b/src/connector/src/parser/unified/mod.rs @@ -17,9 +17,7 @@ use auto_impl::auto_impl; use risingwave_common::types::{DataType, DatumCow}; use risingwave_connector_codec::decoder::avro::AvroAccess; -pub use risingwave_connector_codec::decoder::{ - bail_uncategorized, uncategorized, Access, AccessError, AccessResult, -}; +pub use risingwave_connector_codec::decoder::{uncategorized, Access, AccessError, AccessResult}; use self::bytes::BytesAccess; use self::json::JsonAccess; diff --git a/src/connector/src/parser/unified/protobuf.rs b/src/connector/src/parser/unified/protobuf.rs index 02febc22db247..b1d34746b5029 100644 --- a/src/connector/src/parser/unified/protobuf.rs +++ b/src/connector/src/parser/unified/protobuf.rs @@ -13,9 +13,9 @@ // limitations under the License. use std::borrow::Cow; -use std::sync::{Arc, LazyLock}; +use std::sync::LazyLock; -use prost_reflect::{DescriptorPool, DynamicMessage, ReflectMessage}; +use prost_reflect::{DynamicMessage, ReflectMessage}; use risingwave_common::log::LogSuppresser; use risingwave_common::types::{DataType, DatumCow, ToOwnedDatum}; use thiserror_ext::AsReport; @@ -26,15 +26,11 @@ use crate::parser::unified::uncategorized; pub struct ProtobufAccess { message: DynamicMessage, - descriptor_pool: Arc, } impl ProtobufAccess { - pub fn new(message: DynamicMessage, descriptor_pool: Arc) -> Self { - Self { - message, - descriptor_pool, - } + pub fn new(message: DynamicMessage) -> Self { + Self { message } } } @@ -59,10 +55,10 @@ impl Access for ProtobufAccess { })?; match self.message.get_field(&field_desc) { - Cow::Borrowed(value) => from_protobuf_value(&field_desc, value, &self.descriptor_pool), + Cow::Borrowed(value) => from_protobuf_value(&field_desc, value), // `Owned` variant occurs only if there's no such field and the default value is returned. 
- Cow::Owned(value) => from_protobuf_value(&field_desc, &value, &self.descriptor_pool) + Cow::Owned(value) => from_protobuf_value(&field_desc, &value) // enforce `Owned` variant to avoid returning a reference to a temporary value .map(|d| d.to_owned_datum().into()), } From f2f58272cc51a118a83c1fce02bd42afeb98c80b Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:02:08 +0800 Subject: [PATCH 03/32] fix(storage): fix assertion (#18413) --- src/storage/hummock_sdk/src/sstable_info.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/storage/hummock_sdk/src/sstable_info.rs b/src/storage/hummock_sdk/src/sstable_info.rs index 2f64508e57314..9970c60f506c8 100644 --- a/src/storage/hummock_sdk/src/sstable_info.rs +++ b/src/storage/hummock_sdk/src/sstable_info.rs @@ -136,7 +136,7 @@ impl From<&PbSstableInfo> for SstableInfo { impl From for PbSstableInfo { fn from(sstable_info: SstableInfo) -> Self { - assert_ne!(0, sstable_info.sst_size); + assert!(sstable_info.sst_size > 0 || sstable_info.is_stripped()); PbSstableInfo { object_id: sstable_info.object_id, sst_id: sstable_info.sst_id, @@ -174,7 +174,7 @@ impl From for PbSstableInfo { impl From<&SstableInfo> for PbSstableInfo { fn from(sstable_info: &SstableInfo) -> Self { - assert_ne!(0, sstable_info.sst_size); + assert!(sstable_info.sst_size > 0 || sstable_info.is_stripped()); PbSstableInfo { object_id: sstable_info.object_id, sst_id: sstable_info.sst_id, @@ -212,3 +212,10 @@ impl SstableInfo { self.key_range = KeyRange::default(); } } + +// Time travel +impl SstableInfo { + pub fn is_stripped(&self) -> bool { + self.object_id == 0 + } +} From cb29fe0a3221638b5dc62c3ea543b08dd567135d Mon Sep 17 00:00:00 2001 From: lmatz Date: Thu, 5 Sep 2024 15:23:40 +0800 Subject: [PATCH 04/32] feat: support more SSL related configurations in Kafka connector (#18361) --- src/connector/src/connector_common/common.rs | 21 ++++++++++++++++++++ src/connector/with_options_sink.yaml | 12 +++++++++++ src/connector/with_options_source.yaml | 12 +++++++++++ 3 files changed, 45 insertions(+) diff --git a/src/connector/src/connector_common/common.rs b/src/connector/src/connector_common/common.rs index b522ae2eda560..9f4211aedd4d9 100644 --- a/src/connector/src/connector_common/common.rs +++ b/src/connector/src/connector_common/common.rs @@ -192,14 +192,26 @@ pub struct KafkaCommon { #[serde(rename = "properties.ssl.ca.location")] ssl_ca_location: Option, + /// CA certificate string (PEM format) for verifying the broker's key. + #[serde(rename = "properties.ssl.ca.pem")] + ssl_ca_pem: Option, + /// Path to client's certificate file (PEM). #[serde(rename = "properties.ssl.certificate.location")] ssl_certificate_location: Option, + /// Client's public key string (PEM format) used for authentication. + #[serde(rename = "properties.ssl.certificate.pem")] + ssl_certificate_pem: Option, + /// Path to client's private key file (PEM). #[serde(rename = "properties.ssl.key.location")] ssl_key_location: Option, + /// Client's private key string (PEM format) used for authentication. + #[serde(rename = "properties.ssl.key.pem")] + ssl_key_pem: Option, + /// Passphrase of client's private key. 
#[serde(rename = "properties.ssl.key.password")] ssl_key_password: Option, @@ -325,12 +337,21 @@ impl KafkaCommon { if let Some(ssl_ca_location) = self.ssl_ca_location.as_ref() { config.set("ssl.ca.location", ssl_ca_location); } + if let Some(ssl_ca_pem) = self.ssl_ca_pem.as_ref() { + config.set("ssl.ca.pem", ssl_ca_pem); + } if let Some(ssl_certificate_location) = self.ssl_certificate_location.as_ref() { config.set("ssl.certificate.location", ssl_certificate_location); } + if let Some(ssl_certificate_pem) = self.ssl_certificate_pem.as_ref() { + config.set("ssl.certificate.pem", ssl_certificate_pem); + } if let Some(ssl_key_location) = self.ssl_key_location.as_ref() { config.set("ssl.key.location", ssl_key_location); } + if let Some(ssl_key_pem) = self.ssl_key_pem.as_ref() { + config.set("ssl.key.pem", ssl_key_pem); + } if let Some(ssl_key_password) = self.ssl_key_password.as_ref() { config.set("ssl.key.password", ssl_key_password); } diff --git a/src/connector/with_options_sink.yaml b/src/connector/with_options_sink.yaml index cc92f9a0a664a..e8a8efff68801 100644 --- a/src/connector/with_options_sink.yaml +++ b/src/connector/with_options_sink.yaml @@ -373,14 +373,26 @@ KafkaConfig: field_type: String comments: Path to CA certificate file for verifying the broker's key. required: false + - name: properties.ssl.ca.pem + field_type: String + comments: CA certificate string (PEM format) for verifying the broker's key. + required: false - name: properties.ssl.certificate.location field_type: String comments: Path to client's certificate file (PEM). required: false + - name: properties.ssl.certificate.pem + field_type: String + comments: Client's public key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.location field_type: String comments: Path to client's private key file (PEM). required: false + - name: properties.ssl.key.pem + field_type: String + comments: Client's private key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.password field_type: String comments: Passphrase of client's private key. diff --git a/src/connector/with_options_source.yaml b/src/connector/with_options_source.yaml index 4eaf1e0d3db4b..a6a19e80c89a3 100644 --- a/src/connector/with_options_source.yaml +++ b/src/connector/with_options_source.yaml @@ -199,14 +199,26 @@ KafkaProperties: field_type: String comments: Path to CA certificate file for verifying the broker's key. required: false + - name: properties.ssl.ca.pem + field_type: String + comments: CA certificate string (PEM format) for verifying the broker's key. + required: false - name: properties.ssl.certificate.location field_type: String comments: Path to client's certificate file (PEM). required: false + - name: properties.ssl.certificate.pem + field_type: String + comments: Client's public key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.location field_type: String comments: Path to client's private key file (PEM). required: false + - name: properties.ssl.key.pem + field_type: String + comments: Client's private key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.password field_type: String comments: Passphrase of client's private key. 
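For reference, the three new `*.pem` options above carry the certificate and key contents inline, whereas the existing `*.location` options point at files on the node's disk. A minimal sketch of how they might appear in a `CREATE SOURCE` statement follows; the source name, columns, topic, broker address, and the `properties.security.protocol` value are illustrative assumptions, not part of this patch, and the elided `...` stand for the full PEM bodies:

CREATE SOURCE kafka_ssl_demo (id INT, name VARCHAR)
WITH (
    connector = 'kafka',
    topic = 'demo_topic',
    properties.bootstrap.server = 'broker:9093',
    properties.security.protocol = 'SSL',
    -- inline PEM contents instead of properties.ssl.*.location file paths
    properties.ssl.ca.pem = '-----BEGIN CERTIFICATE----- ... -----END CERTIFICATE-----',
    properties.ssl.certificate.pem = '-----BEGIN CERTIFICATE----- ... -----END CERTIFICATE-----',
    properties.ssl.key.pem = '-----BEGIN PRIVATE KEY----- ... -----END PRIVATE KEY-----'
) FORMAT PLAIN ENCODE JSON;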
From 8a32a9b21b45c4d91f3235411d35a0a2b3f5102a Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Thu, 5 Sep 2024 16:37:54 +0800 Subject: [PATCH 05/32] feat: variable vnode count support in vnode structure (#18381) Signed-off-by: Bugen Zhao --- .../task/consistent_hash_shuffle_channel.rs | 1 + .../src/hash/consistent_hash/mapping.rs | 42 +++++----- src/common/src/hash/consistent_hash/vnode.rs | 84 ++++++++++++------- src/common/src/hash/table_distribution.rs | 4 +- src/common/src/util/row_id.rs | 1 + src/common/src/util/scan_range.rs | 4 +- .../src/vnode_mapping/vnode_placement.rs | 2 +- src/expr/impl/src/scalar/vnode.rs | 6 +- src/meta/src/controller/fragment.rs | 16 ++-- src/meta/src/stream/stream_graph/schedule.rs | 5 +- src/meta/src/stream/stream_manager.rs | 17 ++-- src/meta/src/stream/test_scale.rs | 26 +++--- .../log_store_impl/kv_log_store/test_utils.rs | 8 +- src/stream/src/executor/dispatch.rs | 3 +- 14 files changed, 132 insertions(+), 87 deletions(-) diff --git a/src/batch/src/task/consistent_hash_shuffle_channel.rs b/src/batch/src/task/consistent_hash_shuffle_channel.rs index ad0fdbaa8b70a..32d91a7acc09b 100644 --- a/src/batch/src/task/consistent_hash_shuffle_channel.rs +++ b/src/batch/src/task/consistent_hash_shuffle_channel.rs @@ -59,6 +59,7 @@ fn generate_hash_values( .iter() .map(|idx| *idx as usize) .collect::>(), + consistent_hash_info.vmap.len(), ); let hash_values = vnodes diff --git a/src/common/src/hash/consistent_hash/mapping.rs b/src/common/src/hash/consistent_hash/mapping.rs index a462acb291853..0ab8f9e18fd2e 100644 --- a/src/common/src/hash/consistent_hash/mapping.rs +++ b/src/common/src/hash/consistent_hash/mapping.rs @@ -105,26 +105,26 @@ impl VnodeMapping { /// /// For example, if `items` is `[0, 1, 2]`, and the total vnode count is 10, we'll generate /// mapping like `[0, 0, 0, 0, 1, 1, 1, 2, 2, 2]`. - pub fn new_uniform(items: impl ExactSizeIterator) -> Self { + pub fn new_uniform(items: impl ExactSizeIterator, vnode_count: usize) -> Self { // If the number of items is greater than the total vnode count, no vnode will be mapped to // some items and the mapping will be invalid. - assert!(items.len() <= VirtualNode::COUNT); + assert!(items.len() <= vnode_count); let mut original_indices = Vec::with_capacity(items.len()); let mut data = Vec::with_capacity(items.len()); - let hash_shard_size = VirtualNode::COUNT / items.len(); - let mut one_more_count = VirtualNode::COUNT % items.len(); + let hash_shard_size = vnode_count / items.len(); + let mut one_more_count = vnode_count % items.len(); let mut init_bound = 0; for item in items { - let vnode_count = if one_more_count > 0 { + let count = if one_more_count > 0 { one_more_count -= 1; hash_shard_size + 1 } else { hash_shard_size }; - init_bound += vnode_count; + init_bound += count; original_indices.push(init_bound as u32 - 1); data.push(item); @@ -141,10 +141,11 @@ impl VnodeMapping { /// Create a vnode mapping where all vnodes are mapped to the same single item. pub fn new_single(item: T::Item) -> Self { - Self::new_uniform(std::iter::once(item)) + // TODO(var-vnode): always 1 correct? + Self::new_uniform(std::iter::once(item), 1) } - /// The length of the vnode in this mapping, typically [`VirtualNode::COUNT`]. + /// The length (or count) of the vnode in this mapping. pub fn len(&self) -> usize { self.original_indices .last() @@ -204,12 +205,13 @@ impl VnodeMapping { /// Convert this vnode mapping to a mapping from items to bitmaps, where each bitmap represents /// the vnodes mapped to the item. 
pub fn to_bitmaps(&self) -> HashMap { + let vnode_count = self.len(); let mut vnode_bitmaps = HashMap::new(); for (vnode, item) in self.iter_with_vnode() { vnode_bitmaps .entry(item) - .or_insert_with(|| BitmapBuilder::zeroed(VirtualNode::COUNT)) + .or_insert_with(|| BitmapBuilder::zeroed(vnode_count)) .set(vnode.to_index(), true); } @@ -222,10 +224,11 @@ impl VnodeMapping { /// Create a vnode mapping from the given mapping from items to bitmaps, where each bitmap /// represents the vnodes mapped to the item. pub fn from_bitmaps(bitmaps: &HashMap) -> Self { - let mut items = vec![None; VirtualNode::COUNT]; + let vnode_count = bitmaps.values().next().expect("empty bitmaps").len(); + let mut items = vec![None; vnode_count]; for (&item, bitmap) in bitmaps { - assert_eq!(bitmap.len(), VirtualNode::COUNT); + assert_eq!(bitmap.len(), vnode_count); for idx in bitmap.iter_ones() { if let Some(prev) = items[idx].replace(item) { panic!("mapping at index `{idx}` is set to both `{prev:?}` and `{item:?}`"); @@ -241,9 +244,8 @@ impl VnodeMapping { Self::from_expanded(&items) } - /// Create a vnode mapping from the expanded slice of items with length [`VirtualNode::COUNT`]. + /// Create a vnode mapping from the expanded slice of items. pub fn from_expanded(items: &[T::Item]) -> Self { - assert_eq!(items.len(), VirtualNode::COUNT); let (original_indices, data) = compress_data(items); Self { original_indices, @@ -251,7 +253,7 @@ impl VnodeMapping { } } - /// Convert this vnode mapping to a expanded vector of items with length [`VirtualNode::COUNT`]. + /// Convert this vnode mapping to a expanded vector of items. pub fn to_expanded(&self) -> ExpandedMapping { self.iter().collect() } @@ -353,8 +355,8 @@ impl ActorMapping { impl WorkerSlotMapping { /// Create a uniform worker mapping from the given worker ids - pub fn build_from_ids(worker_slot_ids: &[WorkerSlotId]) -> Self { - Self::new_uniform(worker_slot_ids.iter().cloned()) + pub fn build_from_ids(worker_slot_ids: &[WorkerSlotId], vnode_count: usize) -> Self { + Self::new_uniform(worker_slot_ids.iter().cloned(), vnode_count) } /// Create a worker mapping from the protobuf representation. 
@@ -403,18 +405,18 @@ mod tests { type TestMapping = VnodeMapping; type Test2Mapping = VnodeMapping; - const COUNTS: &[usize] = &[1, 3, 12, 42, VirtualNode::COUNT]; + const COUNTS: &[usize] = &[1, 3, 12, 42, VirtualNode::COUNT_FOR_TEST]; fn uniforms() -> impl Iterator { COUNTS .iter() - .map(|&count| TestMapping::new_uniform(0..count as u32)) + .map(|&count| TestMapping::new_uniform(0..count as u32, VirtualNode::COUNT_FOR_TEST)) } fn randoms() -> impl Iterator { COUNTS.iter().map(|&count| { let raw = repeat_with(|| rand::thread_rng().gen_range(0..count as u32)) - .take(VirtualNode::COUNT) + .take(VirtualNode::COUNT_FOR_TEST) .collect_vec(); TestMapping::from_expanded(&raw) }) @@ -427,7 +429,7 @@ mod tests { #[test] fn test_uniform() { for vnode_mapping in uniforms() { - assert_eq!(vnode_mapping.len(), VirtualNode::COUNT); + assert_eq!(vnode_mapping.len(), VirtualNode::COUNT_FOR_TEST); let item_count = vnode_mapping.iter_unique().count(); let mut check: HashMap> = HashMap::new(); diff --git a/src/common/src/hash/consistent_hash/vnode.rs b/src/common/src/hash/consistent_hash/vnode.rs index dd4095535fdf3..685f99d6cf4f4 100644 --- a/src/common/src/hash/consistent_hash/vnode.rs +++ b/src/common/src/hash/consistent_hash/vnode.rs @@ -30,26 +30,45 @@ use crate::util::row_id::extract_vnode_id_from_row_id; pub struct VirtualNode(VirtualNodeInner); /// The internal representation of a virtual node id. +/// +/// Note: not all bits of the inner representation might be used. type VirtualNodeInner = u16; -static_assertions::const_assert!(VirtualNodeInner::BITS >= VirtualNode::BITS as u32); -impl From for VirtualNode { - fn from(hash_code: Crc32HashCode) -> Self { +/// `vnode_count` must be provided to convert a hash code to a virtual node. +/// +/// Use [`Crc32HashCodeToVnodeExt::to_vnode`] instead. +impl !From for VirtualNode {} + +#[easy_ext::ext(Crc32HashCodeToVnodeExt)] +impl Crc32HashCode { + /// Converts the hash code to a virtual node, based on the given total count of vnodes. + fn to_vnode(self, vnode_count: usize) -> VirtualNode { // Take the least significant bits of the hash code. // TODO: should we use the most significant bits? - let inner = (hash_code.value() % Self::COUNT as u64) as VirtualNodeInner; + let inner = (self.value() % vnode_count as u64) as VirtualNodeInner; VirtualNode(inner) } } impl VirtualNode { - /// The number of bits used to represent a virtual node. - /// - /// Note: Not all bits of the inner representation are used. One should rely on this constant - /// to determine the count of virtual nodes. - pub const BITS: usize = 8; /// The total count of virtual nodes. - pub const COUNT: usize = 1 << Self::BITS; + // TODO(var-vnode): remove this and only keep `COUNT_FOR_TEST` + pub const COUNT: usize = 1 << 8; + /// The maximum value of the virtual node. + // TODO(var-vnode): remove this and only keep `MAX_FOR_TEST` + pub const MAX: VirtualNode = VirtualNode::from_index(Self::COUNT - 1); +} + +impl VirtualNode { + /// The total count of virtual nodes, for testing purposes. + pub const COUNT_FOR_TEST: usize = Self::COUNT; + /// The maximum value of the virtual node, for testing purposes. + pub const MAX_FOR_TEST: VirtualNode = Self::MAX; +} + +impl VirtualNode { + /// The maximum count of virtual nodes that fits in [`VirtualNodeInner`]. + pub const MAX_COUNT: usize = 1 << VirtualNodeInner::BITS; /// The size of a virtual node in bytes, in memory or serialized representation. 
pub const SIZE: usize = std::mem::size_of::(); } @@ -58,8 +77,6 @@ impl VirtualNode { pub type AllVirtualNodeIter = std::iter::Map, fn(usize) -> VirtualNode>; impl VirtualNode { - /// The maximum value of the virtual node. - pub const MAX: VirtualNode = VirtualNode::from_index(Self::COUNT - 1); /// We may use `VirtualNode` as a datum in a stream, or store it as a column. /// Hence this reifies it as a RW datatype. pub const RW_TYPE: DataType = DataType::Int16; @@ -68,7 +85,7 @@ impl VirtualNode { /// Creates a virtual node from the `usize` index. pub const fn from_index(index: usize) -> Self { - debug_assert!(index < Self::COUNT); + debug_assert!(index < Self::MAX_COUNT); Self(index as _) } @@ -79,7 +96,6 @@ impl VirtualNode { /// Creates a virtual node from the given scalar representation. Used by `VNODE` expression. pub const fn from_scalar(scalar: i16) -> Self { - debug_assert!((scalar as usize) < Self::COUNT); Self(scalar as _) } @@ -99,7 +115,6 @@ impl VirtualNode { /// Creates a virtual node from the given big-endian bytes representation. pub const fn from_be_bytes(bytes: [u8; Self::SIZE]) -> Self { let inner = VirtualNodeInner::from_be_bytes(bytes); - debug_assert!((inner as usize) < Self::COUNT); Self(inner) } @@ -109,22 +124,21 @@ impl VirtualNode { } /// Iterates over all virtual nodes. - pub fn all() -> AllVirtualNodeIter { - (0..Self::COUNT).map(Self::from_index) + pub fn all(vnode_count: usize) -> AllVirtualNodeIter { + (0..vnode_count).map(Self::from_index) } } -impl VirtualNode { - pub const COUNT_FOR_TEST: usize = Self::COUNT; - pub const MAX_FOR_TEST: VirtualNode = Self::MAX; -} - impl VirtualNode { // `compute_chunk` is used to calculate the `VirtualNode` for the columns in the // chunk. When only one column is provided and its type is `Serial`, we consider the column to // be the one that contains RowId, and use a special method to skip the calculation of Hash // and directly extract the `VirtualNode` from `RowId`. - pub fn compute_chunk(data_chunk: &DataChunk, keys: &[usize]) -> Vec { + pub fn compute_chunk( + data_chunk: &DataChunk, + keys: &[usize], + vnode_count: usize, + ) -> Vec { if let Ok(idx) = keys.iter().exactly_one() && let ArrayImpl::Serial(serial_array) = &**data_chunk.column_at(*idx) { @@ -140,7 +154,7 @@ impl VirtualNode { // This process doesn’t guarantee the order of rows, producing indeterminate results in some cases, // such as when `distinct on` is used without an `order by`. let (row, _) = data_chunk.row_at(idx); - row.hash(Crc32FastBuilder).into() + row.hash(Crc32FastBuilder).to_vnode(vnode_count) } }) .collect(); @@ -149,19 +163,29 @@ impl VirtualNode { data_chunk .get_hash_values(keys, Crc32FastBuilder) .into_iter() - .map(|hash| hash.into()) + .map(|hash| hash.to_vnode(vnode_count)) .collect() } + /// Equivalent to [`Self::compute_chunk`] with [`VirtualNode::COUNT_FOR_TEST`] as the vnode count. + pub fn compute_chunk_for_test(data_chunk: &DataChunk, keys: &[usize]) -> Vec { + Self::compute_chunk(data_chunk, keys, Self::COUNT_FOR_TEST) + } + // `compute_row` is used to calculate the `VirtualNode` for the corresponding columns in a // `Row`. Similar to `compute_chunk`, it also contains special handling for serial columns. 
- pub fn compute_row(row: impl Row, indices: &[usize]) -> VirtualNode { + pub fn compute_row(row: impl Row, indices: &[usize], vnode_count: usize) -> VirtualNode { let project = row.project(indices); if let Ok(Some(ScalarRefImpl::Serial(s))) = project.iter().exactly_one().as_ref() { return extract_vnode_id_from_row_id(s.as_row_id()); } - project.hash(Crc32FastBuilder).into() + project.hash(Crc32FastBuilder).to_vnode(vnode_count) + } + + /// Equivalent to [`Self::compute_row`] with [`VirtualNode::COUNT_FOR_TEST`] as the vnode count. + pub fn compute_row_for_test(row: impl Row, indices: &[usize]) -> VirtualNode { + Self::compute_row(row, indices, Self::COUNT_FOR_TEST) } } @@ -184,7 +208,7 @@ mod tests { ); let chunk = DataChunk::from_pretty(chunk.as_str()); - let vnodes = VirtualNode::compute_chunk(&chunk, &[0]); + let vnodes = VirtualNode::compute_chunk_for_test(&chunk, &[0]); assert_eq!( vnodes.as_slice(), @@ -200,7 +224,7 @@ mod tests { Some(ScalarImpl::Int64(12345)), ]); - let vnode = VirtualNode::compute_row(&row, &[0]); + let vnode = VirtualNode::compute_row_for_test(&row, &[0]); assert_eq!(vnode, VirtualNode::from_index(100)); } @@ -221,7 +245,7 @@ mod tests { ); let chunk = DataChunk::from_pretty(chunk.as_str()); - let vnodes = VirtualNode::compute_chunk(&chunk, &[0]); + let vnodes = VirtualNode::compute_chunk_for_test(&chunk, &[0]); assert_eq!( vnodes.as_slice(), diff --git a/src/common/src/hash/table_distribution.rs b/src/common/src/hash/table_distribution.rs index 480483bc96a5d..5275aca04adb3 100644 --- a/src/common/src/hash/table_distribution.rs +++ b/src/common/src/hash/table_distribution.rs @@ -184,7 +184,7 @@ impl TableDistribution { /// Get vnode value with `indices` on the given `row`. pub fn compute_vnode(row: impl Row, indices: &[usize], vnodes: &Bitmap) -> VirtualNode { assert!(!indices.is_empty()); - let vnode = VirtualNode::compute_row(&row, indices); + let vnode = VirtualNode::compute_row(&row, indices, vnodes.len()); check_vnode_is_set(vnode, vnodes); tracing::debug!(target: "events::storage::storage_table", "compute vnode: {:?} key {:?} => {}", row, indices, vnode); @@ -219,7 +219,7 @@ impl TableDistribution { .map(|idx| pk_indices[*idx]) .collect_vec(); - VirtualNode::compute_chunk(chunk, &dist_key_indices) + VirtualNode::compute_chunk(chunk, &dist_key_indices, vnodes.len()) .into_iter() .zip_eq_fast(chunk.visibility().iter()) .map(|(vnode, vis)| { diff --git a/src/common/src/util/row_id.rs b/src/common/src/util/row_id.rs index 508f418903413..7f22c17e925e4 100644 --- a/src/common/src/util/row_id.rs +++ b/src/common/src/util/row_id.rs @@ -52,6 +52,7 @@ pub struct RowIdGenerator { pub type RowId = i64; +// TODO(var-vnode): how should we handle this for different virtual node counts? 
#[inline] pub fn extract_vnode_id_from_row_id(id: RowId) -> VirtualNode { let vnode_id = ((id >> VNODE_ID_SHIFT_BITS) & (VNODE_ID_UPPER_BOUND as i64 - 1)) as u32; diff --git a/src/common/src/util/scan_range.rs b/src/common/src/util/scan_range.rs index 5d5e84ed32085..cfe209cf2c22a 100644 --- a/src/common/src/util/scan_range.rs +++ b/src/common/src/util/scan_range.rs @@ -173,7 +173,7 @@ mod tests { Some(ScalarImpl::from(514)), ]); - let vnode = VirtualNode::compute_row(&row, &[0, 1]); + let vnode = VirtualNode::compute_row_for_test(&row, &[0, 1]); assert_eq!(scan_range.try_compute_vnode(&dist), Some(vnode)); } @@ -203,7 +203,7 @@ mod tests { Some(ScalarImpl::from(114514)), ]); - let vnode = VirtualNode::compute_row(&row, &[2, 1]); + let vnode = VirtualNode::compute_row_for_test(&row, &[2, 1]); assert_eq!(scan_range.try_compute_vnode(&dist), Some(vnode)); } diff --git a/src/common/src/vnode_mapping/vnode_placement.rs b/src/common/src/vnode_mapping/vnode_placement.rs index 5619ffc6e0f96..ccaf67db6a3c8 100644 --- a/src/common/src/vnode_mapping/vnode_placement.rs +++ b/src/common/src/vnode_mapping/vnode_placement.rs @@ -123,7 +123,7 @@ pub fn place_vnode( } None => { // No hint is provided, assign all vnodes to `temp_pu`. - for vnode in VirtualNode::all() { + for vnode in VirtualNode::all(VirtualNode::COUNT) { temp_slot.balance += 1; temp_slot.builder.set(vnode.to_index(), true); } diff --git a/src/expr/impl/src/scalar/vnode.rs b/src/expr/impl/src/scalar/vnode.rs index e544c39f62499..edd4caa39970e 100644 --- a/src/expr/impl/src/scalar/vnode.rs +++ b/src/expr/impl/src/scalar/vnode.rs @@ -43,7 +43,8 @@ impl Expression for VnodeExpression { } async fn eval(&self, input: &DataChunk) -> Result { - let vnodes = VirtualNode::compute_chunk(input, &self.dist_key_indices); + // TODO(var-vnode): get vnode count from context + let vnodes = VirtualNode::compute_chunk(input, &self.dist_key_indices, VirtualNode::COUNT); let mut builder = I16ArrayBuilder::new(input.capacity()); vnodes .into_iter() @@ -52,8 +53,9 @@ impl Expression for VnodeExpression { } async fn eval_row(&self, input: &OwnedRow) -> Result { + // TODO(var-vnode): get vnode count from context Ok(Some( - VirtualNode::compute_row(input, &self.dist_key_indices) + VirtualNode::compute_row(input, &self.dist_key_indices, VirtualNode::COUNT) .to_scalar() .into(), )) diff --git a/src/meta/src/controller/fragment.rs b/src/meta/src/controller/fragment.rs index 16228a06d0a9a..31575e72804f9 100644 --- a/src/meta/src/controller/fragment.rs +++ b/src/meta/src/controller/fragment.rs @@ -1411,7 +1411,7 @@ mod tests { use std::collections::{BTreeMap, HashMap}; use itertools::Itertools; - use risingwave_common::hash::ActorMapping; + use risingwave_common::hash::{ActorMapping, VirtualNode}; use risingwave_common::util::iter_util::ZipEqDebug; use risingwave_common::util::stream_graph_visitor::visit_stream_node; use risingwave_meta_model_v2::actor::ActorStatus; @@ -1497,8 +1497,11 @@ mod tests { }) .collect(); - let actor_bitmaps = - ActorMapping::new_uniform((0..actor_count).map(|i| i as _)).to_bitmaps(); + let actor_bitmaps = ActorMapping::new_uniform( + (0..actor_count).map(|i| i as _), + VirtualNode::COUNT_FOR_TEST, + ) + .to_bitmaps(); let pb_actors = (0..actor_count) .map(|actor_id| { @@ -1610,8 +1613,11 @@ mod tests { }) .collect(); - let mut actor_bitmaps = - ActorMapping::new_uniform((0..actor_count).map(|i| i as _)).to_bitmaps(); + let mut actor_bitmaps = ActorMapping::new_uniform( + (0..actor_count).map(|i| i as _), + VirtualNode::COUNT_FOR_TEST, + ) + 
.to_bitmaps(); let actors = (0..actor_count) .map(|actor_id| { diff --git a/src/meta/src/stream/stream_graph/schedule.rs b/src/meta/src/stream/stream_graph/schedule.rs index 0f9e473c26486..f338dd27725ca 100644 --- a/src/meta/src/stream/stream_graph/schedule.rs +++ b/src/meta/src/stream/stream_graph/schedule.rs @@ -25,7 +25,7 @@ use either::Either; use enum_as_inner::EnumAsInner; use itertools::Itertools; use risingwave_common::bitmap::Bitmap; -use risingwave_common::hash::{ActorMapping, WorkerSlotId, WorkerSlotMapping}; +use risingwave_common::hash::{ActorMapping, VirtualNode, WorkerSlotId, WorkerSlotMapping}; use risingwave_common::{bail, hash}; use risingwave_pb::common::{ActorInfo, WorkerNode}; use risingwave_pb::meta::table_fragments::fragment::{ @@ -235,7 +235,8 @@ impl Scheduler { assert_eq!(scheduled_worker_slots.len(), parallelism); // Build the default hash mapping uniformly. - let default_hash_mapping = WorkerSlotMapping::build_from_ids(&scheduled_worker_slots); + let default_hash_mapping = + WorkerSlotMapping::build_from_ids(&scheduled_worker_slots, VirtualNode::COUNT); let single_scheduled = schedule_units_for_slots(&slots, 1, streaming_job_id)?; let default_single_worker_id = single_scheduled.keys().exactly_one().cloned().unwrap(); diff --git a/src/meta/src/stream/stream_manager.rs b/src/meta/src/stream/stream_manager.rs index a8e8bc47752a5..d8b1dc131fec1 100644 --- a/src/meta/src/stream/stream_manager.rs +++ b/src/meta/src/stream/stream_manager.rs @@ -764,8 +764,7 @@ mod tests { use std::time::Duration; use futures::{Stream, TryStreamExt}; - use risingwave_common::hash; - use risingwave_common::hash::{ActorMapping, WorkerSlotId}; + use risingwave_common::hash::{self, ActorMapping, VirtualNode, WorkerSlotId}; use risingwave_common::system_param::reader::SystemParamsRead; use risingwave_pb::common::{HostAddress, WorkerType}; use risingwave_pb::meta::add_worker_node_request::Property; @@ -1137,12 +1136,14 @@ mod tests { } fn make_mview_stream_actors(table_id: &TableId, count: usize) -> Vec { - let mut actor_bitmaps: HashMap<_, _> = - ActorMapping::new_uniform((0..count).map(|i| i as hash::ActorId)) - .to_bitmaps() - .into_iter() - .map(|(actor_id, bitmap)| (actor_id, bitmap.to_protobuf())) - .collect(); + let mut actor_bitmaps: HashMap<_, _> = ActorMapping::new_uniform( + (0..count).map(|i| i as hash::ActorId), + VirtualNode::COUNT_FOR_TEST, + ) + .to_bitmaps() + .into_iter() + .map(|(actor_id, bitmap)| (actor_id, bitmap.to_protobuf())) + .collect(); (0..count) .map(|i| StreamActor { diff --git a/src/meta/src/stream/test_scale.rs b/src/meta/src/stream/test_scale.rs index 0dc0bced84005..54e619e473cd0 100644 --- a/src/meta/src/stream/test_scale.rs +++ b/src/meta/src/stream/test_scale.rs @@ -26,7 +26,7 @@ mod tests { use crate::stream::CustomActorInfo; fn simulated_parallelism(min: Option, max: Option) -> Vec { - let mut raw = vec![1, 3, 12, 42, VirtualNode::COUNT]; + let mut raw = vec![1, 3, 12, 42, VirtualNode::COUNT_FOR_TEST]; if let Some(min) = min { raw.retain(|n| *n > min); raw.push(min); @@ -39,7 +39,9 @@ mod tests { } fn build_fake_actors(actor_ids: Vec) -> Vec { - let actor_bitmaps = ActorMapping::new_uniform(actor_ids.clone().into_iter()).to_bitmaps(); + let actor_bitmaps = + ActorMapping::new_uniform(actor_ids.clone().into_iter(), VirtualNode::COUNT_FOR_TEST) + .to_bitmaps(); actor_ids .iter() .map(|actor_id| CustomActorInfo { @@ -55,7 +57,7 @@ mod tests { fn check_affinity_for_scale_in(bitmap: &Bitmap, actor: &CustomActorInfo) { let prev_bitmap = 
Bitmap::from(actor.vnode_bitmap.as_ref().unwrap()); - for idx in 0..VirtualNode::COUNT { + for idx in 0..VirtualNode::COUNT_FOR_TEST { if prev_bitmap.is_set(idx) { assert!(bitmap.is_set(idx)); } @@ -63,7 +65,9 @@ mod tests { } fn check_bitmaps(bitmaps: &HashMap) { - let mut target = (0..VirtualNode::COUNT).map(|_| false).collect_vec(); + let mut target = (0..VirtualNode::COUNT_FOR_TEST) + .map(|_| false) + .collect_vec(); for bitmap in bitmaps.values() { for (idx, pos) in target.iter_mut().enumerate() { @@ -89,9 +93,10 @@ mod tests { fn test_build_actor_mapping() { for parallelism in simulated_parallelism(None, None) { let actor_ids = (0..parallelism as ActorId).collect_vec(); - let actor_mapping = ActorMapping::new_uniform(actor_ids.into_iter()); + let actor_mapping = + ActorMapping::new_uniform(actor_ids.into_iter(), VirtualNode::COUNT_FOR_TEST); - assert_eq!(actor_mapping.len(), VirtualNode::COUNT); + assert_eq!(actor_mapping.len(), VirtualNode::COUNT_FOR_TEST); let mut check: HashMap> = HashMap::new(); for (vnode, actor_id) in actor_mapping.iter_with_vnode() { @@ -178,7 +183,7 @@ mod tests { #[test] fn test_rebalance_scale_out() { - for parallelism in simulated_parallelism(Some(3), Some(VirtualNode::COUNT - 1)) { + for parallelism in simulated_parallelism(Some(3), Some(VirtualNode::COUNT_FOR_TEST - 1)) { let actors = build_fake_actors((0..parallelism as ActorId).collect_vec()); // add 1 @@ -189,8 +194,9 @@ mod tests { let actors = build_fake_actors((0..parallelism as ActorId).collect_vec()); - // add to VirtualNode::COUNT - let actors_to_add = (parallelism as ActorId..VirtualNode::COUNT as ActorId).collect(); + // add to VirtualNode::COUNT_FOR_TEST + let actors_to_add = + (parallelism as ActorId..VirtualNode::COUNT_FOR_TEST as ActorId).collect(); let result = rebalance_actor_vnode(&actors, &BTreeSet::new(), &actors_to_add); assert_eq!(result.len(), actors.len() + actors_to_add.len()); check_bitmaps(&result); @@ -275,7 +281,7 @@ mod tests { #[test] fn test_rebalance_scale_real() { - let actor_ids = (0..(VirtualNode::COUNT - 1) as ActorId).collect_vec(); + let actor_ids = (0..(VirtualNode::COUNT_FOR_TEST - 1) as ActorId).collect_vec(); let actors = build_fake_actors(actor_ids); let actors_to_remove = btreeset! {0, 1}; let actors_to_add = btreeset! 
{255}; diff --git a/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs b/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs index 5fc10cd0cc58a..3114c22e63323 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs @@ -143,7 +143,7 @@ pub(crate) fn gen_multi_vnode_stream_chunks( .collect_vec(); let (ops, rows) = gen_sized_test_data(base, max_count); for (op, row) in zip_eq(ops, rows) { - let vnode = VirtualNode::compute_row(&row, &[TEST_SCHEMA_DIST_KEY_INDEX]); + let vnode = VirtualNode::compute_row_for_test(&row, &[TEST_SCHEMA_DIST_KEY_INDEX]); let (ops, builder) = &mut data_builder[vnode.to_index() % MOD_COUNT]; ops.push(op); assert!(builder.append_one_row(row).is_none()); @@ -177,9 +177,9 @@ pub(crate) fn gen_test_log_store_table(pk_info: &'static KvLogStorePkInfo) -> Pb pub(crate) fn calculate_vnode_bitmap<'a>( test_data: impl Iterator)>, ) -> Bitmap { - let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT); - for vnode in - test_data.map(|(_, row)| VirtualNode::compute_row(row, &[TEST_SCHEMA_DIST_KEY_INDEX])) + let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT_FOR_TEST); + for vnode in test_data + .map(|(_, row)| VirtualNode::compute_row_for_test(row, &[TEST_SCHEMA_DIST_KEY_INDEX])) { builder.set(vnode.to_index(), true); } diff --git a/src/stream/src/executor/dispatch.rs b/src/stream/src/executor/dispatch.rs index 82d11db49513b..4a43ff618ebf7 100644 --- a/src/stream/src/executor/dispatch.rs +++ b/src/stream/src/executor/dispatch.rs @@ -755,7 +755,8 @@ impl Dispatcher for HashDataDispatcher { let num_outputs = self.outputs.len(); // get hash value of every line by its key - let vnodes = VirtualNode::compute_chunk(chunk.data_chunk(), &self.keys); + let vnode_count = self.hash_mapping.len(); + let vnodes = VirtualNode::compute_chunk(chunk.data_chunk(), &self.keys, vnode_count); tracing::debug!(target: "events::stream::dispatch::hash", "\n{}\n keys {:?} => {:?}", chunk.to_pretty(), self.keys, vnodes); From 1d220eeba6f813673f8efc72b1fcf9e449171cf0 Mon Sep 17 00:00:00 2001 From: Xinhao Xu <84456268+xxhZs@users.noreply.github.com> Date: Thu, 5 Sep 2024 18:32:58 +0800 Subject: [PATCH 06/32] fix(error): fix sinkError and connector error (#18425) --- src/stream/src/executor/error.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/stream/src/executor/error.rs b/src/stream/src/executor/error.rs index fa625d8bb8cec..66070ba81e90c 100644 --- a/src/stream/src/executor/error.rs +++ b/src/stream/src/executor/error.rs @@ -67,7 +67,12 @@ pub enum ErrorKind { ), #[error("Sink error: sink_id={1}, error: {0}")] - SinkError(SinkError, u32), + SinkError( + #[source] + #[backtrace] + SinkError, + u32, + ), #[error(transparent)] RpcError( @@ -90,7 +95,11 @@ pub enum ErrorKind { AlignBarrier(Box, Box), #[error("Connector error: {0}")] - ConnectorError(BoxedError), + ConnectorError( + #[source] + #[backtrace] + BoxedError, + ), #[error(transparent)] DmlError( From 0134191ca8055543ece575e9f074f4ef3b23ac45 Mon Sep 17 00:00:00 2001 From: Li0k Date: Thu, 5 Sep 2024 18:38:35 +0800 Subject: [PATCH 07/32] fix(storage): fix correct_commit_ssts with sst table_ids (#18414) --- src/meta/src/hummock/manager/commit_epoch.rs | 35 +++++-------------- src/meta/src/hummock/manager/tests.rs | 17 ++++++++- .../compaction_group/hummock_version_ext.rs | 18 +++++----- src/storage/hummock_sdk/src/sstable_info.rs | 4 +++ .../hummock_test/src/hummock_storage_tests.rs | 
15 +++++++- 5 files changed, 53 insertions(+), 36 deletions(-) diff --git a/src/meta/src/hummock/manager/commit_epoch.rs b/src/meta/src/hummock/manager/commit_epoch.rs index 08428e5472e23..4f7a62da41779 100644 --- a/src/meta/src/hummock/manager/commit_epoch.rs +++ b/src/meta/src/hummock/manager/commit_epoch.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use risingwave_common::catalog::TableId; use risingwave_hummock_sdk::change_log::ChangeLogDelta; @@ -220,23 +220,8 @@ impl HummockManager { NewTableFragmentInfo::None => (HashMap::new(), None, None), }; - let mut group_members_table_ids: HashMap> = HashMap::new(); - { - // expand group_members_table_ids - for (table_id, group_id) in &table_compaction_group_mapping { - group_members_table_ids - .entry(*group_id) - .or_default() - .insert(*table_id); - } - } - let commit_sstables = self - .correct_commit_ssts( - sstables, - &table_compaction_group_mapping, - &group_members_table_ids, - ) + .correct_commit_ssts(sstables, &table_compaction_group_mapping) .await?; let modified_compaction_groups: Vec<_> = commit_sstables.keys().cloned().collect(); @@ -389,7 +374,6 @@ impl HummockManager { &self, sstables: Vec, table_compaction_group_mapping: &HashMap, - group_members_table_ids: &HashMap>, ) -> Result>> { let mut new_sst_id_number = 0; let mut sst_to_cg_vec = Vec::with_capacity(sstables.len()); @@ -424,17 +408,16 @@ impl HummockManager { let mut commit_sstables: BTreeMap> = BTreeMap::new(); for (mut sst, group_table_ids) in sst_to_cg_vec { - for (group_id, match_ids) in group_table_ids { - let group_members_table_ids = group_members_table_ids.get(&group_id).unwrap(); - if match_ids - .iter() - .all(|id| group_members_table_ids.contains(&TableId::new(*id))) - { + let len = group_table_ids.len(); + for (index, (group_id, match_ids)) in group_table_ids.into_iter().enumerate() { + if sst.sst_info.table_ids == match_ids { + // The SST contains all the tables in the group should be last key + assert!(index == len - 1); commit_sstables .entry(group_id) .or_default() - .push(sst.sst_info.clone()); - continue; + .push(sst.sst_info); + break; } let origin_sst_size = sst.sst_info.sst_size; diff --git a/src/meta/src/hummock/manager/tests.rs b/src/meta/src/hummock/manager/tests.rs index 56b4836f585a1..dca7311f4778f 100644 --- a/src/meta/src/hummock/manager/tests.rs +++ b/src/meta/src/hummock/manager/tests.rs @@ -1327,7 +1327,22 @@ async fn test_split_compaction_group_on_commit() { sst_size: 100, ..Default::default() }, - table_stats: Default::default(), + table_stats: HashMap::from([ + ( + 100, + TableStats { + total_compressed_size: 50, + ..Default::default() + }, + ), + ( + 101, + TableStats { + total_compressed_size: 50, + ..Default::default() + }, + ), + ]), }; hummock_manager .commit_epoch_for_test(30, vec![sst_1], HashMap::from([(10, context_id)])) diff --git a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index ca6585f46fd51..f24a125aa7f01 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -354,7 +354,7 @@ impl HummockVersion { &mut self, parent_group_id: CompactionGroupId, group_id: CompactionGroupId, - member_table_ids: HashSet, + member_table_ids: BTreeSet, 
new_sst_start_id: u64, ) { let mut new_sst_id = new_sst_start_id; @@ -594,7 +594,7 @@ impl HummockVersion { } else { #[expect(deprecated)] // for backward-compatibility of previous hummock version delta - HashSet::from_iter(group_construct.table_ids.clone()) + BTreeSet::from_iter(group_construct.table_ids.clone()) }; self.init_with_parent_group( @@ -614,7 +614,7 @@ impl HummockVersion { self.init_with_parent_group( group_change.origin_group_id, group_change.target_group_id, - HashSet::from_iter(group_change.table_ids.clone()), + BTreeSet::from_iter(group_change.table_ids.clone()), group_change.new_sst_start_id, ); @@ -998,7 +998,7 @@ pub fn build_initial_compaction_group_levels( } fn split_sst_info_for_level( - member_table_ids: &HashSet, + member_table_ids: &BTreeSet, level: &mut Level, new_sst_id: &mut u64, ) -> Vec { @@ -1338,7 +1338,7 @@ pub fn split_sst( new_sst_id: &mut u64, old_sst_size: u64, new_sst_size: u64, - new_sst_table_ids: Vec, + new_table_ids: Vec, ) -> SstableInfo { let mut branch_table_info = sst_info.clone(); branch_table_info.sst_id = *new_sst_id; @@ -1350,9 +1350,11 @@ pub fn split_sst( { // related github.com/risingwavelabs/risingwave/pull/17898/ // This is a temporary implementation that will update `table_ids`` based on the new split rule after PR 17898 - - let set1: HashSet<_> = sst_info.table_ids.iter().cloned().collect(); - let set2: HashSet<_> = new_sst_table_ids.iter().cloned().collect(); + // sst_info.table_ids = vec[1, 2, 3]; + // new_table_ids = vec[2, 3, 4]; + // branch_table_info.table_ids = vec[1, 2, 3] ∩ vec[2, 3, 4] = vec[2, 3] + let set1: BTreeSet<_> = sst_info.table_ids.iter().cloned().collect(); + let set2: BTreeSet<_> = new_table_ids.into_iter().collect(); let intersection: Vec<_> = set1.intersection(&set2).cloned().collect(); // Update table_ids diff --git a/src/storage/hummock_sdk/src/sstable_info.rs b/src/storage/hummock_sdk/src/sstable_info.rs index 9970c60f506c8..20943e4dd101a 100644 --- a/src/storage/hummock_sdk/src/sstable_info.rs +++ b/src/storage/hummock_sdk/src/sstable_info.rs @@ -63,6 +63,7 @@ impl SstableInfo { impl From for SstableInfo { fn from(pb_sstable_info: PbSstableInfo) -> Self { + assert!(pb_sstable_info.table_ids.is_sorted()); Self { object_id: pb_sstable_info.object_id, sst_id: pb_sstable_info.sst_id, @@ -100,6 +101,7 @@ impl From for SstableInfo { impl From<&PbSstableInfo> for SstableInfo { fn from(pb_sstable_info: &PbSstableInfo) -> Self { + assert!(pb_sstable_info.table_ids.is_sorted()); Self { object_id: pb_sstable_info.object_id, sst_id: pb_sstable_info.sst_id, @@ -137,6 +139,7 @@ impl From<&PbSstableInfo> for SstableInfo { impl From for PbSstableInfo { fn from(sstable_info: SstableInfo) -> Self { assert!(sstable_info.sst_size > 0 || sstable_info.is_stripped()); + assert!(sstable_info.table_ids.is_sorted()); PbSstableInfo { object_id: sstable_info.object_id, sst_id: sstable_info.sst_id, @@ -175,6 +178,7 @@ impl From for PbSstableInfo { impl From<&SstableInfo> for PbSstableInfo { fn from(sstable_info: &SstableInfo) -> Self { assert!(sstable_info.sst_size > 0 || sstable_info.is_stripped()); + assert!(sstable_info.table_ids.is_sorted()); PbSstableInfo { object_id: sstable_info.object_id, sst_id: sstable_info.sst_id, diff --git a/src/storage/hummock_test/src/hummock_storage_tests.rs b/src/storage/hummock_test/src/hummock_storage_tests.rs index 7f3d35f16b80b..fc0fd6ae97b4f 100644 --- a/src/storage/hummock_test/src/hummock_storage_tests.rs +++ b/src/storage/hummock_test/src/hummock_storage_tests.rs @@ -31,6 +31,7 @@ use 
risingwave_hummock_sdk::key::{ gen_key_from_bytes, prefixed_range_with_vnode, FullKey, TableKey, UserKey, TABLE_PREFIX_LEN, }; use risingwave_hummock_sdk::sstable_info::SstableInfo; +use risingwave_hummock_sdk::table_stats::TableStats; use risingwave_hummock_sdk::table_watermark::{ TableWatermarksIndex, VnodeWatermark, WatermarkDirection, }; @@ -2510,8 +2511,20 @@ async fn test_commit_multi_epoch() { new_table_watermarks: Default::default(), sst_to_context: context_id_map(&[sst.object_id]), sstables: vec![LocalSstableInfo { + table_stats: sst + .table_ids + .iter() + .map(|&table_id| { + ( + table_id, + TableStats { + total_compressed_size: 10, + ..Default::default() + }, + ) + }) + .collect(), sst_info: sst, - table_stats: Default::default(), }], new_table_fragment_info, change_log_delta: Default::default(), From 79ba86964acf0616693a8534c9eac1a67eb2889d Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Thu, 5 Sep 2024 19:20:02 +0800 Subject: [PATCH 08/32] fix(backup): fix restore order (#18423) --- src/meta/src/backup_restore/restore_impl/v2.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meta/src/backup_restore/restore_impl/v2.rs b/src/meta/src/backup_restore/restore_impl/v2.rs index a887293e0c8ef..938050ce4d300 100644 --- a/src/meta/src/backup_restore/restore_impl/v2.rs +++ b/src/meta/src/backup_restore/restore_impl/v2.rs @@ -106,8 +106,8 @@ impl Writer for WriterModelV2ToMetaStoreV2 { insert_models(metadata.workers.clone(), db).await?; insert_models(metadata.worker_properties.clone(), db).await?; insert_models(metadata.users.clone(), db).await?; - insert_models(metadata.user_privileges.clone(), db).await?; insert_models(metadata.objects.clone(), db).await?; + insert_models(metadata.user_privileges.clone(), db).await?; insert_models(metadata.object_dependencies.clone(), db).await?; insert_models(metadata.databases.clone(), db).await?; insert_models(metadata.schemas.clone(), db).await?; From df7f54f0b35cfe39937ae49bf9b5dab4343d51de Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Thu, 5 Sep 2024 22:59:18 +0800 Subject: [PATCH 09/32] fix(meta): fix catalog ref count (#18427) --- e2e_test/backup_restore/tpch_snapshot_create.slt | 3 +++ e2e_test/backup_restore/tpch_snapshot_drop.slt | 3 +++ src/meta/src/manager/catalog/user.rs | 1 + src/storage/backup/integration_tests/test_basic.sh | 14 +++++++++++++- 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/e2e_test/backup_restore/tpch_snapshot_create.slt b/e2e_test/backup_restore/tpch_snapshot_create.slt index c1fad2a2e0759..bb14dd369b837 100644 --- a/e2e_test/backup_restore/tpch_snapshot_create.slt +++ b/e2e_test/backup_restore/tpch_snapshot_create.slt @@ -1,5 +1,8 @@ include ../tpch/create_tables.slt.part +statement ok +CREATE SECRET secret1 WITH (backend = 'meta') AS 'demo-secret' + # First, insert the data into the tables include ../tpch/insert_customer.slt.part include ../tpch/insert_lineitem.slt.part diff --git a/e2e_test/backup_restore/tpch_snapshot_drop.slt b/e2e_test/backup_restore/tpch_snapshot_drop.slt index 0e593371347b7..27d271c35c617 100644 --- a/e2e_test/backup_restore/tpch_snapshot_drop.slt +++ b/e2e_test/backup_restore/tpch_snapshot_drop.slt @@ -1,3 +1,6 @@ +statement ok +DROP SECRET secret1; + statement ok drop materialized view tpch_q7; diff --git a/src/meta/src/manager/catalog/user.rs b/src/meta/src/manager/catalog/user.rs index 81181b0fc1e17..68e5e31395c0d 100644 --- a/src/meta/src/manager/catalog/user.rs +++ 
b/src/meta/src/manager/catalog/user.rs @@ -74,6 +74,7 @@ impl UserManager { .values() .map(|connection| connection.owner), ) + .chain(database.secrets.values().map(|secret| secret.owner)) .for_each(|owner_id| user_manager.increase_ref(owner_id)); Ok(user_manager) diff --git a/src/storage/backup/integration_tests/test_basic.sh b/src/storage/backup/integration_tests/test_basic.sh index afaee3ac6c507..9674807e62c6e 100644 --- a/src/storage/backup/integration_tests/test_basic.sh +++ b/src/storage/backup/integration_tests/test_basic.sh @@ -34,12 +34,20 @@ if ! psql -h localhost -p 4566 -d dev -U root -c "show materialized views;" | gr echo "expect 0 MV" exit 1 fi +if ! psql -h localhost -p 4566 -d dev -U root -c "show secrets;" | grep -q "0 row"; then + echo "expect 0 SECRET" + exit 1 +fi echo "restore snapshot ${job_id_1} succeeded" restore "${job_id_2}" start_cluster if ! psql -h localhost -p 4566 -d dev -U root -c "show materialized views;" | grep -q "1 row"; then - echo "expect 1 MVs" + echo "expect 1 MV" + exit 1 +fi +if ! psql -h localhost -p 4566 -d dev -U root -c "show secrets;" | grep -q "1 row"; then + echo "expect 1 SECRET" exit 1 fi echo "restore snapshot ${job_id_2} succeeded" @@ -55,6 +63,10 @@ if ! psql -h localhost -p 4566 -d dev -U root -c "show materialized views;" | gr echo "expect 0 MV" exit 1 fi +if ! psql -h localhost -p 4566 -d dev -U root -c "show secrets;" | grep -q "0 row"; then + echo "expect 0 SECRET" + exit 1 +fi echo "restore snapshot ${job_id_3} succeeded" echo "test succeeded" From 6489e9ac8cb656810196a71426ee7dcf3467a2bd Mon Sep 17 00:00:00 2001 From: xxchan Date: Fri, 6 Sep 2024 12:59:14 +0800 Subject: [PATCH 10/32] feat: track progress for SourceBackfill (blocking DDL) (#18112) Signed-off-by: xxchan --- Cargo.lock | 1 + src/meta/Cargo.toml | 1 + src/meta/src/barrier/command.rs | 4 +- src/meta/src/barrier/progress.rs | 10 +++ src/meta/src/manager/metadata.rs | 1 + src/meta/src/model/stream.rs | 4 +- src/meta/src/stream/scale.rs | 2 + .../source/source_backfill_executor.rs | 81 ++++++++++++++++--- .../source/source_backfill_state_table.rs | 1 + src/stream/src/from_proto/source_backfill.rs | 4 + .../src/task/barrier_manager/managed_state.rs | 3 + 11 files changed, 100 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4e648e08a3fea..99423896f89f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11213,6 +11213,7 @@ dependencies = [ "comfy-table", "crepe", "easy-ext", + "educe", "either", "enum-as-inner 0.6.0", "expect-test", diff --git a/src/meta/Cargo.toml b/src/meta/Cargo.toml index 4511e9f61d894..a7f37bf505910 100644 --- a/src/meta/Cargo.toml +++ b/src/meta/Cargo.toml @@ -28,6 +28,7 @@ clap = { workspace = true } comfy-table = "7" crepe = "0.1" easy-ext = "1" +educe = "0.6" either = "1" enum-as-inner = "0.6" etcd-client = { workspace = true } diff --git a/src/meta/src/barrier/command.rs b/src/meta/src/barrier/command.rs index 6e4ebe40b93b0..a1c82ccd8db83 100644 --- a/src/meta/src/barrier/command.rs +++ b/src/meta/src/barrier/command.rs @@ -147,8 +147,10 @@ impl ReplaceTablePlan { } } -#[derive(Debug, Clone)] +#[derive(educe::Educe, Clone)] +#[educe(Debug)] pub struct CreateStreamingJobCommandInfo { + #[educe(Debug(ignore))] pub table_fragments: TableFragments, /// Refer to the doc on [`MetadataManager::get_upstream_root_fragments`] for the meaning of "root". 
pub upstream_root_actors: HashMap>, diff --git a/src/meta/src/barrier/progress.rs b/src/meta/src/barrier/progress.rs index 5754e4c60e364..2e1b6f9dc397a 100644 --- a/src/meta/src/barrier/progress.rs +++ b/src/meta/src/barrier/progress.rs @@ -55,6 +55,7 @@ pub(super) struct Progress { upstream_mv_count: HashMap, /// Total key count in the upstream materialized view + /// TODO: implement this for source backfill upstream_total_key_count: u64, /// Consumed rows @@ -122,6 +123,12 @@ impl Progress { /// Returns whether all backfill executors are done. fn is_done(&self) -> bool { + tracing::trace!( + "Progress::is_done? {}, {}, {:?}", + self.done_count, + self.states.len(), + self.states + ); self.done_count == self.states.len() } @@ -274,6 +281,7 @@ pub(super) struct TrackingCommand { /// 4. With `actor_map` we can use an actor's `ActorId` to find the ID of the `StreamJob`. #[derive(Default, Debug)] pub(super) struct CreateMviewProgressTracker { + // TODO: add a specialized progress for source /// Progress of the create-mview DDL indicated by the `TableId`. progress_map: HashMap, @@ -494,6 +502,7 @@ impl CreateMviewProgressTracker { replace_table: Option<&ReplaceTablePlan>, version_stats: &HummockVersionStats, ) -> Option { + tracing::trace!(?info, "add job to track"); let (info, actors, replace_table_info) = { let CreateStreamingJobCommandInfo { table_fragments, .. @@ -596,6 +605,7 @@ impl CreateMviewProgressTracker { progress: &CreateMviewProgress, version_stats: &HummockVersionStats, ) -> Option { + tracing::trace!(?progress, "update progress"); let actor = progress.backfill_actor_id; let Some(table_id) = self.actor_map.get(&actor).copied() else { // On restart, backfill will ALWAYS notify CreateMviewProgressTracker, diff --git a/src/meta/src/manager/metadata.rs b/src/meta/src/manager/metadata.rs index 52fc811787d30..935d4773865ed 100644 --- a/src/meta/src/manager/metadata.rs +++ b/src/meta/src/manager/metadata.rs @@ -917,6 +917,7 @@ impl MetadataManager { &self, job: &StreamingJob, ) -> MetaResult { + tracing::debug!("wait_streaming_job_finished: {job:?}"); match self { MetadataManager::V1(mgr) => mgr.wait_streaming_job_finished(job).await, MetadataManager::V2(mgr) => mgr.wait_streaming_job_finished(job.id() as _).await, diff --git a/src/meta/src/model/stream.rs b/src/meta/src/model/stream.rs index 447cf5cf85645..aaff076688785 100644 --- a/src/meta/src/model/stream.rs +++ b/src/meta/src/model/stream.rs @@ -363,7 +363,9 @@ impl TableFragments { return vec![]; } if (fragment.fragment_type_mask - & (FragmentTypeFlag::Values as u32 | FragmentTypeFlag::StreamScan as u32)) + & (FragmentTypeFlag::Values as u32 + | FragmentTypeFlag::StreamScan as u32 + | FragmentTypeFlag::SourceScan as u32)) != 0 { actor_ids.extend(fragment.actors.iter().map(|actor| actor.actor_id)); diff --git a/src/meta/src/stream/scale.rs b/src/meta/src/stream/scale.rs index d10fa83710d85..8e9b5be690439 100644 --- a/src/meta/src/stream/scale.rs +++ b/src/meta/src/stream/scale.rs @@ -441,6 +441,8 @@ pub struct ScaleController { pub env: MetaSrvEnv, + /// We will acquire lock during DDL to prevent scaling operations on jobs that are in the creating state. + /// e.g., a MV cannot be rescheduled during foreground backfill. 
pub reschedule_lock: RwLock<()>, } diff --git a/src/stream/src/executor/source/source_backfill_executor.rs b/src/stream/src/executor/source/source_backfill_executor.rs index b28c707bdedd0..09a4d0a40f1cb 100644 --- a/src/stream/src/executor/source/source_backfill_executor.rs +++ b/src/stream/src/executor/source/source_backfill_executor.rs @@ -14,6 +14,7 @@ use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; +use std::sync::Once; use std::time::Instant; use anyhow::anyhow; @@ -30,6 +31,7 @@ use risingwave_connector::source::{ BackfillInfo, BoxChunkSourceStream, SourceContext, SourceCtrlOpts, SplitId, SplitImpl, SplitMetaData, }; +use risingwave_hummock_sdk::HummockReadEpoch; use serde::{Deserialize, Serialize}; use thiserror_ext::AsReport; @@ -40,6 +42,7 @@ use crate::common::rate_limit::limited_chunk_size; use crate::executor::prelude::*; use crate::executor::source::source_executor::WAIT_BARRIER_MULTIPLE_TIMES; use crate::executor::UpdateMutation; +use crate::task::CreateMviewProgress; #[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] pub enum BackfillState { @@ -88,6 +91,8 @@ pub struct SourceBackfillExecutorInner { /// Rate limit in rows/s. rate_limit_rps: Option, + + progress: CreateMviewProgress, } /// Local variables used in the backfill stage. @@ -230,6 +235,7 @@ impl BackfillStage { } impl SourceBackfillExecutorInner { + #[expect(clippy::too_many_arguments)] pub fn new( actor_ctx: ActorContextRef, info: ExecutorInfo, @@ -238,6 +244,7 @@ impl SourceBackfillExecutorInner { system_params: SystemParamsReaderRef, backfill_state_store: BackfillStateTableHandler, rate_limit_rps: Option, + progress: CreateMviewProgress, ) -> Self { let source_split_change_count = metrics .source_split_change_count @@ -247,6 +254,7 @@ impl SourceBackfillExecutorInner { &actor_ctx.id.to_string(), &actor_ctx.fragment_id.to_string(), ]); + Self { actor_ctx, info, @@ -256,6 +264,7 @@ impl SourceBackfillExecutorInner { source_split_change_count, system_params, rate_limit_rps, + progress, } } @@ -346,7 +355,6 @@ impl SourceBackfillExecutorInner { splits: owned_splits, }; backfill_stage.debug_assert_consistent(); - tracing::debug!(?backfill_stage, "source backfill started"); // Return the ownership of `stream_source_core` to the source executor. self.stream_source_core = core; @@ -370,6 +378,7 @@ impl SourceBackfillExecutorInner { } } } + tracing::debug!(?backfill_stage, "source backfill started"); fn select_strategy(_: &mut ()) -> PollNext { futures::stream::PollNext::Left @@ -407,9 +416,23 @@ impl SourceBackfillExecutorInner { pause_reader!(); } + let state_store = self.backfill_state_store.state_store.state_store().clone(); + static STATE_TABLE_INITIALIZED: Once = Once::new(); + tokio::spawn(async move { + // This is for self.backfill_finished() to be safe. + // We wait for 1st epoch's curr, i.e., the 2nd epoch's prev. + let epoch = barrier.epoch.curr; + tracing::info!("waiting for epoch: {}", epoch); + state_store + .try_wait_epoch(HummockReadEpoch::Committed(epoch)) + .await + .expect("failed to wait epoch"); + STATE_TABLE_INITIALIZED.call_once(|| ()); + tracing::info!("finished waiting for epoch: {}", epoch); + }); yield Message::Barrier(barrier); - if !self.backfill_finished(&backfill_stage.states).await? { + { let source_backfill_row_count = self .metrics .source_backfill_row_count @@ -552,10 +575,26 @@ impl SourceBackfillExecutorInner { .commit(barrier.epoch) .await?; - yield Message::Barrier(barrier); - - if self.backfill_finished(&backfill_stage.states).await? 
{ - break 'backfill_loop; + if self.should_report_finished(&backfill_stage.states) { + // TODO: use a specialized progress for source + // Currently, `CreateMviewProgress` is designed for MV backfill, and rw_ddl_progress calculates + // progress based on the number of consumed rows and an estimated total number of rows from hummock. + // For now, we just rely on the same code path, and for source backfill, the progress will always be 99.99%. + tracing::info!("progress finish"); + let epoch = barrier.epoch; + self.progress.finish(epoch, 114514); + // yield barrier after reporting progress + yield Message::Barrier(barrier); + + // After we reported finished, we still don't exit the loop. + // Because we need to handle split migration. + if STATE_TABLE_INITIALIZED.is_completed() + && self.backfill_finished(&backfill_stage.states).await? + { + break 'backfill_loop; + } + } else { + yield Message::Barrier(barrier); } } Message::Chunk(chunk) => { @@ -665,7 +704,7 @@ impl SourceBackfillExecutorInner { self.apply_split_change_forward_stage( actor_splits, &mut splits, - true, + false, ) .await?; } @@ -688,11 +727,34 @@ impl SourceBackfillExecutorInner { } } - /// All splits finished backfilling. + /// When we should call `progress.finish()` to let blocking DDL return. + /// We report as soon as `SourceCachingUp`. Otherwise the DDL might be blocked forever until upstream messages come. + /// + /// Note: split migration (online scaling) is related with progress tracking. + /// - For foreground DDL, scaling is not allowed before progress is finished. + /// - For background DDL, scaling is skipped when progress is not finished, and can be triggered by recreating actors during recovery. + /// + /// See for more details. + fn should_report_finished(&self, states: &BackfillStates) -> bool { + states.values().all(|state| { + matches!( + state, + BackfillState::Finished | BackfillState::SourceCachingUp(_) + ) + }) + } + + /// All splits entered `Finished` state. /// /// We check all splits for the source, including other actors' splits here, before going to the forward stage. - /// Otherwise if we break early, but after rescheduling, an unfinished split is migrated to + /// Otherwise if we `break` early, but after rescheduling, an unfinished split is migrated to /// this actor, we still need to backfill it. + /// + /// Note: at the beginning, the actor will only read the state written by itself. + /// It needs to _wait until it can read all actors' written data_. + /// i.e., wait for the first checkpoint has been available. + /// + /// See for more details. async fn backfill_finished(&self, states: &BackfillStates) -> StreamExecutorResult { Ok(states .values() @@ -761,7 +823,6 @@ impl SourceBackfillExecutorInner { } Some(backfill_state) => { // Migrated split. Backfill if unfinished. - // TODO: disallow online scaling during backfilling. 
target_state.insert(split_id, backfill_state); } } diff --git a/src/stream/src/executor/source/source_backfill_state_table.rs b/src/stream/src/executor/source/source_backfill_state_table.rs index be9abe8490e63..3579aff2ec4fb 100644 --- a/src/stream/src/executor/source/source_backfill_state_table.rs +++ b/src/stream/src/executor/source/source_backfill_state_table.rs @@ -76,6 +76,7 @@ impl BackfillStateTableHandler { }; ret.push(state); } + tracing::trace!("scan SourceBackfill state table: {:?}", ret); Ok(ret) } diff --git a/src/stream/src/from_proto/source_backfill.rs b/src/stream/src/from_proto/source_backfill.rs index ba3ab599af700..65329a26bd40b 100644 --- a/src/stream/src/from_proto/source_backfill.rs +++ b/src/stream/src/from_proto/source_backfill.rs @@ -72,6 +72,9 @@ impl ExecutorBuilder for SourceBackfillExecutorBuilder { source_desc_builder, state_table_handler, ); + let progress = params + .local_barrier_manager + .register_create_mview_progress(params.actor_context.id); let exec = SourceBackfillExecutorInner::new( params.actor_context.clone(), @@ -81,6 +84,7 @@ impl ExecutorBuilder for SourceBackfillExecutorBuilder { params.env.system_params_manager_ref().get_params(), backfill_state_table, node.rate_limit, + progress, ); let [input]: [_; 1] = params.input.try_into().unwrap(); diff --git a/src/stream/src/task/barrier_manager/managed_state.rs b/src/stream/src/task/barrier_manager/managed_state.rs index 5ccde5004801d..6f21e32adc107 100644 --- a/src/stream/src/task/barrier_manager/managed_state.rs +++ b/src/stream/src/task/barrier_manager/managed_state.rs @@ -372,6 +372,9 @@ pub(super) struct PartialGraphManagedBarrierState { prev_barrier_table_ids: Option<(EpochPair, HashSet)>, /// Record the progress updates of creating mviews for each epoch of concurrent checkpoints. + /// + /// This is updated by [`super::CreateMviewProgress::update`] and will be reported to meta + /// in [`BarrierCompleteResult`]. 
pub(super) create_mview_progress: HashMap>, pub(super) state_store: StateStoreImpl, From 5311069848eb3d74bb71fa7d3bafa8c7967420d1 Mon Sep 17 00:00:00 2001 From: StrikeW Date: Fri, 6 Sep 2024 14:38:55 +0800 Subject: [PATCH 11/32] fix(sink): set query timeout for jdbc sink to avoid stuck (#18430) --- .../java/com/risingwave/connector/JDBCSink.java | 16 +++++++++++----- .../com/risingwave/connector/JDBCSinkConfig.java | 7 +++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java index 10aa371c50aec..02297a4ea57dd 100644 --- a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java +++ b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java @@ -71,12 +71,13 @@ public JDBCSink(JDBCSinkConfig config, TableSchema tableSchema) { .collect(Collectors.toList()); LOG.info( - "schema = {}, table = {}, tableSchema = {}, columnSqlTypes = {}, pkIndices = {}", + "schema = {}, table = {}, tableSchema = {}, columnSqlTypes = {}, pkIndices = {}, queryTimeout = {}", config.getSchemaName(), config.getTableName(), tableSchema, columnSqlTypes, - pkIndices); + pkIndices, + config.getQueryTimeout()); if (factory.isPresent()) { this.jdbcDialect = factory.get().create(columnSqlTypes, pkIndices); @@ -92,7 +93,7 @@ public JDBCSink(JDBCSinkConfig config, TableSchema tableSchema) { // Commit the `getTransactionIsolation` conn.commit(); - jdbcStatements = new JdbcStatements(conn); + jdbcStatements = new JdbcStatements(conn, config.getQueryTimeout()); } catch (SQLException e) { throw Status.INTERNAL .withDescription( @@ -173,7 +174,7 @@ public boolean write(Iterable rows) { conn = JdbcUtils.getConnection(config.getJdbcUrl()); // reset the flag since we will retry to prepare the batch again updateFlag = false; - jdbcStatements = new JdbcStatements(conn); + jdbcStatements = new JdbcStatements(conn, config.getQueryTimeout()); } else { throw io.grpc.Status.INTERNAL .withDescription( @@ -206,13 +207,15 @@ public boolean write(Iterable rows) { * across multiple batches if only the JDBC connection is valid. 
*/ class JdbcStatements implements AutoCloseable { + private final int queryTimeoutSecs; private PreparedStatement deleteStatement; private PreparedStatement upsertStatement; private PreparedStatement insertStatement; private final Connection conn; - public JdbcStatements(Connection conn) throws SQLException { + public JdbcStatements(Connection conn, int queryTimeoutSecs) throws SQLException { + this.queryTimeoutSecs = queryTimeoutSecs; this.conn = conn; var schemaTableName = jdbcDialect.createSchemaTableName( @@ -339,6 +342,9 @@ private void executeStatement(PreparedStatement stmt) throws SQLException { if (stmt == null) { return; } + // if timeout occurs, a SQLTimeoutException will be thrown + // and we will retry to write the stream chunk in `JDBCSink.write` + stmt.setQueryTimeout(queryTimeoutSecs); LOG.debug("Executing statement: {}", stmt); stmt.executeBatch(); stmt.clearParameters(); diff --git a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java index ca74ac6a8eb74..94eb5cdc7e0ff 100644 --- a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java +++ b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java @@ -32,6 +32,9 @@ public class JDBCSinkConfig extends CommonSinkConfig { @JsonProperty(value = "schema.name") private String schemaName; + @JsonProperty(value = "jdbc.query.timeout") + private int queryTimeoutSeconds = 600; + @JsonCreator public JDBCSinkConfig( @JsonProperty(value = "jdbc.url") String jdbcUrl, @@ -62,4 +65,8 @@ public String getSinkType() { public boolean isUpsertSink() { return this.isUpsertSink; } + + public int getQueryTimeout() { + return queryTimeoutSeconds; + } } From b00d750abe8c03d52b0a18af8077ed0f7ad3aa70 Mon Sep 17 00:00:00 2001 From: Li0k Date: Fri, 6 Sep 2024 14:39:44 +0800 Subject: [PATCH 12/32] fix(storage): fix duplicated sst_id generated by split_sst function (#18431) --- .../hummock_sdk/src/compaction_group/hummock_version_ext.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index f24a125aa7f01..c54dd05b25d28 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -1343,9 +1343,11 @@ pub fn split_sst( let mut branch_table_info = sst_info.clone(); branch_table_info.sst_id = *new_sst_id; branch_table_info.sst_size = new_sst_size; + *new_sst_id += 1; - sst_info.sst_id = *new_sst_id + 1; + sst_info.sst_id = *new_sst_id; sst_info.sst_size = old_sst_size; + *new_sst_id += 1; { // related github.com/risingwavelabs/risingwave/pull/17898/ @@ -1364,8 +1366,6 @@ pub fn split_sst( .retain(|table_id| !branch_table_info.table_ids.contains(table_id)); } - *new_sst_id += 1; - branch_table_info } From 4accc4a6c641871d73095443875d0f0b1c573d9e Mon Sep 17 00:00:00 2001 From: "Zhanxiang (Patrick) Huang" Date: Fri, 6 Sep 2024 14:46:54 +0800 Subject: [PATCH 13/32] feat: introduce cluster limit (#18383) --- e2e_test/batch/catalog/pg_settings.slt.part | 1 + proto/meta.proto | 27 ++++ src/common/src/config.rs | 18 +++ src/common/src/session_config/mod.rs | 6 + src/common/src/util/cluster_limit.rs | 134 ++++++++++++++++++ src/common/src/util/mod.rs | 1 + 
src/config/example.toml | 2 + .../catalog/system_catalog/rw_catalog/mod.rs | 1 + .../rw_catalog/rw_worker_actor_count.rs | 31 ++++ src/frontend/src/handler/create_mv.rs | 3 + src/frontend/src/handler/create_sink.rs | 2 + src/frontend/src/handler/create_table.rs | 2 + src/frontend/src/meta_client.rs | 7 + src/frontend/src/session.rs | 44 +++++- src/frontend/src/test_utils.rs | 7 +- src/meta/node/src/lib.rs | 8 ++ src/meta/node/src/server.rs | 6 +- src/meta/service/src/cluster_limit_service.rs | 107 ++++++++++++++ src/meta/service/src/lib.rs | 1 + src/meta/src/manager/env.rs | 28 ++-- src/rpc_client/src/meta_client.rs | 15 +- 21 files changed, 439 insertions(+), 12 deletions(-) create mode 100644 src/common/src/util/cluster_limit.rs create mode 100644 src/frontend/src/catalog/system_catalog/rw_catalog/rw_worker_actor_count.rs create mode 100644 src/meta/service/src/cluster_limit_service.rs diff --git a/e2e_test/batch/catalog/pg_settings.slt.part b/e2e_test/batch/catalog/pg_settings.slt.part index 3482ce4850246..e05d466c3a4d6 100644 --- a/e2e_test/batch/catalog/pg_settings.slt.part +++ b/e2e_test/batch/catalog/pg_settings.slt.part @@ -22,6 +22,7 @@ user backfill_rate_limit user background_ddl user batch_enable_distributed_dml user batch_parallelism +user bypass_cluster_limits user bytea_output user cdc_source_wait_streaming_start_timeout user client_encoding diff --git a/proto/meta.proto b/proto/meta.proto index 8932dcbc9e033..98a7f267c0124 100644 --- a/proto/meta.proto +++ b/proto/meta.proto @@ -791,3 +791,30 @@ message RelationIdInfos { // relation_id -> FragmentIdToActorIdMap map map = 1; } + +message ActorCountPerParallelism { + message WorkerActorCount { + uint64 actor_count = 1; + uint64 parallelism = 2; + } + map worker_id_to_actor_count = 1; + uint64 hard_limit = 2; + uint64 soft_limit = 3; +} + +message ClusterLimit { + oneof limit { + ActorCountPerParallelism actor_count = 1; + // TODO: limit DDL using compaction pending bytes + } +} + +message GetClusterLimitsRequest {} + +message GetClusterLimitsResponse { + repeated ClusterLimit active_limits = 1; +} + +service ClusterLimitService { + rpc GetClusterLimits(GetClusterLimitsRequest) returns (GetClusterLimitsResponse); +} diff --git a/src/common/src/config.rs b/src/common/src/config.rs index 88ea110869b79..ed7ac8619252c 100644 --- a/src/common/src/config.rs +++ b/src/common/src/config.rs @@ -466,6 +466,16 @@ pub struct MetaDeveloperConfig { #[serde(default = "default::developer::max_get_task_probe_times")] pub max_get_task_probe_times: usize, + + /// Max number of actor allowed per parallelism (default = 100). + /// CREATE MV/Table will be noticed when the number of actors exceeds this limit. + #[serde(default = "default::developer::actor_cnt_per_worker_parallelism_soft_limit")] + pub actor_cnt_per_worker_parallelism_soft_limit: usize, + + /// Max number of actor allowed per parallelism (default = 400). + /// CREATE MV/Table will be rejected when the number of actors exceeds this limit. + #[serde(default = "default::developer::actor_cnt_per_worker_parallelism_hard_limit")] + pub actor_cnt_per_worker_parallelism_hard_limit: usize, } /// The section `[server]` in `risingwave.toml`. 
@@ -1859,6 +1869,14 @@ pub mod default {
             5
         }
 
+        pub fn actor_cnt_per_worker_parallelism_soft_limit() -> usize {
+            100
+        }
+
+        pub fn actor_cnt_per_worker_parallelism_hard_limit() -> usize {
+            400
+        }
+
         pub fn memory_controller_threshold_aggressive() -> f64 {
             0.9
         }
diff --git a/src/common/src/session_config/mod.rs b/src/common/src/session_config/mod.rs
index ffdbe6753acb5..163aa18799390 100644
--- a/src/common/src/session_config/mod.rs
+++ b/src/common/src/session_config/mod.rs
@@ -292,6 +292,12 @@ pub struct SessionConfig {
     #[parameter(default = "hex", check_hook = check_bytea_output)]
     bytea_output: String,
+
+    /// Bypass checks on cluster limits
+    ///
+    /// When enabled, `CREATE MATERIALIZED VIEW` will not fail if the cluster limit is hit.
+    #[parameter(default = false)]
+    bypass_cluster_limits: bool,
 }
 
 fn check_timezone(val: &str) -> Result<(), String> {
diff --git a/src/common/src/util/cluster_limit.rs b/src/common/src/util/cluster_limit.rs
new file mode 100644
index 0000000000000..048ea4fdab305
--- /dev/null
+++ b/src/common/src/util/cluster_limit.rs
@@ -0,0 +1,134 @@
+// Copyright 2024 RisingWave Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::fmt::{self, Display, Formatter};
+
+use risingwave_pb::meta::actor_count_per_parallelism::PbWorkerActorCount;
+use risingwave_pb::meta::cluster_limit::PbLimit;
+use risingwave_pb::meta::{PbActorCountPerParallelism, PbClusterLimit};
+pub enum ClusterLimit {
+    ActorCount(ActorCountPerParallelism),
+}
+
+impl From<ClusterLimit> for PbClusterLimit {
+    fn from(limit: ClusterLimit) -> Self {
+        match limit {
+            ClusterLimit::ActorCount(actor_count_per_parallelism) => PbClusterLimit {
+                limit: Some(PbLimit::ActorCount(actor_count_per_parallelism.into())),
+            },
+        }
+    }
+}
+
+impl From<PbClusterLimit> for ClusterLimit {
+    fn from(pb_limit: PbClusterLimit) -> Self {
+        match pb_limit.limit.unwrap() {
+            PbLimit::ActorCount(actor_count_per_parallelism) => {
+                ClusterLimit::ActorCount(actor_count_per_parallelism.into())
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct WorkerActorCount {
+    pub actor_count: usize,
+    pub parallelism: usize,
+}
+
+impl From<WorkerActorCount> for PbWorkerActorCount {
+    fn from(worker_actor_count: WorkerActorCount) -> Self {
+        PbWorkerActorCount {
+            actor_count: worker_actor_count.actor_count as u64,
+            parallelism: worker_actor_count.parallelism as u64,
+        }
+    }
+}
+
+impl From<PbWorkerActorCount> for WorkerActorCount {
+    fn from(pb_worker_actor_count: PbWorkerActorCount) -> Self {
+        WorkerActorCount {
+            actor_count: pb_worker_actor_count.actor_count as usize,
+            parallelism: pb_worker_actor_count.parallelism as usize,
+        }
+    }
+}
+
+pub struct ActorCountPerParallelism {
+    pub worker_id_to_actor_count: HashMap<u32, WorkerActorCount>,
+    pub hard_limit: usize,
+    pub soft_limit: usize,
+}
+
+impl From<ActorCountPerParallelism> for PbActorCountPerParallelism {
+    fn from(actor_count_per_parallelism: ActorCountPerParallelism) -> Self {
+        PbActorCountPerParallelism {
+            worker_id_to_actor_count: actor_count_per_parallelism
+                .worker_id_to_actor_count
+                .into_iter()
+                .map(|(k, v)| (k, v.into()))
+                
.collect(), + hard_limit: actor_count_per_parallelism.hard_limit as u64, + soft_limit: actor_count_per_parallelism.soft_limit as u64, + } + } +} + +impl From for ActorCountPerParallelism { + fn from(pb_actor_count_per_parallelism: PbActorCountPerParallelism) -> Self { + ActorCountPerParallelism { + worker_id_to_actor_count: pb_actor_count_per_parallelism + .worker_id_to_actor_count + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + hard_limit: pb_actor_count_per_parallelism.hard_limit as usize, + soft_limit: pb_actor_count_per_parallelism.soft_limit as usize, + } + } +} + +impl ActorCountPerParallelism { + pub fn exceed_hard_limit(&self) -> bool { + self.worker_id_to_actor_count + .values() + .any(|v| v.actor_count > self.hard_limit.saturating_mul(v.parallelism)) + } + + pub fn exceed_soft_limit(&self) -> bool { + self.worker_id_to_actor_count + .values() + .any(|v| v.actor_count > self.soft_limit.saturating_mul(v.parallelism)) + } + + pub fn exceed_limit(&self) -> bool { + self.exceed_soft_limit() || self.exceed_hard_limit() + } +} + +impl Display for ActorCountPerParallelism { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let worker_id_to_actor_count_str: Vec<_> = self + .worker_id_to_actor_count + .iter() + .map(|(k, v)| format!("{} -> {:?}", k, v)) + .collect(); + write!( + f, + "ActorCountPerParallelism {{ critical limit: {:?}, recommended limit: {:?}. worker_id_to_actor_count: {:?} }}", + self.hard_limit, self.soft_limit, worker_id_to_actor_count_str + ) + } +} diff --git a/src/common/src/util/mod.rs b/src/common/src/util/mod.rs index 20dac5906c91d..bfa15c8327037 100644 --- a/src/common/src/util/mod.rs +++ b/src/common/src/util/mod.rs @@ -42,3 +42,4 @@ pub mod tracing; pub mod value_encoding; pub mod worker_util; pub use tokio_util; +pub mod cluster_limit; diff --git a/src/config/example.toml b/src/config/example.toml index c81b35163eafa..f3c127cdc7825 100644 --- a/src/config/example.toml +++ b/src/config/example.toml @@ -81,6 +81,8 @@ meta_enable_trivial_move = true meta_enable_check_task_level_overlap = false meta_max_trivial_move_task_count_per_loop = 256 meta_max_get_task_probe_times = 5 +meta_actor_cnt_per_worker_parallelism_soft_limit = 100 +meta_actor_cnt_per_worker_parallelism_hard_limit = 400 [batch] enable_barrier_read = false diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs index 879e375e2b762..5e3261c06d186 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs @@ -59,3 +59,4 @@ mod rw_worker_nodes; mod rw_actor_id_to_ddl; mod rw_fragment_id_to_ddl; +mod rw_worker_actor_count; diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_worker_actor_count.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_worker_actor_count.rs new file mode 100644 index 0000000000000..a336f69b2029f --- /dev/null +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_worker_actor_count.rs @@ -0,0 +1,31 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use risingwave_common::types::Fields; +use risingwave_frontend_macro::system_catalog; + +#[system_catalog( + view, + "rw_catalog.rw_worker_actor_count", + "SELECT t2.id as worker_id, parallelism, count(*) as actor_count + FROM rw_actors t1, rw_worker_nodes t2 + where t1.worker_id = t2.id + GROUP BY t2.id, t2.parallelism;" +)] +#[derive(Fields)] +struct RwWorkerActorCount { + worker_id: i32, + parallelism: i32, + actor_count: i64, +} diff --git a/src/frontend/src/handler/create_mv.rs b/src/frontend/src/handler/create_mv.rs index 4399d80811c19..9d48f2a429cca 100644 --- a/src/frontend/src/handler/create_mv.rs +++ b/src/frontend/src/handler/create_mv.rs @@ -205,6 +205,9 @@ pub async fn handle_create_mv_bound( ) -> Result { let session = handler_args.session.clone(); + // Check cluster limits + session.check_cluster_limits().await?; + if let Either::Right(resp) = session.check_relation_name_duplicated( name.clone(), StatementType::CREATE_MATERIALIZED_VIEW, diff --git a/src/frontend/src/handler/create_sink.rs b/src/frontend/src/handler/create_sink.rs index d0bd1d0cc8f2f..bb8d528ab1205 100644 --- a/src/frontend/src/handler/create_sink.rs +++ b/src/frontend/src/handler/create_sink.rs @@ -419,6 +419,8 @@ pub async fn handle_create_sink( ) -> Result { let session = handle_args.session.clone(); + session.check_cluster_limits().await?; + if let Either::Right(resp) = session.check_relation_name_duplicated( stmt.sink_name.clone(), StatementType::CREATE_SINK, diff --git a/src/frontend/src/handler/create_table.rs b/src/frontend/src/handler/create_table.rs index a10453a43ea4e..386d50e791886 100644 --- a/src/frontend/src/handler/create_table.rs +++ b/src/frontend/src/handler/create_table.rs @@ -1235,6 +1235,8 @@ pub async fn handle_create_table( session.notice_to_user("APPEND ONLY TABLE is currently an experimental feature."); } + session.check_cluster_limits().await?; + if let Either::Right(resp) = session.check_relation_name_duplicated( table_name.clone(), StatementType::CREATE_TABLE, diff --git a/src/frontend/src/meta_client.rs b/src/frontend/src/meta_client.rs index 60fa992bdbe2d..020e3380b29b7 100644 --- a/src/frontend/src/meta_client.rs +++ b/src/frontend/src/meta_client.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use anyhow::Context; use risingwave_common::session_config::SessionConfig; use risingwave_common::system_param::reader::SystemParamsReader; +use risingwave_common::util::cluster_limit::ClusterLimit; use risingwave_hummock_sdk::version::{HummockVersion, HummockVersionDelta}; use risingwave_hummock_sdk::HummockVersionId; use risingwave_pb::backup_service::MetaSnapshotMetadata; @@ -136,6 +137,8 @@ pub trait FrontendMetaClient: Send + Sync { ) -> Result>; async fn get_cluster_recovery_status(&self) -> Result; + + async fn get_cluster_limits(&self) -> Result>; } pub struct FrontendMetaClientImpl(pub MetaClient); @@ -345,4 +348,8 @@ impl FrontendMetaClient for FrontendMetaClientImpl { async fn get_cluster_recovery_status(&self) -> Result { self.0.get_cluster_recovery_status().await } + + async fn get_cluster_limits(&self) -> Result> { + self.0.get_cluster_limits().await + } } diff --git a/src/frontend/src/session.rs b/src/frontend/src/session.rs index 16f0c7226be21..a1150798951cb 100644 --- a/src/frontend/src/session.rs +++ b/src/frontend/src/session.rs @@ -59,9 +59,10 @@ use risingwave_common::telemetry::manager::TelemetryManager; use 
risingwave_common::telemetry::telemetry_env_enabled; use risingwave_common::types::DataType; use risingwave_common::util::addr::HostAddr; +use risingwave_common::util::cluster_limit::ActorCountPerParallelism; use risingwave_common::util::iter_util::ZipEqFast; -use risingwave_common::util::resource_util; use risingwave_common::util::runtime::BackgroundShutdownRuntime; +use risingwave_common::util::{cluster_limit, resource_util}; use risingwave_common::{GIT_SHA, RW_VERSION}; use risingwave_common_heap_profiling::HeapProfiler; use risingwave_common_service::{MetricsManager, ObserverManager}; @@ -1194,6 +1195,47 @@ impl SessionImpl { pub fn temporary_source_manager(&self) -> TemporarySourceManager { self.temporary_source_manager.lock().clone() } + + pub async fn check_cluster_limits(&self) -> Result<()> { + if self.config().bypass_cluster_limits() { + return Ok(()); + } + + let gen_message = |violated_limit: &ActorCountPerParallelism, + exceed_hard_limit: bool| + -> String { + let (limit_type, action) = if exceed_hard_limit { + ("critical", "Please scale the cluster before proceeding!") + } else { + ("recommended", "Scaling the cluster is recommended.") + }; + format!( + "\n- {}\n- {}\n- {}\n- {}\n- {}\n{}", + format_args!("Actor count per parallelism exceeds the {} limit.", limit_type), + format_args!("Depending on your workload, this may overload the cluster and cause performance/stability issues. {}", action), + "Contact us via slack or https://risingwave.com/contact-us/ for further enquiry.", + "You can bypass this check via SQL `SET bypass_cluster_limits TO true`.", + "You can check actor count distribution via SQL `SELECT * FROM rw_worker_actor_count`.", + violated_limit, + ) + }; + + let limits = self.env().meta_client().get_cluster_limits().await?; + for limit in limits { + match limit { + cluster_limit::ClusterLimit::ActorCount(l) => { + if l.exceed_hard_limit() { + return Err(RwError::from(ErrorCode::ProtocolError(gen_message( + &l, true, + )))); + } else if l.exceed_soft_limit() { + self.notice_to_user(gen_message(&l, false)); + } + } + } + } + Ok(()) + } } pub static SESSION_MANAGER: std::sync::OnceLock> = diff --git a/src/frontend/src/test_utils.rs b/src/frontend/src/test_utils.rs index ee6ff589e0cdb..10dad2105ed94 100644 --- a/src/frontend/src/test_utils.rs +++ b/src/frontend/src/test_utils.rs @@ -30,6 +30,7 @@ use risingwave_common::catalog::{ }; use risingwave_common::session_config::SessionConfig; use risingwave_common::system_param::reader::SystemParamsReader; +use risingwave_common::util::cluster_limit::ClusterLimit; use risingwave_common::util::column_index_mapping::ColIndexMapping; use risingwave_hummock_sdk::version::{HummockVersion, HummockVersionDelta}; use risingwave_pb::backup_service::MetaSnapshotMetadata; @@ -1065,7 +1066,7 @@ impl FrontendMetaClient for MockFrontendMetaClient { } async fn list_all_nodes(&self) -> RpcResult> { - unimplemented!() + Ok(vec![]) } async fn list_compact_task_progress(&self) -> RpcResult> { @@ -1097,6 +1098,10 @@ impl FrontendMetaClient for MockFrontendMetaClient { ) -> RpcResult> { unimplemented!() } + + async fn get_cluster_limits(&self) -> RpcResult> { + Ok(vec![]) + } } #[cfg(test)] diff --git a/src/meta/node/src/lib.rs b/src/meta/node/src/lib.rs index 049519372c81e..88a76d1a1c706 100644 --- a/src/meta/node/src/lib.rs +++ b/src/meta/node/src/lib.rs @@ -457,6 +457,14 @@ pub fn start( table_info_statistic_history_times: config .storage .table_info_statistic_history_times, + actor_cnt_per_worker_parallelism_hard_limit: config + .meta + 
.developer + .actor_cnt_per_worker_parallelism_hard_limit, + actor_cnt_per_worker_parallelism_soft_limit: config + .meta + .developer + .actor_cnt_per_worker_parallelism_soft_limit, }, config.system.into_init_system_params(), Default::default(), diff --git a/src/meta/node/src/server.rs b/src/meta/node/src/server.rs index 1f0f7f6a3fe8e..87c429ed9ccd3 100644 --- a/src/meta/node/src/server.rs +++ b/src/meta/node/src/server.rs @@ -40,6 +40,7 @@ use risingwave_meta::stream::ScaleController; use risingwave_meta::MetaStoreBackend; use risingwave_meta_service::backup_service::BackupServiceImpl; use risingwave_meta_service::cloud_service::CloudServiceImpl; +use risingwave_meta_service::cluster_limit_service::ClusterLimitServiceImpl; use risingwave_meta_service::cluster_service::ClusterServiceImpl; use risingwave_meta_service::ddl_service::DdlServiceImpl; use risingwave_meta_service::event_log_service::EventLogServiceImpl; @@ -63,6 +64,7 @@ use risingwave_pb::connector_service::sink_coordination_service_server::SinkCoor use risingwave_pb::ddl_service::ddl_service_server::DdlServiceServer; use risingwave_pb::health::health_server::HealthServer; use risingwave_pb::hummock::hummock_manager_service_server::HummockManagerServiceServer; +use risingwave_pb::meta::cluster_limit_service_server::ClusterLimitServiceServer; use risingwave_pb::meta::cluster_service_server::ClusterServiceServer; use risingwave_pb::meta::event_log_service_server::EventLogServiceServer; use risingwave_pb::meta::heartbeat_service_server::HeartbeatServiceServer; @@ -657,6 +659,7 @@ pub async fn start_service_as_election_leader( ServingServiceImpl::new(serving_vnode_mapping.clone(), metadata_manager.clone()); let cloud_srv = CloudServiceImpl::new(metadata_manager.clone(), aws_cli); let event_log_srv = EventLogServiceImpl::new(env.event_log_manager_ref()); + let cluster_limit_srv = ClusterLimitServiceImpl::new(env.clone(), metadata_manager.clone()); if let Some(prometheus_addr) = address_info.prometheus_addr { MetricsManager::boot_metrics_service(prometheus_addr.to_string()) @@ -795,7 +798,8 @@ pub async fn start_service_as_election_leader( .add_service(ServingServiceServer::new(serving_srv)) .add_service(CloudServiceServer::new(cloud_srv)) .add_service(SinkCoordinationServiceServer::new(sink_coordination_srv)) - .add_service(EventLogServiceServer::new(event_log_srv)); + .add_service(EventLogServiceServer::new(event_log_srv)) + .add_service(ClusterLimitServiceServer::new(cluster_limit_srv)); #[cfg(not(madsim))] // `otlp-embedded` does not use madsim-patched tonic let server_builder = server_builder.add_service(TraceServiceServer::new(trace_srv)); diff --git a/src/meta/service/src/cluster_limit_service.rs b/src/meta/service/src/cluster_limit_service.rs new file mode 100644 index 0000000000000..df19b24b234e6 --- /dev/null +++ b/src/meta/service/src/cluster_limit_service.rs @@ -0,0 +1,107 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use risingwave_common::util::cluster_limit::{ + ActorCountPerParallelism, ClusterLimit, WorkerActorCount, +}; +use risingwave_meta::manager::{MetaSrvEnv, MetadataManager, WorkerId}; +use risingwave_meta::MetaResult; +use risingwave_pb::common::worker_node::State; +use risingwave_pb::common::WorkerType; +use risingwave_pb::meta::cluster_limit_service_server::ClusterLimitService; +use risingwave_pb::meta::{GetClusterLimitsRequest, GetClusterLimitsResponse}; +use tonic::{Request, Response, Status}; + +#[derive(Clone)] +pub struct ClusterLimitServiceImpl { + env: MetaSrvEnv, + metadata_manager: MetadataManager, +} + +impl ClusterLimitServiceImpl { + pub fn new(env: MetaSrvEnv, metadata_manager: MetadataManager) -> Self { + ClusterLimitServiceImpl { + env, + metadata_manager, + } + } + + async fn get_active_actor_limit(&self) -> MetaResult> { + let (soft_limit, hard_limit) = ( + self.env.opts.actor_cnt_per_worker_parallelism_soft_limit, + self.env.opts.actor_cnt_per_worker_parallelism_hard_limit, + ); + + let running_worker_parallelism: HashMap = self + .metadata_manager + .list_worker_node(Some(WorkerType::ComputeNode), Some(State::Running)) + .await? + .into_iter() + .map(|e| (e.id, e.parallelism())) + .collect(); + let worker_actor_count: HashMap = self + .metadata_manager + .worker_actor_count() + .await? + .into_iter() + .filter_map(|(worker_id, actor_count)| { + running_worker_parallelism + .get(&worker_id) + .map(|parallelism| { + ( + worker_id, + WorkerActorCount { + actor_count, + parallelism: *parallelism, + }, + ) + }) + }) + .collect(); + + let limit = ActorCountPerParallelism { + worker_id_to_actor_count: worker_actor_count, + hard_limit, + soft_limit, + }; + + if limit.exceed_limit() { + Ok(Some(ClusterLimit::ActorCount(limit))) + } else { + Ok(None) + } + } +} + +#[async_trait::async_trait] +impl ClusterLimitService for ClusterLimitServiceImpl { + #[cfg_attr(coverage, coverage(off))] + async fn get_cluster_limits( + &self, + _request: Request, + ) -> Result, Status> { + // TODO: support more limits + match self.get_active_actor_limit().await { + Ok(Some(limit)) => Ok(Response::new(GetClusterLimitsResponse { + active_limits: vec![limit.into()], + })), + Ok(None) => Ok(Response::new(GetClusterLimitsResponse { + active_limits: vec![], + })), + Err(e) => Err(e.into()), + } + } +} diff --git a/src/meta/service/src/lib.rs b/src/meta/service/src/lib.rs index 9ab248802772e..e2f57d4a26bbb 100644 --- a/src/meta/service/src/lib.rs +++ b/src/meta/service/src/lib.rs @@ -21,6 +21,7 @@ use risingwave_meta::*; pub mod backup_service; pub mod cloud_service; +pub mod cluster_limit_service; pub mod cluster_service; pub mod ddl_service; pub mod event_log_service; diff --git a/src/meta/src/manager/env.rs b/src/meta/src/manager/env.rs index 22f88bd9c0a75..ed18be6b0f483 100644 --- a/src/meta/src/manager/env.rs +++ b/src/meta/src/manager/env.rs @@ -294,6 +294,10 @@ pub struct MetaOpts { pub temp_secret_file_dir: String, pub table_info_statistic_history_times: usize, + + // Cluster limits + pub actor_cnt_per_worker_parallelism_hard_limit: usize, + pub actor_cnt_per_worker_parallelism_soft_limit: usize, } impl MetaOpts { @@ -358,6 +362,8 @@ impl MetaOpts { secret_store_private_key: Some("0123456789abcdef".as_bytes().to_vec()), temp_secret_file_dir: "./secrets".to_string(), table_info_statistic_history_times: 240, + actor_cnt_per_worker_parallelism_hard_limit: usize::MAX, + actor_cnt_per_worker_parallelism_soft_limit: usize::MAX, } } } @@ -408,9 +414,11 @@ impl 
MetaSrvEnv { (ClusterId::new(), true) }; - // For new clusters, the name of the object store needs to be prefixed according to the object id. - // For old clusters, the prefix is ​​not divided for the sake of compatibility. - + // For new clusters: + // - the name of the object store needs to be prefixed according to the object id. + // + // For old clusters + // - the prefix is ​​not divided for the sake of compatibility. init_system_params.use_new_object_prefix_strategy = Some(cluster_first_launch); let system_params_manager = Arc::new( SystemParamsManager::new( @@ -455,7 +463,7 @@ impl MetaSrvEnv { } } MetaStoreImpl::Sql(sql_meta_store) => { - let is_sql_backend_cluster_first_launch = + let cluster_first_launch = is_first_launch_for_sql_backend_cluster(sql_meta_store).await?; // Try to upgrade if any new model changes are added. Migrator::up(&sql_meta_store.conn, None) @@ -469,10 +477,14 @@ impl MetaSrvEnv { .await? .map(|c| c.cluster_id.to_string().into()) .unwrap(); - init_system_params.use_new_object_prefix_strategy = - Some(is_sql_backend_cluster_first_launch); - // For new clusters, the name of the object store needs to be prefixed according to the object id. - // For old clusters, the prefix is ​​not divided for the sake of compatibility. + + // For new clusters: + // - the name of the object store needs to be prefixed according to the object id. + // + // For old clusters + // - the prefix is ​​not divided for the sake of compatibility. + init_system_params.use_new_object_prefix_strategy = Some(cluster_first_launch); + let system_param_controller = Arc::new( SystemParamsController::new( sql_meta_store.clone(), diff --git a/src/rpc_client/src/meta_client.rs b/src/rpc_client/src/meta_client.rs index b4e06d8690b72..db66e60c91eeb 100644 --- a/src/rpc_client/src/meta_client.rs +++ b/src/rpc_client/src/meta_client.rs @@ -22,6 +22,7 @@ use std::time::{Duration, SystemTime}; use anyhow::{anyhow, Context}; use async_trait::async_trait; +use cluster_limit_service_client::ClusterLimitServiceClient; use either::Either; use futures::stream::BoxStream; use lru::LruCache; @@ -1436,6 +1437,14 @@ impl MetaClient { let resp = self.inner.get_version_by_epoch(req).await?; Ok(resp.version.unwrap()) } + + pub async fn get_cluster_limits( + &self, + ) -> Result> { + let req = GetClusterLimitsRequest {}; + let resp = self.inner.get_cluster_limits(req).await?; + Ok(resp.active_limits.into_iter().map(|l| l.into()).collect()) + } } #[async_trait] @@ -1636,6 +1645,7 @@ struct GrpcMetaClientCore { cloud_client: CloudServiceClient, sink_coordinate_client: SinkCoordinationRpcClient, event_log_client: EventLogServiceClient, + cluster_limit_client: ClusterLimitServiceClient, } impl GrpcMetaClientCore { @@ -1662,7 +1672,8 @@ impl GrpcMetaClientCore { let serving_client = ServingServiceClient::new(channel.clone()); let cloud_client = CloudServiceClient::new(channel.clone()); let sink_coordinate_client = SinkCoordinationServiceClient::new(channel.clone()); - let event_log_client = EventLogServiceClient::new(channel); + let event_log_client = EventLogServiceClient::new(channel.clone()); + let cluster_limit_client = ClusterLimitServiceClient::new(channel); GrpcMetaClientCore { cluster_client, @@ -1682,6 +1693,7 @@ impl GrpcMetaClientCore { cloud_client, sink_coordinate_client, event_log_client, + cluster_limit_client, } } } @@ -2126,6 +2138,7 @@ macro_rules! 
for_all_meta_rpc { ,{ cloud_client, rw_cloud_validate_source, RwCloudValidateSourceRequest, RwCloudValidateSourceResponse } ,{ event_log_client, list_event_log, ListEventLogRequest, ListEventLogResponse } ,{ event_log_client, add_event_log, AddEventLogRequest, AddEventLogResponse } + ,{ cluster_limit_client, get_cluster_limits, GetClusterLimitsRequest, GetClusterLimitsResponse } } }; } From 7d3f2856f2466681346c2edb6dfe890e0c41c5e2 Mon Sep 17 00:00:00 2001 From: William Wen <44139337+wenym1@users.noreply.github.com> Date: Fri, 6 Sep 2024 15:31:07 +0800 Subject: [PATCH 14/32] refactor: remove drop_actors rpc and simplify barrier worker state (#18354) --- proto/stream_service.proto | 11 - src/compute/src/rpc/service/stream_service.rs | 14 - src/meta/node/src/server.rs | 6 - src/meta/src/barrier/command.rs | 32 --- src/meta/src/barrier/mod.rs | 5 - src/meta/src/barrier/recovery.rs | 8 + src/meta/src/barrier/rpc.rs | 99 +------ src/meta/src/stream/scale.rs | 6 +- src/meta/src/stream/stream_manager.rs | 17 +- src/rpc_client/src/stream_client.rs | 3 +- src/stream/src/executor/dispatch.rs | 47 ++-- src/stream/src/executor/integration_tests.rs | 20 +- src/stream/src/executor/merge.rs | 54 ++-- src/stream/src/executor/receiver.rs | 53 ++-- src/stream/src/task/barrier_manager.rs | 98 ++----- .../src/task/barrier_manager/managed_state.rs | 242 ++++++++++++------ src/stream/src/task/barrier_manager/tests.rs | 33 ++- src/stream/src/task/mod.rs | 4 +- src/stream/src/task/stream_manager.rs | 140 ++-------- 19 files changed, 338 insertions(+), 554 deletions(-) diff --git a/proto/stream_service.proto b/proto/stream_service.proto index 54ffc3d5ff79c..ce727ba9cc55c 100644 --- a/proto/stream_service.proto +++ b/proto/stream_service.proto @@ -17,16 +17,6 @@ message BuildActorInfo { map related_subscriptions = 2; } -message DropActorsRequest { - string request_id = 1; - repeated uint32 actor_ids = 2; -} - -message DropActorsResponse { - string request_id = 1; - common.Status status = 2; -} - message InjectBarrierRequest { string request_id = 1; stream_plan.Barrier barrier = 2; @@ -109,7 +99,6 @@ message StreamingControlStreamResponse { } service StreamService { - rpc DropActors(DropActorsRequest) returns (DropActorsResponse); rpc WaitEpochCommit(WaitEpochCommitRequest) returns (WaitEpochCommitResponse); rpc StreamingControlStream(stream StreamingControlStreamRequest) returns (stream StreamingControlStreamResponse); } diff --git a/src/compute/src/rpc/service/stream_service.rs b/src/compute/src/rpc/service/stream_service.rs index eb055a174b3ea..6253cfe74c730 100644 --- a/src/compute/src/rpc/service/stream_service.rs +++ b/src/compute/src/rpc/service/stream_service.rs @@ -40,20 +40,6 @@ impl StreamService for StreamServiceImpl { type StreamingControlStreamStream = impl Stream>; - #[cfg_attr(coverage, coverage(off))] - async fn drop_actors( - &self, - request: Request, - ) -> std::result::Result, Status> { - let req = request.into_inner(); - let actors = req.actor_ids; - self.mgr.drop_actors(actors).await?; - Ok(Response::new(DropActorsResponse { - request_id: req.request_id, - status: None, - })) - } - #[cfg_attr(coverage, coverage(off))] async fn wait_epoch_commit( &self, diff --git a/src/meta/node/src/server.rs b/src/meta/node/src/server.rs index 87c429ed9ccd3..11b22014f9f98 100644 --- a/src/meta/node/src/server.rs +++ b/src/meta/node/src/server.rs @@ -27,7 +27,6 @@ use risingwave_common::telemetry::manager::TelemetryManager; use risingwave_common::telemetry::{report_scarf_enabled, report_to_scarf, 
telemetry_env_enabled}; use risingwave_common::util::tokio_util::sync::CancellationToken; use risingwave_common_service::{MetricsManager, TracingExtractLayer}; -use risingwave_meta::barrier::StreamRpcManager; use risingwave_meta::controller::catalog::CatalogController; use risingwave_meta::controller::cluster::ClusterController; use risingwave_meta::manager::{ @@ -552,12 +551,9 @@ pub async fn start_service_as_election_leader( // TODO(shutdown): remove this as there's no need to gracefully shutdown some of these sub-tasks. let mut sub_tasks = vec![shutdown_handle]; - let stream_rpc_manager = StreamRpcManager::new(env.clone()); - let scale_controller = Arc::new(ScaleController::new( &metadata_manager, source_manager.clone(), - stream_rpc_manager.clone(), env.clone(), )); @@ -569,7 +565,6 @@ pub async fn start_service_as_election_leader( source_manager.clone(), sink_manager.clone(), meta_metrics.clone(), - stream_rpc_manager.clone(), scale_controller.clone(), ) .await; @@ -587,7 +582,6 @@ pub async fn start_service_as_election_leader( metadata_manager.clone(), barrier_scheduler.clone(), source_manager.clone(), - stream_rpc_manager, scale_controller.clone(), ) .unwrap(), diff --git a/src/meta/src/barrier/command.rs b/src/meta/src/barrier/command.rs index a1c82ccd8db83..577a0bef25360 100644 --- a/src/meta/src/barrier/command.rs +++ b/src/meta/src/barrier/command.rs @@ -16,7 +16,6 @@ use std::collections::{HashMap, HashSet}; use std::fmt::Formatter; use futures::future::try_join_all; -use itertools::Itertools; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::TableId; use risingwave_common::hash::ActorMapping; @@ -961,19 +960,6 @@ impl Command { } impl CommandContext { - /// Clean up actors in CNs if needed, used by drop, cancel and reschedule commands. - async fn clean_up(&self, actors: Vec) -> MetaResult<()> { - self.barrier_manager_context - .stream_rpc_manager - .drop_actors( - &self.node_map, - self.node_map - .keys() - .map(|worker_id| (*worker_id, actors.clone())), - ) - .await - } - pub async fn wait_epoch_commit(&self, epoch: HummockEpoch) -> MetaResult<()> { let futures = self.node_map.values().map(|worker_node| async { let client = self @@ -1023,13 +1009,9 @@ impl CommandContext { } Command::DropStreamingJobs { - actors, unregistered_state_table_ids, .. } => { - // Tell compute nodes to drop actors. - self.clean_up(actors.clone()).await?; - self.barrier_manager_context .hummock_manager .unregister_table_ids(unregistered_state_table_ids.iter().cloned()) @@ -1038,7 +1020,6 @@ impl CommandContext { Command::CancelStreamingJob(table_fragments) => { tracing::debug!(id = ?table_fragments.table_id(), "cancelling stream job"); - self.clean_up(table_fragments.actor_ids()).await?; // NOTE(kwannoel): At this point, meta has already registered the table ids. // We should unregister them. @@ -1138,8 +1119,6 @@ impl CommandContext { .. }) = job_type { - self.clean_up(old_table_fragments.actor_ids()).await?; - // Drop fragment info in meta store. mgr.fragment_manager .post_replace_table( @@ -1166,13 +1145,9 @@ impl CommandContext { new_table_fragments, dispatchers, init_split_assignment, - old_table_fragments, .. }) = job_type { - // Tell compute nodes to drop actors. - self.clean_up(old_table_fragments.actor_ids()).await?; - mgr.catalog_controller .post_collect_table_fragments( new_table_fragments.table_id().table_id as _, @@ -1203,11 +1178,6 @@ impl CommandContext { table_parallelism, .. 
} => { - let removed_actors = reschedules - .values() - .flat_map(|reschedule| reschedule.removed_actors.clone().into_iter()) - .collect_vec(); - self.clean_up(removed_actors).await?; self.barrier_manager_context .scale_controller .post_apply_reschedule(reschedules, table_parallelism) @@ -1222,8 +1192,6 @@ impl CommandContext { init_split_assignment, .. }) => { - self.clean_up(old_table_fragments.actor_ids()).await?; - match &self.barrier_manager_context.metadata_manager { MetadataManager::V1(mgr) => { // Drop fragment info in meta store. diff --git a/src/meta/src/barrier/mod.rs b/src/meta/src/barrier/mod.rs index daa82306bff6d..e4b9cdb8b3a97 100644 --- a/src/meta/src/barrier/mod.rs +++ b/src/meta/src/barrier/mod.rs @@ -86,7 +86,6 @@ pub use self::command::{ Reschedule, SnapshotBackfillInfo, }; pub use self::info::InflightSubscriptionInfo; -pub use self::rpc::StreamRpcManager; pub use self::schedule::BarrierScheduler; pub use self::trace::TracedEpoch; @@ -172,8 +171,6 @@ pub struct GlobalBarrierManagerContext { pub(super) metrics: Arc, - stream_rpc_manager: StreamRpcManager, - env: MetaSrvEnv, } @@ -596,7 +593,6 @@ impl GlobalBarrierManager { source_manager: SourceManagerRef, sink_manager: SinkCoordinatorManager, metrics: Arc, - stream_rpc_manager: StreamRpcManager, scale_controller: ScaleControllerRef, ) -> Self { let enable_recovery = env.opts.enable_recovery; @@ -624,7 +620,6 @@ impl GlobalBarrierManager { scale_controller, sink_manager, metrics, - stream_rpc_manager, env: env.clone(), }; diff --git a/src/meta/src/barrier/recovery.rs b/src/meta/src/barrier/recovery.rs index 25fe1fd2ceff7..63cd4c16d9aaf 100644 --- a/src/meta/src/barrier/recovery.rs +++ b/src/meta/src/barrier/recovery.rs @@ -1121,6 +1121,14 @@ impl GlobalBarrierManagerContext { return Err(anyhow!("actors dropped during update").into()); } + { + for (node_id, actors) in &info.actor_map { + if !actors.is_empty() && !all_node_actors.contains_key(node_id) { + return Err(anyhow!("streaming job dropped during update").into()); + } + } + } + Ok(all_node_actors) } } diff --git a/src/meta/src/barrier/rpc.rs b/src/meta/src/barrier/rpc.rs index 14ee8b0c15f7b..1e7d9b5dfa759 100644 --- a/src/meta/src/barrier/rpc.rs +++ b/src/meta/src/barrier/rpc.rs @@ -21,7 +21,7 @@ use anyhow::anyhow; use fail::fail_point; use futures::future::try_join_all; use futures::stream::{BoxStream, FuturesUnordered}; -use futures::{pin_mut, FutureExt, StreamExt}; +use futures::{FutureExt, StreamExt}; use itertools::Itertools; use risingwave_common::catalog::TableId; use risingwave_common::hash::ActorId; @@ -34,11 +34,9 @@ use risingwave_pb::stream_service::build_actor_info::SubscriptionIds; use risingwave_pb::stream_service::streaming_control_stream_request::RemovePartialGraphRequest; use risingwave_pb::stream_service::{ streaming_control_stream_request, streaming_control_stream_response, BarrierCompleteResponse, - BuildActorInfo, DropActorsRequest, InjectBarrierRequest, StreamingControlStreamRequest, + BuildActorInfo, InjectBarrierRequest, StreamingControlStreamRequest, StreamingControlStreamResponse, }; -use risingwave_rpc_client::error::RpcError; -use risingwave_rpc_client::StreamClient; use rw_futures_util::pending_on_none; use thiserror_ext::AsReport; use tokio::sync::mpsc::UnboundedSender; @@ -50,7 +48,7 @@ use uuid::Uuid; use super::command::CommandContext; use super::{BarrierKind, GlobalBarrierManagerContext, TracedEpoch}; use crate::barrier::info::InflightGraphInfo; -use crate::manager::{MetaSrvEnv, WorkerId}; +use crate::manager::WorkerId; use 
crate::{MetaError, MetaResult}; const COLLECT_ERROR_TIMEOUT: Duration = Duration::from_secs(3); @@ -393,7 +391,7 @@ impl ControlStreamManager { request: Some( streaming_control_stream_request::Request::InjectBarrier( InjectBarrierRequest { - request_id: StreamRpcManager::new_request_id(), + request_id: Uuid::new_v4().to_string(), barrier: Some(barrier), actor_ids_to_collect, table_ids_to_sync, @@ -512,95 +510,6 @@ impl GlobalBarrierManagerContext { } } -#[derive(Clone)] -pub struct StreamRpcManager { - env: MetaSrvEnv, -} - -impl StreamRpcManager { - pub fn new(env: MetaSrvEnv) -> Self { - Self { env } - } - - async fn make_request> + 'static>( - &self, - request: impl Iterator, - f: impl Fn(StreamClient, REQ) -> Fut, - ) -> MetaResult> { - let pool = self.env.stream_client_pool(); - let f = &f; - let iters = request.map(|(node, input)| async move { - let client = pool.get(node).await.map_err(|e| (node.id, e))?; - f(client, input).await.map_err(|e| (node.id, e)) - }); - let result = try_join_all_with_error_timeout(iters, COLLECT_ERROR_TIMEOUT).await; - result.map_err(|results_err| merge_node_rpc_errors("merged RPC Error", results_err)) - } - - fn new_request_id() -> String { - Uuid::new_v4().to_string() - } - - pub async fn drop_actors( - &self, - node_map: &HashMap, - node_actors: impl Iterator)>, - ) -> MetaResult<()> { - self.make_request( - node_actors - .map(|(worker_id, actor_ids)| (node_map.get(&worker_id).unwrap(), actor_ids)), - |client, actor_ids| async move { - client - .drop_actors(DropActorsRequest { - request_id: Self::new_request_id(), - actor_ids, - }) - .await - }, - ) - .await?; - Ok(()) - } -} - -/// This function is similar to `try_join_all`, but it attempts to collect as many error as possible within `error_timeout`. -async fn try_join_all_with_error_timeout( - iters: I, - error_timeout: Duration, -) -> Result, Vec> -where - I: IntoIterator, - F: Future>, -{ - let stream = FuturesUnordered::from_iter(iters); - pin_mut!(stream); - let mut results_ok = vec![]; - let mut results_err = vec![]; - while let Some(result) = stream.next().await { - match result { - Ok(rsp) => { - results_ok.push(rsp); - } - Err(err) => { - results_err.push(err); - break; - } - } - } - if results_err.is_empty() { - return Ok(results_ok); - } - let _ = timeout(error_timeout, async { - while let Some(result) = stream.next().await { - if let Err(err) = result { - results_err.push(err); - } - } - }) - .await; - Err(results_err) -} - pub(super) fn merge_node_rpc_errors( message: &str, errors: impl IntoIterator, diff --git a/src/meta/src/stream/scale.rs b/src/meta/src/stream/scale.rs index 8e9b5be690439..5fc916ceaaa48 100644 --- a/src/meta/src/stream/scale.rs +++ b/src/meta/src/stream/scale.rs @@ -49,7 +49,7 @@ use tokio::sync::{oneshot, RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::task::JoinHandle; use tokio::time::{Instant, MissedTickBehavior}; -use crate::barrier::{Command, Reschedule, StreamRpcManager}; +use crate::barrier::{Command, Reschedule}; use crate::controller::scale::RescheduleWorkingSet; use crate::manager::{ IdCategory, IdGenManagerImpl, LocalNotification, MetaSrvEnv, MetadataManager, @@ -437,8 +437,6 @@ pub struct ScaleController { pub source_manager: SourceManagerRef, - pub stream_rpc_manager: StreamRpcManager, - pub env: MetaSrvEnv, /// We will acquire lock during DDL to prevent scaling operations on jobs that are in the creating state. 
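
For context on the helper removed above as part of dropping the `drop_actors` RPC: `try_join_all_with_error_timeout` behaves like `try_join_all`, except that after the first failure it keeps draining the remaining futures for a bounded time so that errors from several nodes can be merged into a single report. Below is a self-contained sketch of that behavior (not part of the patch); the function name, error strings, and timings are illustrative only.

    use std::time::Duration;

    use futures::future::FutureExt;
    use futures::stream::{FuturesUnordered, StreamExt};
    use tokio::time::{sleep, timeout};

    async fn collect_until_timeout<F, T, E>(
        futures: impl IntoIterator<Item = F>,
        error_timeout: Duration,
    ) -> Result<Vec<T>, Vec<E>>
    where
        F: std::future::Future<Output = Result<T, E>>,
    {
        let mut stream: FuturesUnordered<F> = futures.into_iter().collect();
        let (mut oks, mut errs) = (Vec::new(), Vec::new());
        // Phase 1: behave like `try_join_all` until the first error shows up.
        while let Some(result) = stream.next().await {
            match result {
                Ok(v) => oks.push(v),
                Err(e) => {
                    errs.push(e);
                    break;
                }
            }
        }
        if errs.is_empty() {
            return Ok(oks);
        }
        // Phase 2: best effort, keep collecting further errors for at most `error_timeout`.
        let _ = timeout(error_timeout, async {
            while let Some(result) = stream.next().await {
                if let Err(e) = result {
                    errs.push(e);
                }
            }
        })
        .await;
        Err(errs)
    }

    #[tokio::main]
    async fn main() {
        let tasks = vec![
            async { Err::<u32, _>("node 1 failed") }.boxed(),
            async {
                sleep(Duration::from_millis(10)).await;
                Err::<u32, _>("node 2 failed")
            }
            .boxed(),
        ];
        // Both failures end up in one merged report, since the second error
        // arrives well within the one-second timeout.
        let errs = collect_until_timeout(tasks, Duration::from_secs(1))
            .await
            .unwrap_err();
        assert_eq!(errs.len(), 2);
    }
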
@@ -450,11 +448,9 @@ impl ScaleController { pub fn new( metadata_manager: &MetadataManager, source_manager: SourceManagerRef, - stream_rpc_manager: StreamRpcManager, env: MetaSrvEnv, ) -> Self { Self { - stream_rpc_manager, metadata_manager: metadata_manager.clone(), source_manager, env, diff --git a/src/meta/src/stream/stream_manager.rs b/src/meta/src/stream/stream_manager.rs index d8b1dc131fec1..5dc174106197c 100644 --- a/src/meta/src/stream/stream_manager.rs +++ b/src/meta/src/stream/stream_manager.rs @@ -31,7 +31,7 @@ use tracing::Instrument; use super::{Locations, RescheduleOptions, ScaleControllerRef, TableResizePolicy}; use crate::barrier::{ BarrierScheduler, Command, CreateStreamingJobCommandInfo, CreateStreamingJobType, - ReplaceTablePlan, SnapshotBackfillInfo, StreamRpcManager, + ReplaceTablePlan, SnapshotBackfillInfo, }; use crate::manager::{DdlType, MetaSrvEnv, MetadataManager, NotificationVersion, StreamingJob}; use crate::model::{ActorId, FragmentId, MetadataModel, TableFragments, TableParallelism}; @@ -203,8 +203,6 @@ pub struct GlobalStreamManager { creating_job_info: CreatingStreamingJobInfoRef, pub scale_controller: ScaleControllerRef, - - pub stream_rpc_manager: StreamRpcManager, } impl GlobalStreamManager { @@ -213,7 +211,6 @@ impl GlobalStreamManager { metadata_manager: MetadataManager, barrier_scheduler: BarrierScheduler, source_manager: SourceManagerRef, - stream_rpc_manager: StreamRpcManager, scale_controller: ScaleControllerRef, ) -> MetaResult { Ok(Self { @@ -223,7 +220,6 @@ impl GlobalStreamManager { source_manager, creating_job_info: Arc::new(CreatingStreamingJobInfo::default()), scale_controller, - stream_rpc_manager, }) } @@ -815,13 +811,6 @@ mod tests { type StreamingControlStreamStream = impl Stream>; - async fn drop_actors( - &self, - _request: Request, - ) -> std::result::Result, Status> { - Ok(Response::new(DropActorsResponse::default())) - } - async fn streaming_control_stream( &self, request: Request>, @@ -988,11 +977,9 @@ mod tests { let (sink_manager, _) = SinkCoordinatorManager::start_worker(); - let stream_rpc_manager = StreamRpcManager::new(env.clone()); let scale_controller = Arc::new(ScaleController::new( &metadata_manager, source_manager.clone(), - stream_rpc_manager.clone(), env.clone(), )); @@ -1004,7 +991,6 @@ mod tests { source_manager.clone(), sink_manager, meta_metrics.clone(), - stream_rpc_manager.clone(), scale_controller.clone(), ) .await; @@ -1014,7 +1000,6 @@ mod tests { metadata_manager, barrier_scheduler.clone(), source_manager.clone(), - stream_rpc_manager, scale_controller.clone(), )?; diff --git a/src/rpc_client/src/stream_client.rs b/src/rpc_client/src/stream_client.rs index 920b6f0777f37..40a6d48dacb37 100644 --- a/src/rpc_client/src/stream_client.rs +++ b/src/rpc_client/src/stream_client.rs @@ -70,8 +70,7 @@ pub type StreamClientPoolRef = Arc; macro_rules! for_all_stream_rpc { ($macro:ident) => { $macro! { - { 0, drop_actors, DropActorsRequest, DropActorsResponse } - ,{ 0, wait_epoch_commit, WaitEpochCommitRequest, WaitEpochCommitResponse } + { 0, wait_epoch_commit, WaitEpochCommitRequest, WaitEpochCommitResponse } } }; } diff --git a/src/stream/src/executor/dispatch.rs b/src/stream/src/executor/dispatch.rs index 4a43ff618ebf7..0fc9da0e5ab23 100644 --- a/src/stream/src/executor/dispatch.rs +++ b/src/stream/src/executor/dispatch.rs @@ -1226,6 +1226,32 @@ mod tests { ) .unwrap(); + let dispatcher_updates = maplit::hashmap! 
{ + actor_id => vec![PbDispatcherUpdate { + actor_id, + dispatcher_id: broadcast_dispatcher_id, + added_downstream_actor_id: vec![new], + removed_downstream_actor_id: vec![old], + hash_mapping: Default::default(), + }] + }; + let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( + UpdateMutation { + dispatchers: dispatcher_updates, + merges: Default::default(), + vnode_bitmaps: Default::default(), + dropped_actors: Default::default(), + actor_splits: Default::default(), + actor_new_dispatchers: Default::default(), + }, + )); + barrier_test_env.inject_barrier(&b1, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let executor = Box::new(DispatchExecutor::new( input, vec![broadcast_dispatcher, simple_dispatcher], @@ -1254,27 +1280,6 @@ mod tests { .await .unwrap(); - // 4. Send a configuration change barrier for broadcast dispatcher. - let dispatcher_updates = maplit::hashmap! { - actor_id => vec![PbDispatcherUpdate { - actor_id, - dispatcher_id: broadcast_dispatcher_id, - added_downstream_actor_id: vec![new], - removed_downstream_actor_id: vec![old], - hash_mapping: Default::default(), - }] - }; - let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( - UpdateMutation { - dispatchers: dispatcher_updates, - merges: Default::default(), - vnode_bitmaps: Default::default(), - dropped_actors: Default::default(), - actor_splits: Default::default(), - actor_new_dispatchers: Default::default(), - }, - )); - barrier_test_env.inject_barrier(&b1, [actor_id]); tx.send(Message::Barrier(b1.clone().into_dispatcher())) .await .unwrap(); diff --git a/src/stream/src/executor/integration_tests.rs b/src/stream/src/executor/integration_tests.rs index d65abc5a5ce53..13e9a67d1c525 100644 --- a/src/stream/src/executor/integration_tests.rs +++ b/src/stream/src/executor/integration_tests.rs @@ -14,6 +14,8 @@ use std::sync::Mutex; +use futures::future::BoxFuture; +use futures::FutureExt; use futures_async_stream::try_stream; use multimap::MultiMap; use risingwave_common::array::*; @@ -100,7 +102,7 @@ async fn test_merger_sum_aggr() { }; // join handles of all actors - let mut handles = vec![]; + let mut actor_futures: Vec> = vec![]; // input and output channels of the local aggregation actors let mut inputs = vec![]; @@ -113,7 +115,7 @@ async fn test_merger_sum_aggr() { let (tx, rx) = channel_for_test(); let (actor, channel) = make_actor(rx); outputs.push(channel); - handles.push(tokio::spawn(actor.run())); + actor_futures.push(actor.run().boxed()); inputs.push(Box::new(LocalOutput::new(233, tx)) as BoxedOutput); } @@ -154,7 +156,7 @@ async fn test_merger_sum_aggr() { .local_barrier_manager .clone(), ); - handles.push(tokio::spawn(actor.run())); + actor_futures.push(actor.run().boxed()); let actor_ctx = ActorContext::for_test(gen_next_actor_id()); @@ -225,11 +227,21 @@ async fn test_merger_sum_aggr() { .local_barrier_manager .clone(), ); - handles.push(tokio::spawn(actor.run())); + actor_futures.push(actor.run().boxed()); let mut epoch = test_epoch(1); let b1 = Barrier::new_test_barrier(epoch); barrier_test_env.inject_barrier(&b1, actors.clone()); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let handles = actor_futures + .into_iter() + .map(|actor_future| tokio::spawn(actor_future)) + .collect_vec(); + input .send(Message::Barrier(b1.into_dispatcher())) .await diff --git a/src/stream/src/executor/merge.rs b/src/stream/src/executor/merge.rs index 
393b800895151..d45d75604fa57 100644 --- a/src/stream/src/executor/merge.rs +++ b/src/stream/src/executor/merge.rs @@ -531,6 +531,11 @@ mod tests { let b2 = Barrier::with_prev_epoch_for_test(test_epoch(1000), *prev_epoch) .with_mutation(Mutation::Stop(HashSet::default())); barrier_test_env.inject_barrier(&b2, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; for (tx_id, tx) in txs.into_iter().enumerate() { let epochs = epochs.clone(); @@ -634,6 +639,33 @@ mod tests { .try_collect() .unwrap(); + let merge_updates = maplit::hashmap! { + (actor_id, upstream_fragment_id) => MergeUpdate { + actor_id, + upstream_fragment_id, + new_upstream_fragment_id: None, + added_upstream_actor_id: vec![new], + removed_upstream_actor_id: vec![old], + } + }; + + let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( + UpdateMutation { + dispatchers: Default::default(), + merges: merge_updates, + vnode_bitmaps: Default::default(), + dropped_actors: Default::default(), + actor_splits: Default::default(), + actor_new_dispatchers: Default::default(), + }, + )); + barrier_test_env.inject_barrier(&b1, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let mut merge = MergeExecutor::new( ActorContext::for_test(actor_id), fragment_id, @@ -682,28 +714,6 @@ mod tests { recv!().unwrap().as_chunk().unwrap(); assert_recv_pending!(); - // 4. Send a configuration change barrier. - let merge_updates = maplit::hashmap! { - (actor_id, upstream_fragment_id) => MergeUpdate { - actor_id, - upstream_fragment_id, - new_upstream_fragment_id: None, - added_upstream_actor_id: vec![new], - removed_upstream_actor_id: vec![old], - } - }; - - let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( - UpdateMutation { - dispatchers: Default::default(), - merges: merge_updates, - vnode_bitmaps: Default::default(), - dropped_actors: Default::default(), - actor_splits: Default::default(), - actor_new_dispatchers: Default::default(), - }, - )); - barrier_test_env.inject_barrier(&b1, [actor_id]); send!( [untouched, old], Message::Barrier(b1.clone().into_dispatcher()) diff --git a/src/stream/src/executor/receiver.rs b/src/stream/src/executor/receiver.rs index 6cabb79388333..9a99e59214bd5 100644 --- a/src/stream/src/executor/receiver.rs +++ b/src/stream/src/executor/receiver.rs @@ -231,6 +231,35 @@ mod tests { let (upstream_fragment_id, fragment_id) = (10, 18); + // 4. Send a configuration change barrier. + let merge_updates = maplit::hashmap! { + (actor_id, upstream_fragment_id) => MergeUpdate { + actor_id, + upstream_fragment_id, + new_upstream_fragment_id: None, + added_upstream_actor_id: vec![new], + removed_upstream_actor_id: vec![old], + } + }; + + let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( + UpdateMutation { + dispatchers: Default::default(), + merges: merge_updates, + vnode_bitmaps: Default::default(), + dropped_actors: Default::default(), + actor_splits: Default::default(), + actor_new_dispatchers: Default::default(), + }, + )); + + barrier_test_env.inject_barrier(&b1, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let input = new_input( &ctx, metrics.clone(), @@ -297,30 +326,6 @@ mod tests { recv!().unwrap().as_chunk().unwrap(); // We should be able to receive the chunk. assert_recv_pending!(); - // 4. Send a configuration change barrier. 
- let merge_updates = maplit::hashmap! { - (actor_id, upstream_fragment_id) => MergeUpdate { - actor_id, - upstream_fragment_id, - new_upstream_fragment_id: None, - added_upstream_actor_id: vec![new], - removed_upstream_actor_id: vec![old], - } - }; - - let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( - UpdateMutation { - dispatchers: Default::default(), - merges: merge_updates, - vnode_bitmaps: Default::default(), - dropped_actors: Default::default(), - actor_splits: Default::default(), - actor_new_dispatchers: Default::default(), - }, - )); - - barrier_test_env.inject_barrier(&b1, [actor_id]); - send!([new], Message::Barrier(b1.clone().into_dispatcher())); assert_recv_pending!(); // We should not receive the barrier, as new is not the upstream. diff --git a/src/stream/src/task/barrier_manager.rs b/src/stream/src/task/barrier_manager.rs index 88e86a5998758..67fac31ed8f63 100644 --- a/src/stream/src/task/barrier_manager.rs +++ b/src/stream/src/task/barrier_manager.rs @@ -37,8 +37,7 @@ use tonic::{Code, Status}; use self::managed_state::ManagedBarrierState; use crate::error::{IntoUnexpectedExit, StreamError, StreamResult}; use crate::task::{ - ActorHandle, ActorId, AtomicU64Ref, PartialGraphId, SharedContext, StreamEnvironment, - UpDownActorIds, + ActorId, AtomicU64Ref, PartialGraphId, SharedContext, StreamEnvironment, UpDownActorIds, }; mod managed_state; @@ -210,10 +209,6 @@ pub(super) enum LocalActorOperation { handle: ControlStreamHandle, init_request: InitRequest, }, - DropActors { - actors: Vec, - result_sender: oneshot::Sender<()>, - }, TakeReceiver { ids: UpDownActorIds, result_sender: oneshot::Sender>, @@ -228,29 +223,6 @@ pub(super) enum LocalActorOperation { }, } -pub(crate) struct StreamActorManagerState { - /// Each processor runs in a future. Upon receiving a `Terminate` message, they will exit. - /// `handles` store join handles of these futures, and therefore we could wait their - /// termination. - pub(super) handles: HashMap, - - /// Stores all actor information, taken after actor built. - pub(super) actors: HashMap, - - /// Stores all actor tokio runtime monitoring tasks. - pub(super) actor_monitor_tasks: HashMap, -} - -impl StreamActorManagerState { - fn new() -> Self { - Self { - handles: HashMap::new(), - actors: HashMap::new(), - actor_monitor_tasks: HashMap::new(), - } - } -} - pub(crate) struct StreamActorManager { pub(super) env: StreamEnvironment, pub(super) streaming_metrics: Arc, @@ -294,7 +266,7 @@ impl Display for LocalBarrierWorkerDebugInfo<'_> { /// barriers to and collect them from all actors, and finally report the progress. pub(super) struct LocalBarrierWorker { /// Current barrier collection state. - state: ManagedBarrierState, + pub(super) state: ManagedBarrierState, /// Record all unexpected exited actors. 
failure_actors: HashMap, @@ -303,8 +275,6 @@ pub(super) struct LocalBarrierWorker { pub(super) actor_manager: Arc, - pub(super) actor_manager_state: StreamActorManagerState, - pub(super) current_shared_context: Arc, barrier_event_rx: UnboundedReceiver, @@ -328,14 +298,9 @@ impl LocalBarrierWorker { )); Self { failure_actors: HashMap::default(), - state: ManagedBarrierState::new( - actor_manager.env.state_store(), - actor_manager.streaming_metrics.clone(), - actor_manager.await_tree_reg.clone(), - ), + state: ManagedBarrierState::new(actor_manager.clone(), shared_context.clone()), control_stream_handle: ControlStreamHandle::empty(), actor_manager, - actor_manager_state: StreamActorManagerState::new(), current_shared_context: shared_context, barrier_event_rx: event_rx, actor_failure_rx: failure_rx, @@ -345,7 +310,7 @@ impl LocalBarrierWorker { fn to_debug_info(&self) -> LocalBarrierWorkerDebugInfo<'_> { LocalBarrierWorkerDebugInfo { - running_actors: self.actor_manager_state.handles.keys().cloned().collect(), + running_actors: self.state.actor_states.keys().cloned().collect(), managed_barrier_state: self.state.to_debug_info(), has_control_stream_connected: self.control_stream_handle.connected(), } @@ -384,7 +349,7 @@ impl LocalBarrierWorker { }); } LocalActorOperation::Shutdown { result_sender } => { - if !self.actor_manager_state.handles.is_empty() { + if !self.state.actor_states.is_empty() { tracing::warn!( "shutdown with running actors, scaling or migration will be triggered" ); @@ -419,15 +384,9 @@ impl LocalBarrierWorker { Request::InjectBarrier(req) => { let barrier = Barrier::from_protobuf(req.get_barrier().unwrap())?; self.update_actor_info(req.broadcast_info)?; - let actors = req - .actors_to_build - .iter() - .map(|actor| actor.actor.as_ref().unwrap().actor_id) - .collect_vec(); - self.update_actors(req.actors_to_build)?; - self.start_create_actors(&actors)?; self.send_barrier( &barrier, + req.actors_to_build, req.actor_ids_to_collect.into_iter().collect(), req.table_ids_to_sync .into_iter() @@ -484,7 +443,13 @@ impl LocalBarrierWorker { .map_err(|e| (actor_id, e))?; } #[cfg(test)] - LocalBarrierEvent::Flush(sender) => sender.send(()).unwrap(), + LocalBarrierEvent::Flush(sender) => { + use futures::FutureExt; + while let Some(request) = self.control_stream_handle.next_request().now_or_never() { + self.handle_streaming_control_request(request).unwrap(); + } + sender.send(()).unwrap() + } } Ok(()) } @@ -494,13 +459,6 @@ impl LocalBarrierWorker { LocalActorOperation::NewControlStream { .. } | LocalActorOperation::Shutdown { .. } => { unreachable!("event {actor_op} should be handled separately in async context") } - LocalActorOperation::DropActors { - actors, - result_sender, - } => { - self.drop_actors(&actors); - let _ = result_sender.send(()); - } LocalActorOperation::TakeReceiver { ids, result_sender } => { let _ = result_sender.send(self.current_shared_context.take_receiver(ids)); } @@ -596,30 +554,12 @@ impl LocalBarrierWorker { fn send_barrier( &mut self, barrier: &Barrier, + to_build: Vec, to_collect: HashSet, table_ids: HashSet, partial_graph_id: PartialGraphId, actor_ids_to_pre_sync_barrier: HashSet, ) -> StreamResult<()> { - if !cfg!(test) { - // The barrier might be outdated and been injected after recovery in some certain extreme - // scenarios. So some newly creating actors in the barrier are possibly not rebuilt during - // recovery. Check it here and return an error here if some actors are not found to - // avoid collection hang. 
We need some refine in meta side to remove this workaround since - // it will cause another round of unnecessary recovery. - let missing_actor_ids = to_collect - .iter() - .filter(|id| !self.actor_manager_state.handles.contains_key(id)) - .collect_vec(); - if !missing_actor_ids.is_empty() { - tracing::warn!( - "to collect actors not found, they should be cleaned when recovering: {:?}", - missing_actor_ids - ); - return Err(anyhow!("to collect actors not found: {:?}", to_collect).into()); - } - } - if barrier.kind == BarrierKind::Initial { self.actor_manager .watermark_epoch @@ -647,20 +587,12 @@ impl LocalBarrierWorker { self.state.transform_to_issued( barrier, + to_build, to_collect, table_ids, partial_graph_id, actor_ids_to_pre_sync_barrier, )?; - - // Actors to stop should still accept this barrier, but won't get sent to in next times. - if let Some(actors) = barrier.all_stop_actors() { - debug!( - target: "events::stream::barrier::manager", - "remove actors {:?} from senders", - actors - ); - } Ok(()) } diff --git a/src/stream/src/task/barrier_manager/managed_state.rs b/src/stream/src/task/barrier_manager/managed_state.rs index 6f21e32adc107..ddc0c7f13bab5 100644 --- a/src/stream/src/task/barrier_manager/managed_state.rs +++ b/src/stream/src/task/barrier_manager/managed_state.rs @@ -15,7 +15,7 @@ use std::assert_matches::assert_matches; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::{Debug, Display, Formatter}; -use std::future::{poll_fn, Future}; +use std::future::{pending, poll_fn, Future}; use std::mem::replace; use std::sync::Arc; use std::task::{ready, Context, Poll}; @@ -32,16 +32,18 @@ use risingwave_common::util::epoch::EpochPair; use risingwave_hummock_sdk::SyncResult; use risingwave_pb::stream_plan::barrier::BarrierKind; use risingwave_pb::stream_service::barrier_complete_response::CreateMviewProgress; +use risingwave_pb::stream_service::BuildActorInfo; use risingwave_storage::{dispatch_state_store, StateStore, StateStoreImpl}; use thiserror_ext::AsReport; use tokio::sync::mpsc; +use tokio::task::JoinHandle; use super::progress::BackfillState; use super::{BarrierCompleteResult, SubscribeMutationItem}; use crate::error::{StreamError, StreamResult}; use crate::executor::monitor::StreamingMetrics; use crate::executor::{Barrier, Mutation}; -use crate::task::{await_tree_key, ActorId, PartialGraphId}; +use crate::task::{await_tree_key, ActorId, PartialGraphId, SharedContext, StreamActorManager}; struct IssuedState { pub mutation: Option>, @@ -83,12 +85,12 @@ enum ManagedBarrierStateInner { #[derive(Debug)] pub(super) struct BarrierState { - curr_epoch: u64, + barrier: Barrier, inner: ManagedBarrierStateInner, } type AwaitEpochCompletedFuture = - impl Future)> + 'static; + impl Future)> + 'static; fn sync_epoch( state_store: &S, @@ -192,8 +194,6 @@ impl Display for &'_ PartialGraphManagedBarrierState { } enum InflightActorStatus { - /// The actor is just spawned and not issued any barrier yet - NotStarted, /// The actor has been issued some barriers, but has not collected the first barrier IssuedFirst(Vec), /// The actor has been issued some barriers, and has collected the first barrier @@ -201,12 +201,11 @@ enum InflightActorStatus { } impl InflightActorStatus { - fn max_issued_epoch(&self) -> Option { + fn max_issued_epoch(&self) -> u64 { match self { - InflightActorStatus::NotStarted => None, - InflightActorStatus::Running(epoch) => Some(*epoch), + InflightActorStatus::Running(epoch) => *epoch, InflightActorStatus::IssuedFirst(issued_barriers) => { - 
Some(issued_barriers.last().expect("non-empty").epoch.prev) + issued_barriers.last().expect("non-empty").epoch.prev } } } @@ -223,18 +222,35 @@ pub(crate) struct InflightActorState { status: InflightActorStatus, /// Whether the actor has been issued a stop barrier is_stopping: bool, + + join_handle: JoinHandle<()>, + monitor_task_handle: Option>, } impl InflightActorState { - pub(super) fn not_started(actor_id: ActorId) -> Self { + pub(super) fn start( + actor_id: ActorId, + initial_partial_graph_id: PartialGraphId, + initial_barrier: &Barrier, + join_handle: JoinHandle<()>, + monitor_task_handle: Option>, + ) -> Self { Self { actor_id, pending_subscribers: Default::default(), barrier_senders: vec![], - inflight_barriers: BTreeMap::default(), - barrier_mutations: Default::default(), - status: InflightActorStatus::NotStarted, + inflight_barriers: BTreeMap::from_iter([( + initial_barrier.epoch.prev, + initial_partial_graph_id, + )]), + barrier_mutations: BTreeMap::from_iter([( + initial_barrier.epoch.prev, + (initial_barrier.mutation.clone(), initial_barrier.epoch.curr), + )]), + status: InflightActorStatus::IssuedFirst(vec![initial_barrier.clone()]), is_stopping: false, + join_handle, + monitor_task_handle, } } @@ -263,9 +279,7 @@ impl InflightActorState { barrier: &Barrier, is_stop: bool, ) -> StreamResult<()> { - if let Some(max_issued_epoch) = self.status.max_issued_epoch() { - assert!(barrier.epoch.prev > max_issued_epoch); - } + assert!(barrier.epoch.prev > self.status.max_issued_epoch()); if let Some((first_epoch, _)) = self.pending_subscribers.first_key_value() { assert!( @@ -312,9 +326,6 @@ impl InflightActorState { } match &mut self.status { - InflightActorStatus::NotStarted => { - self.status = InflightActorStatus::IssuedFirst(vec![barrier.clone()]); - } InflightActorStatus::IssuedFirst(pending_barriers) => { pending_barriers.push(barrier.clone()); } @@ -338,9 +349,6 @@ impl InflightActorState { let (min_mutation_epoch, _) = self.barrier_mutations.pop_first().expect("should exist"); assert_eq!(min_mutation_epoch, epoch.prev); match &self.status { - InflightActorStatus::NotStarted => { - unreachable!("should have issued a barrier when collect") - } InflightActorStatus::IssuedFirst(pending_barriers) => { assert_eq!( prev_epoch, @@ -419,32 +427,27 @@ impl PartialGraphManagedBarrierState { } } -pub(super) struct ManagedBarrierState { +pub(crate) struct ManagedBarrierState { pub(super) actor_states: HashMap, pub(super) graph_states: HashMap, - pub(super) state_store: StateStoreImpl, - - pub(super) streaming_metrics: Arc, + actor_manager: Arc, - /// Manages the await-trees of all barriers. - barrier_await_tree_reg: Option, + current_shared_context: Arc, } impl ManagedBarrierState { /// Create a barrier manager state. This will be called only once. 
pub(super) fn new( - state_store: StateStoreImpl, - streaming_metrics: Arc, - barrier_await_tree_reg: Option, + actor_manager: Arc, + current_shared_context: Arc, ) -> Self { Self { actor_states: Default::default(), graph_states: Default::default(), - state_store, - streaming_metrics, - barrier_await_tree_reg, + actor_manager, + current_shared_context, } } @@ -453,6 +456,21 @@ impl ManagedBarrierState { graph_states: &self.graph_states, } } + + pub(crate) async fn abort_actors(&mut self) { + for (actor_id, state) in &self.actor_states { + tracing::debug!("force stopping actor {}", actor_id); + state.join_handle.abort(); + if let Some(monitor_task_handle) = &state.monitor_task_handle { + monitor_task_handle.abort(); + } + } + for (actor_id, state) in self.actor_states.drain() { + tracing::debug!("join actor {}", actor_id); + let result = state.join_handle.await; + assert!(result.is_ok() || result.unwrap_err().is_cancelled()); + } + } } impl InflightActorState { @@ -488,17 +506,13 @@ impl InflightActorState { .push(tx); } } else { - // Barrier has not issued yet. Store the pending tx - if let Some(max_issued_epoch) = self.status.max_issued_epoch() { - assert!( - max_issued_epoch < start_prev_epoch, - "later barrier {} has been issued, but skip the start epoch {:?}", - max_issued_epoch, - start_prev_epoch - ); - } else { - assert!(!self.is_stopping, "actor has been stopped and has not inflight barrier. unlikely to get further barrier"); - } + let max_issued_epoch = self.status.max_issued_epoch(); + assert!( + max_issued_epoch < start_prev_epoch, + "later barrier {} has been issued, but skip the start epoch {:?}", + max_issued_epoch, + start_prev_epoch + ); self.pending_subscribers .entry(start_prev_epoch) .or_default() @@ -511,9 +525,6 @@ impl InflightActorState { tx: mpsc::UnboundedSender, ) -> StreamResult<()> { match &self.status { - InflightActorStatus::NotStarted => { - self.barrier_senders.push(tx); - } InflightActorStatus::IssuedFirst(pending_barriers) => { for barrier in pending_barriers { tx.send(barrier.clone()).map_err(|_| { @@ -542,8 +553,8 @@ impl ManagedBarrierState { tx: mpsc::UnboundedSender, ) { self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) + .get_mut(&actor_id) + .expect("should exist") .subscribe_actor_mutation(start_prev_epoch, tx); } @@ -553,53 +564,105 @@ impl ManagedBarrierState { tx: mpsc::UnboundedSender, ) -> StreamResult<()> { self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) + .get_mut(&actor_id) + .expect("should exist") .register_barrier_sender(tx) } pub(super) fn transform_to_issued( &mut self, barrier: &Barrier, + actors_to_build: Vec, actor_ids_to_collect: HashSet, table_ids: HashSet, partial_graph_id: PartialGraphId, actor_ids_to_pre_sync_barrier: HashSet, ) -> StreamResult<()> { let actor_to_stop = barrier.all_stop_actors(); + let is_stop_actor = |actor_id| { + actor_to_stop + .map(|actors| actors.contains(&actor_id)) + .unwrap_or(false) + }; let graph_state = self .graph_states .entry(partial_graph_id) .or_insert_with(|| { PartialGraphManagedBarrierState::new( - self.state_store.clone(), - self.streaming_metrics.clone(), - self.barrier_await_tree_reg.clone(), + self.actor_manager.env.state_store(), + self.actor_manager.streaming_metrics.clone(), + self.actor_manager.await_tree_reg.clone(), ) }); graph_state.transform_to_issued(barrier, actor_ids_to_collect.clone(), table_ids); + let mut new_actors = HashSet::new(); + for actor in actors_to_build { + let 
actor_id = actor.actor.as_ref().unwrap().actor_id; + assert!(!is_stop_actor(actor_id)); + assert!(new_actors.insert(actor_id)); + assert!(actor_ids_to_collect.contains(&actor_id)); + let (join_handle, monitor_join_handle) = self + .actor_manager + .spawn_actor(actor, self.current_shared_context.clone()); + assert!(self + .actor_states + .try_insert( + actor_id, + InflightActorState::start( + actor_id, + partial_graph_id, + barrier, + join_handle, + monitor_join_handle + ) + ) + .is_ok()); + } + + // Spawn a trivial join handle to be compatible with the unit test + if cfg!(test) { + for actor_id in &actor_ids_to_collect { + if !self.actor_states.contains_key(actor_id) { + let join_handle = self.actor_manager.runtime.spawn(async { pending().await }); + assert!(self + .actor_states + .try_insert( + *actor_id, + InflightActorState::start( + *actor_id, + partial_graph_id, + barrier, + join_handle, + None, + ) + ) + .is_ok()); + new_actors.insert(*actor_id); + } + } + } + // Note: it's important to issue barrier to actor after issuing to graph to ensure that // we call `start_epoch` on the graph before the actors receive the barrier - for actor_id in actor_ids_to_collect { + for actor_id in &actor_ids_to_collect { + if new_actors.contains(actor_id) { + continue; + } self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) - .issue_barrier( - partial_graph_id, - barrier, - actor_to_stop - .map(|actors| actors.contains(&actor_id)) - .unwrap_or(false), - )?; + .get_mut(actor_id) + .unwrap_or_else(|| { + panic!("should exist: {} {:?}", actor_id, actor_ids_to_collect); + }) + .issue_barrier(partial_graph_id, barrier, is_stop_actor(*actor_id))?; } if partial_graph_id.is_global_graph() { for actor_id in actor_ids_to_pre_sync_barrier { self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) + .get_mut(&actor_id) + .expect("should exist") .sync_barrier(barrier); } } else { @@ -613,9 +676,12 @@ impl ManagedBarrierState { ) -> impl Future + '_ { poll_fn(|cx| { for (partial_graph_id, graph_state) in &mut self.graph_states { - if let Poll::Ready(epoch) = graph_state.poll_next_completed_epoch(cx) { + if let Poll::Ready(barrier) = graph_state.poll_next_completed_barrier(cx) { + if let Some(actors_to_stop) = barrier.all_stop_actors() { + self.current_shared_context.drop_actors(actors_to_stop); + } let partial_graph_id = *partial_graph_id; - return Poll::Ready((partial_graph_id, epoch)); + return Poll::Ready((partial_graph_id, barrier.epoch.prev)); } } Poll::Pending @@ -629,7 +695,10 @@ impl ManagedBarrierState { .expect("should exist") .collect(epoch); if is_finished { - self.actor_states.remove(&actor_id); + let state = self.actor_states.remove(&actor_id).expect("should exist"); + if let Some(monitor_task_handle) = state.monitor_task_handle { + monitor_task_handle.abort(); + } } let prev_graph_state = self .graph_states @@ -680,7 +749,7 @@ impl PartialGraphManagedBarrierState { let create_mview_progress = self .create_mview_progress - .remove(&barrier_state.curr_epoch) + .remove(&barrier_state.barrier.epoch.curr) .unwrap_or_default() .into_iter() .map(|(actor, state)| CreateMviewProgress { @@ -688,7 +757,7 @@ impl PartialGraphManagedBarrierState { done: matches!(state, BackfillState::Done(_)), consumed_epoch: match state { BackfillState::ConsumingUpstream(consumed_epoch, _) => consumed_epoch, - BackfillState::Done(_) => barrier_state.curr_epoch, + BackfillState::Done(_) => barrier_state.barrier.epoch.curr, }, 
consumed_rows: match state { BackfillState::ConsumingUpstream(_, consumed_rows) => consumed_rows, @@ -727,6 +796,8 @@ impl PartialGraphManagedBarrierState { } }; + let barrier = barrier_state.barrier.clone(); + self.await_epoch_completed_futures.push_back({ let future = async move { if let Some(future) = complete_barrier_future { @@ -738,7 +809,7 @@ impl PartialGraphManagedBarrierState { } .map(move |result| { ( - prev_epoch, + barrier, result.map(|sync_result| BarrierCompleteResult { sync_result, create_mview_progress, @@ -778,7 +849,7 @@ impl PartialGraphManagedBarrierState { ) } Some(&mut BarrierState { - curr_epoch, + ref barrier, inner: ManagedBarrierStateInner::Issued(IssuedState { ref mut remaining_actors, @@ -792,7 +863,7 @@ impl PartialGraphManagedBarrierState { "the actor doesn't exist. actor_id: {:?}, curr_epoch: {:?}", actor_id, epoch.curr ); - assert_eq!(curr_epoch, epoch.curr); + assert_eq!(barrier.epoch.curr, epoch.curr); self.may_have_collected_all(epoch.prev); } Some(BarrierState { inner, .. }) => { @@ -874,7 +945,7 @@ impl PartialGraphManagedBarrierState { self.epoch_barrier_state_map.insert( barrier.epoch.prev, BarrierState { - curr_epoch: barrier.epoch.curr, + barrier: barrier.clone(), inner: ManagedBarrierStateInner::Issued(IssuedState { remaining_actors: BTreeSet::from_iter(actor_ids_to_collect), mutation: barrier.mutation.clone(), @@ -888,17 +959,17 @@ impl PartialGraphManagedBarrierState { } /// Return a future that yields the next completed epoch. The future is cancellation safe. - pub(crate) fn poll_next_completed_epoch(&mut self, cx: &mut Context<'_>) -> Poll { + pub(crate) fn poll_next_completed_barrier(&mut self, cx: &mut Context<'_>) -> Poll { ready!(self.await_epoch_completed_futures.next().poll_unpin(cx)) - .map(|(prev_epoch, result)| { + .map(|(barrier, result)| { let state = self .epoch_barrier_state_map - .get_mut(&prev_epoch) + .get_mut(&barrier.epoch.prev) .expect("should exist"); // sanity check on barrier state assert_matches!(&state.inner, ManagedBarrierStateInner::AllCollected); state.inner = ManagedBarrierStateInner::Completed(result); - prev_epoch + barrier }) .map(Poll::Ready) .unwrap_or(Poll::Pending) @@ -944,9 +1015,12 @@ impl PartialGraphManagedBarrierState { #[cfg(test)] async fn pop_next_completed_epoch(&mut self) -> u64 { - let epoch = poll_fn(|cx| self.poll_next_completed_epoch(cx)).await; - let _ = self.pop_completed_epoch(epoch).unwrap().unwrap(); - epoch + let barrier = poll_fn(|cx| self.poll_next_completed_barrier(cx)).await; + let _ = self + .pop_completed_epoch(barrier.epoch.prev) + .unwrap() + .unwrap(); + barrier.epoch.prev } } diff --git a/src/stream/src/task/barrier_manager/tests.rs b/src/stream/src/task/barrier_manager/tests.rs index d6a8256aebb61..112ee533d8e6d 100644 --- a/src/stream/src/task/barrier_manager/tests.rs +++ b/src/stream/src/task/barrier_manager/tests.rs @@ -40,19 +40,22 @@ async fn test_managed_barrier_collection() -> StreamResult<()> { // Register actors let actor_ids = vec![233, 234, 235]; - let count = actor_ids.len(); - let mut rxs = actor_ids - .clone() - .into_iter() - .map(register_sender) - .collect_vec(); // Send a barrier to all actors let curr_epoch = test_epoch(2); let barrier = Barrier::new_test_barrier(curr_epoch); let epoch = barrier.epoch.prev; - test_env.inject_barrier(&barrier, actor_ids); + test_env.inject_barrier(&barrier, actor_ids.clone()); + + manager.flush_all_events().await; + + let count = actor_ids.len(); + let mut rxs = actor_ids + .clone() + .into_iter() + .map(register_sender) + 
.collect_vec(); // Collect barriers from actors let collected_barriers = join_all(rxs.iter_mut().map(|(actor_id, rx)| async move { @@ -105,6 +108,14 @@ async fn test_managed_barrier_collection_separately() -> StreamResult<()> { .chain(once(extra_actor_id)) .collect_vec(); + // Prepare the barrier + let curr_epoch = test_epoch(2); + let barrier = Barrier::new_test_barrier(curr_epoch).with_stop(); + + test_env.inject_barrier(&barrier, actor_ids_to_collect.clone()); + + manager.flush_all_events().await; + // Register actors let count = actor_ids_to_send.len(); let mut rxs = actor_ids_to_send @@ -113,10 +124,6 @@ async fn test_managed_barrier_collection_separately() -> StreamResult<()> { .map(register_sender) .collect_vec(); - // Prepare the barrier - let curr_epoch = test_epoch(2); - let barrier = Barrier::new_test_barrier(curr_epoch).with_stop(); - let mut mutation_subscriber = manager.subscribe_barrier_mutation(extra_actor_id, &barrier.clone().into_dispatcher()); @@ -124,8 +131,6 @@ async fn test_managed_barrier_collection_separately() -> StreamResult<()> { let mut mutation_reader = pin!(mutation_subscriber.recv()); assert!(poll_fn(|cx| Poll::Ready(mutation_reader.as_mut().poll(cx).is_pending())).await); - test_env.inject_barrier(&barrier, actor_ids_to_collect); - let (epoch, mutation) = mutation_reader.await.unwrap(); assert_eq!((epoch, &mutation), (barrier.epoch.prev, &barrier.mutation)); @@ -196,6 +201,8 @@ async fn test_late_register_barrier_sender() -> StreamResult<()> { test_env.inject_barrier(&barrier1, actor_ids_to_collect.clone()); test_env.inject_barrier(&barrier2, actor_ids_to_collect.clone()); + manager.flush_all_events().await; + // register sender after inject barrier let mut rxs = actor_ids_to_send .clone() diff --git a/src/stream/src/task/mod.rs b/src/stream/src/task/mod.rs index b5382b3418052..59851fdf09ad8 100644 --- a/src/stream/src/task/mod.rs +++ b/src/stream/src/task/mod.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use anyhow::anyhow; use parking_lot::{MappedMutexGuard, Mutex, MutexGuard, RwLock}; @@ -194,7 +194,7 @@ impl SharedContext { &self.config } - pub fn drop_actors(&self, actors: &[ActorId]) { + pub(super) fn drop_actors(&self, actors: &HashSet) { self.channel_map .lock() .retain(|(up_id, _), _| !actors.contains(up_id)); diff --git a/src/stream/src/task/stream_manager.rs b/src/stream/src/task/stream_manager.rs index 60b7341371497..ba76e6fab791d 100644 --- a/src/stream/src/task/stream_manager.rs +++ b/src/stream/src/task/stream_manager.rs @@ -19,7 +19,6 @@ use std::sync::atomic::AtomicU64; use std::sync::Arc; use std::time::Instant; -use anyhow::anyhow; use async_recursion::async_recursion; use await_tree::InstrumentAwait; use futures::stream::BoxStream; @@ -59,8 +58,8 @@ use crate::task::barrier_manager::{ ControlStreamHandle, EventSender, LocalActorOperation, LocalBarrierWorker, }; use crate::task::{ - ActorId, FragmentId, LocalBarrierManager, SharedContext, StreamActorManager, - StreamActorManagerState, StreamEnvironment, UpDownActorIds, + ActorId, FragmentId, LocalBarrierManager, SharedContext, StreamActorManager, StreamEnvironment, + UpDownActorIds, }; #[cfg(test)] @@ -214,16 +213,6 @@ impl LocalStreamManager { }) } - /// Drop the resources of the given actors. 
- pub async fn drop_actors(&self, actors: Vec) -> StreamResult<()> { - self.actor_op_tx - .send_and_await(|result_sender| LocalActorOperation::DropActors { - actors, - result_sender, - }) - .await - } - pub async fn take_receiver(&self, ids: UpDownActorIds) -> StreamResult { self.actor_op_tx .send_and_await(|result_sender| LocalActorOperation::TakeReceiver { @@ -256,28 +245,9 @@ impl LocalStreamManager { } impl LocalBarrierWorker { - /// Drop the resources of the given actors. - pub(super) fn drop_actors(&mut self, actors: &[ActorId]) { - self.current_shared_context.drop_actors(actors); - for &id in actors { - self.actor_manager_state.drop_actor(id); - } - tracing::debug!(actors = ?actors, "drop actors"); - } - /// Force stop all actors on this worker, and then drop their resources. pub(super) async fn reset(&mut self, version_id: HummockVersionId) { - let actor_handles = self.actor_manager_state.drain_actor_handles(); - for (actor_id, handle) in &actor_handles { - tracing::debug!("force stopping actor {}", actor_id); - handle.abort(); - } - for (actor_id, handle) in actor_handles { - tracing::debug!("join actor {}", actor_id); - let result = handle.await; - assert!(result.is_ok() || result.unwrap_err().is_cancelled()); - } - self.actor_manager_state.clear_state(); + self.state.abort_actors().await; if let Some(m) = self.actor_manager.await_tree_reg.as_ref() { m.clear(); } @@ -291,26 +261,6 @@ impl LocalBarrierWorker { self.reset_state(); self.actor_manager.env.dml_manager_ref().clear(); } - - pub(super) fn update_actors(&mut self, actors: Vec) -> StreamResult<()> { - self.actor_manager_state.update_actors(actors) - } - - /// This function could only be called once during the lifecycle of `LocalStreamManager` for - /// now. - pub(super) fn start_create_actors(&mut self, actors: &[ActorId]) -> StreamResult<()> { - let actors: Vec<_> = actors - .iter() - .map(|actor_id| { - self.actor_manager_state - .actors - .remove(actor_id) - .ok_or_else(|| anyhow!("No such actor with actor id:{}", actor_id)) - }) - .try_collect()?; - self.spawn_actors(actors); - Ok(()) - } } impl StreamActorManager { @@ -559,18 +509,22 @@ impl StreamActorManager { } } -impl LocalBarrierWorker { - pub(super) fn spawn_actors(&mut self, actors: Vec) { - for actor in actors { +impl StreamActorManager { + pub(super) fn spawn_actor( + self: &Arc, + actor: BuildActorInfo, + current_shared_context: Arc, + ) -> (JoinHandle<()>, Option>) { + { let monitor = tokio_metrics::TaskMonitor::new(); let stream_actor_ref = actor.actor.as_ref().unwrap(); let actor_id = stream_actor_ref.actor_id; let handle = { let trace_span = format!("Actor {actor_id}: `{}`", stream_actor_ref.mview_definition); - let barrier_manager = self.current_shared_context.local_barrier_manager.clone(); + let barrier_manager = current_shared_context.local_barrier_manager.clone(); // wrap the future of `create_actor` with `boxed` to avoid stack overflow - let actor = self.actor_manager.clone().create_actor(actor, self.current_shared_context.clone()).boxed().and_then(|actor| actor.run()).map(move |result| { + let actor = self.clone().create_actor(actor, current_shared_context).boxed().and_then(|actor| actor.run()).map(move |result| { if let Err(err) = result { // TODO: check error type and panic if it's unexpected. // Intentionally use `?` on the report to also include the backtrace. 
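// A minimal sketch of the ownership pattern introduced by this refactor: `spawn_actor`
// hands the task's `JoinHandle` (plus an optional metrics-monitor handle) back to the
// caller, which keeps them per actor and aborts them together on reset. The types below
// are illustrative stand-ins, not the actual RisingWave definitions.

use std::collections::HashMap;
use tokio::task::JoinHandle;

struct ActorState {
    join_handle: JoinHandle<()>,
    monitor_task_handle: Option<JoinHandle<()>>,
}

#[derive(Default)]
struct ActorRegistry {
    actors: HashMap<u32, ActorState>,
}

impl ActorRegistry {
    /// Spawn an actor task (and optionally a metrics monitor) and keep both handles.
    fn spawn(&mut self, actor_id: u32) {
        let join_handle = tokio::spawn(async move { /* run the actor to completion */ });
        let monitor_task_handle = Some(tokio::spawn(async move { /* sample task metrics */ }));
        self.actors
            .insert(actor_id, ActorState { join_handle, monitor_task_handle });
    }

    /// Mirrors the `abort_actors` flow above: abort every task first, then await each
    /// handle so that either normal completion or cancellation is observed.
    async fn abort_all(&mut self) {
        for state in self.actors.values() {
            state.join_handle.abort();
            if let Some(monitor) = &state.monitor_task_handle {
                monitor.abort();
            }
        }
        for (_, state) in self.actors.drain() {
            let result = state.join_handle.await;
            assert!(result.is_ok() || result.unwrap_err().is_cancelled());
        }
    }
}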
@@ -578,7 +532,7 @@ impl LocalBarrierWorker { barrier_manager.notify_failure(actor_id, err); } }); - let traced = match &self.actor_manager.await_tree_reg { + let traced = match &self.await_tree_reg { Some(m) => m .register(await_tree_key::Actor(actor_id), trace_span) .instrument(actor) @@ -586,24 +540,17 @@ impl LocalBarrierWorker { None => actor.right_future(), }; let instrumented = monitor.instrument(traced); - let with_config = - crate::CONFIG.scope(self.actor_manager.env.config().clone(), instrumented); + let with_config = crate::CONFIG.scope(self.env.config().clone(), instrumented); - self.actor_manager.runtime.spawn(with_config) + self.runtime.spawn(with_config) }; - self.actor_manager_state.handles.insert(actor_id, handle); - - if self.actor_manager.streaming_metrics.level >= MetricLevel::Debug - || self - .actor_manager - .env - .config() - .developer - .enable_actor_tokio_metrics + + let monitor_handle = if self.streaming_metrics.level >= MetricLevel::Debug + || self.env.config().developer.enable_actor_tokio_metrics { tracing::info!("Tokio metrics are enabled."); - let streaming_metrics = self.actor_manager.streaming_metrics.clone(); - let actor_monitor_task = self.actor_manager.runtime.spawn(async move { + let streaming_metrics = self.streaming_metrics.clone(); + let actor_monitor_task = self.runtime.spawn(async move { let metrics = streaming_metrics.new_actor_metrics(actor_id); loop { let task_metrics = monitor.cumulative(); @@ -643,10 +590,11 @@ impl LocalBarrierWorker { tokio::time::sleep(Duration::from_secs(1)).await; } }); - self.actor_manager_state - .actor_monitor_tasks - .insert(actor_id, actor_monitor_task); - } + Some(actor_monitor_task) + } else { + None + }; + (handle, monitor_handle) } } } @@ -671,44 +619,6 @@ impl LocalBarrierWorker { } } -impl StreamActorManagerState { - /// `drop_actor` is invoked by meta node via RPC once the stop barrier arrives at the - /// sink. All the actors in the actors should stop themselves before this method is invoked. - fn drop_actor(&mut self, actor_id: ActorId) { - self.actor_monitor_tasks - .remove(&actor_id) - .inspect(|handle| handle.abort()); - self.actors.remove(&actor_id); - - // Task should have already stopped when this method is invoked. There might be some - // clean-up work left (like dropping in-memory data structures), but we don't have to wait - // for them to finish, in order to make this request non-blocking. - self.handles.remove(&actor_id); - } - - fn drain_actor_handles(&mut self) -> Vec<(ActorId, ActorHandle)> { - self.handles.drain().collect() - } - - /// `stop_all_actors` is invoked by meta node via RPC for recovery purpose. Different from the - /// `drop_actor`, the execution of the actors will be aborted. 
- fn clear_state(&mut self) { - self.actors.clear(); - self.actor_monitor_tasks.clear(); - } - - fn update_actors(&mut self, actors: Vec) -> StreamResult<()> { - for actor in actors { - let actor_id = actor.actor.as_ref().unwrap().get_actor_id(); - self.actors - .try_insert(actor_id, actor) - .map_err(|_| anyhow!("duplicated actor {}", actor_id))?; - } - - Ok(()) - } -} - #[cfg(test)] pub mod test_utils { use risingwave_pb::common::HostAddress; From 3762877b55bb5187533023b582c0f7f8901f0323 Mon Sep 17 00:00:00 2001 From: Li0k Date: Fri, 6 Sep 2024 16:06:49 +0800 Subject: [PATCH 15/32] fix(storage): fix commit_epoch new_sst_id_number (#18438) --- src/meta/src/hummock/manager/commit_epoch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meta/src/hummock/manager/commit_epoch.rs b/src/meta/src/hummock/manager/commit_epoch.rs index 4f7a62da41779..e7bca768437f2 100644 --- a/src/meta/src/hummock/manager/commit_epoch.rs +++ b/src/meta/src/hummock/manager/commit_epoch.rs @@ -397,7 +397,7 @@ impl HummockManager { } } - new_sst_id_number += group_table_ids.len(); + new_sst_id_number += group_table_ids.len() * 2; // `split_sst` will split the SST into two parts and consumer 2 SST IDs sst_to_cg_vec.push((commit_sst, group_table_ids)); } From e2c89f418e8dbe5fd5b078ea8d57288f438d49d0 Mon Sep 17 00:00:00 2001 From: xxchan Date: Fri, 6 Sep 2024 16:24:12 +0800 Subject: [PATCH 16/32] feat: support scale SourceBackfill executor (#16825) Signed-off-by: xxchan --- .typos.toml | 1 + Cargo.lock | 1 + ci/scripts/e2e-source-test.sh | 2 +- e2e_test/commands/risectl | 3 + .../source_inline/kafka/shared_source.slt | 81 ++++++++ src/ctl/src/cmd_impl/meta/cluster_info.rs | 96 ++++++++- src/ctl/src/lib.rs | 9 +- src/meta/src/rpc/ddl_controller.rs | 2 + src/meta/src/stream/scale.rs | 111 ++++++++-- src/meta/src/stream/source_manager.rs | 57 +++++- src/tests/simulation/Cargo.toml | 1 + src/tests/simulation/src/cluster.rs | 29 +-- src/tests/simulation/src/ctl_ext.rs | 34 +++- .../tests/integration_tests/scale/mod.rs | 1 + .../integration_tests/scale/shared_source.rs | 192 ++++++++++++++++++ 15 files changed, 558 insertions(+), 62 deletions(-) create mode 100755 e2e_test/commands/risectl create mode 100644 src/tests/simulation/tests/integration_tests/scale/shared_source.rs diff --git a/.typos.toml b/.typos.toml index 4d4bbfca1c082..498d954a55d88 100644 --- a/.typos.toml +++ b/.typos.toml @@ -36,4 +36,5 @@ extend-exclude = [ # We don't want to fix "fals" here, but may want in other places. 
# Ideally, we should just ignore that line: https://github.com/crate-ci/typos/issues/316 "src/common/src/cast/mod.rs", + "src/tests/simulation/tests/integration_tests/scale/shared_source.rs", ] diff --git a/Cargo.lock b/Cargo.lock index 99423896f89f2..32c5fe29fc5aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11561,6 +11561,7 @@ dependencies = [ "madsim-etcd-client", "madsim-rdkafka", "madsim-tokio", + "maplit", "paste", "pin-project", "pretty_assertions", diff --git a/ci/scripts/e2e-source-test.sh b/ci/scripts/e2e-source-test.sh index 56a06ac756931..29f2a0ac7b5ce 100755 --- a/ci/scripts/e2e-source-test.sh +++ b/ci/scripts/e2e-source-test.sh @@ -130,7 +130,7 @@ echo "> inserted new rows into postgres" # start cluster w/o clean-data unset RISINGWAVE_CI -export RUST_LOG="risingwave_stream=debug,risingwave_batch=info,risingwave_storage=info" \ +export RUST_LOG="risingwave_stream=debug,risingwave_batch=info,risingwave_storage=info" risedev dev ci-1cn-1fe-with-recovery echo "> wait for cluster recovery finish" diff --git a/e2e_test/commands/risectl b/e2e_test/commands/risectl new file mode 100755 index 0000000000000..2bb462d83fbab --- /dev/null +++ b/e2e_test/commands/risectl @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +RUST_LOG="error" .risingwave/bin/risingwave/risectl "$@" diff --git a/e2e_test/source_inline/kafka/shared_source.slt b/e2e_test/source_inline/kafka/shared_source.slt index 51a9f1e5ee1b3..5d1072df2cfaa 100644 --- a/e2e_test/source_inline/kafka/shared_source.slt +++ b/e2e_test/source_inline/kafka/shared_source.slt @@ -230,6 +230,87 @@ internal_table.mjs --name s0 --type source 3,"{""split_info"": {""partition"": 3, ""start_offset"": 11, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" +# # Note: the parallelism depends on the risedev profile. +# # So scale tests below are commented out. + +# query ??? +# select name, flags, parallelism from rw_fragments JOIN rw_relations ON rw_fragments.table_id = rw_relations.id order by name; +# ---- +# mv_1 {MVIEW,SOURCE_SCAN} 5 +# mv_2 {MVIEW,SOURCE_SCAN} 5 +# s0 {SOURCE} 5 + + +# system ok +# risectl meta source-split-info --ignore-id +# ---- +# Table +# Fragment (Source) +# Actor (1 splits): [0] +# Actor (1 splits): [2] +# Actor (1 splits): [3] +# Actor (1 splits): [1] +# Actor (0 splits): [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] + + +# # scale down +# statement ok +# ALTER MATERIALIZED VIEW mv_1 SET PARALLELISM TO 2; + +# # should have no effect, because of NoShuffle +# # TODO: support ALTER SOURCE SET PARALLELISM, then we can +# query ??? 
+# select name, flags, parallelism from rw_fragments JOIN rw_relations ON rw_fragments.table_id = rw_relations.id order by name; +# ---- +# mv_1 {MVIEW,SOURCE_SCAN} 5 +# mv_2 {MVIEW,SOURCE_SCAN} 5 +# s0 {SOURCE} 5 + +# system ok +# risectl meta source-split-info --ignore-id +# ---- +# Table +# Fragment (Source) +# Actor (1 splits): [0] +# Actor (1 splits): [2] +# Actor (1 splits): [3] +# Actor (1 splits): [1] +# Actor (0 splits): [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] + + +# # Manual test: change the parallelism of the compute node, kill and restart, and check +# # risedev ctl meta source-split-info --ignore-id +# # risedev psql -c "select name, flags, parallelism from rw_fragments JOIN rw_relations ON rw_fragments.table_id = rw_relations.id order by name;" + + statement ok drop source s0 cascade; diff --git a/src/ctl/src/cmd_impl/meta/cluster_info.rs b/src/ctl/src/cmd_impl/meta/cluster_info.rs index cbc21ca6ec610..76b91d37fbd3c 100644 --- a/src/ctl/src/cmd_impl/meta/cluster_info.rs +++ b/src/ctl/src/cmd_impl/meta/cluster_info.rs @@ -31,7 +31,7 @@ pub async fn get_cluster_info(context: &CtlContext) -> anyhow::Result anyhow::Result<()> { +pub async fn source_split_info(context: &CtlContext, ignore_id: bool) -> anyhow::Result<()> { let GetClusterInfoResponse { worker_nodes: _, source_infos: _, @@ -40,37 +40,113 @@ pub async fn source_split_info(context: &CtlContext) -> anyhow::Result<()> { revision: _, } = get_cluster_info(context).await?; + let mut actor_splits_map: BTreeMap = BTreeMap::new(); + + // build actor_splits_map for table_fragment in &table_fragments { if table_fragment.actor_splits.is_empty() { continue; } - println!("Table #{}", table_fragment.table_id); - for fragment in table_fragment.fragments.values() { let fragment_type_mask = fragment.fragment_type_mask; if fragment_type_mask & FragmentTypeFlag::Source as u32 == 0 - || fragment_type_mask & FragmentTypeFlag::Dml as u32 != 0 + && fragment_type_mask & FragmentTypeFlag::SourceScan as u32 == 0 { + // no source or source backfill + continue; + } + if fragment_type_mask & FragmentTypeFlag::Dml as u32 != 0 { // skip dummy source for dml fragment continue; } - println!("\tFragment #{}", fragment.fragment_id); for actor in &fragment.actors { if let Some(ConnectorSplits { splits }) = actor_splits.remove(&actor.actor_id) { let splits = splits .iter() .map(|split| SplitImpl::try_from(split).unwrap()) .map(|split| split.id()) - .collect_vec(); + .collect_vec() + .join(","); + actor_splits_map.insert(actor.actor_id, (splits.len(), splits)); + } + } + } + } + // print in the second iteration. 
Otherwise we don't have upstream splits info + for table_fragment in &table_fragments { + if table_fragment.actor_splits.is_empty() { + continue; + } + if ignore_id { + println!("Table"); + } else { + println!("Table #{}", table_fragment.table_id); + } + for fragment in table_fragment.fragments.values() { + let fragment_type_mask = fragment.fragment_type_mask; + if fragment_type_mask & FragmentTypeFlag::Source as u32 == 0 + && fragment_type_mask & FragmentTypeFlag::SourceScan as u32 == 0 + { + // no source or source backfill + continue; + } + if fragment_type_mask & FragmentTypeFlag::Dml as u32 != 0 { + // skip dummy source for dml fragment + continue; + } + + println!( + "\tFragment{} ({})", + if ignore_id { + "".to_string() + } else { + format!(" #{}", fragment.fragment_id) + }, + if fragment_type_mask == FragmentTypeFlag::Source as u32 { + "Source" + } else { + "SourceScan" + } + ); + for actor in &fragment.actors { + if let Some((split_count, splits)) = actor_splits_map.get(&actor.actor_id) { println!( - "\t\tActor #{:<3} ({}): [{}]", - actor.actor_id, - splits.len(), - splits.join(",") + "\t\tActor{} ({} splits): [{}]{}", + if ignore_id { + "".to_string() + } else { + format!(" #{:<3}", actor.actor_id,) + }, + split_count, + splits, + if !actor.upstream_actor_id.is_empty() { + assert!( + actor.upstream_actor_id.len() == 1, + "should have only one upstream actor, got {actor:?}" + ); + let upstream_splits = + actor_splits_map.get(&actor.upstream_actor_id[0]).unwrap(); + format!( + " <- Upstream Actor{}: [{}]", + if ignore_id { + "".to_string() + } else { + format!(" #{}", actor.upstream_actor_id[0]) + }, + upstream_splits.1 + ) + } else { + "".to_string() + } ); + } else { + println!( + "\t\tError: Actor #{:<3} (not found in actor_splits)", + actor.actor_id, + ) } } } diff --git a/src/ctl/src/lib.rs b/src/ctl/src/lib.rs index d1deba4f99140..34c5be6ace21b 100644 --- a/src/ctl/src/lib.rs +++ b/src/ctl/src/lib.rs @@ -404,7 +404,10 @@ enum MetaCommands { /// get cluster info ClusterInfo, /// get source split info - SourceSplitInfo, + SourceSplitInfo { + #[clap(long)] + ignore_id: bool, + }, /// Reschedule the actors in the stream graph /// /// The format is `fragment_id-[worker_id:count]+[worker_id:count]` @@ -808,8 +811,8 @@ async fn start_impl(opts: CliOpts, context: &CtlContext) -> Result<()> { Commands::Meta(MetaCommands::Pause) => cmd_impl::meta::pause(context).await?, Commands::Meta(MetaCommands::Resume) => cmd_impl::meta::resume(context).await?, Commands::Meta(MetaCommands::ClusterInfo) => cmd_impl::meta::cluster_info(context).await?, - Commands::Meta(MetaCommands::SourceSplitInfo) => { - cmd_impl::meta::source_split_info(context).await? + Commands::Meta(MetaCommands::SourceSplitInfo { ignore_id }) => { + cmd_impl::meta::source_split_info(context, ignore_id).await? 
} Commands::Meta(MetaCommands::Reschedule { from, diff --git a/src/meta/src/rpc/ddl_controller.rs b/src/meta/src/rpc/ddl_controller.rs index feb7a959083bb..e1605b0aa61dc 100644 --- a/src/meta/src/rpc/ddl_controller.rs +++ b/src/meta/src/rpc/ddl_controller.rs @@ -368,12 +368,14 @@ impl DdlController { } } + #[tracing::instrument(skip(self), level = "debug")] pub async fn alter_parallelism( &self, table_id: u32, parallelism: PbTableParallelism, mut deferred: bool, ) -> MetaResult<()> { + tracing::info!("alter parallelism"); if self.barrier_manager.check_status_running().is_err() { tracing::info!( "alter parallelism is set to deferred mode because the system is in recovery state" diff --git a/src/meta/src/stream/scale.rs b/src/meta/src/stream/scale.rs index 5fc916ceaaa48..4c296bd772467 100644 --- a/src/meta/src/stream/scale.rs +++ b/src/meta/src/stream/scale.rs @@ -183,17 +183,26 @@ impl CustomFragmentInfo { } } +use educe::Educe; + +// The debug implementation is arbitrary. Just used in debug logs. +#[derive(Educe)] +#[educe(Debug)] pub struct RescheduleContext { /// Meta information for all Actors + #[educe(Debug(ignore))] actor_map: HashMap, /// Status of all Actors, used to find the location of the `Actor` actor_status: BTreeMap, /// Meta information of all `Fragment`, used to find the `Fragment`'s `Actor` + #[educe(Debug(ignore))] fragment_map: HashMap, /// Index of all `Actor` upstreams, specific to `Dispatcher` upstream_dispatchers: HashMap>, - /// Fragments with stream source + /// Fragments with `StreamSource` stream_source_fragment_ids: HashSet, + /// Fragments with `StreamSourceBackfill` + stream_source_backfill_fragment_ids: HashSet, /// Target fragments in `NoShuffle` relation no_shuffle_target_fragment_ids: HashSet, /// Source fragments in `NoShuffle` relation @@ -768,6 +777,7 @@ impl ScaleController { } let mut stream_source_fragment_ids = HashSet::new(); + let mut stream_source_backfill_fragment_ids = HashSet::new(); let mut no_shuffle_reschedule = HashMap::new(); for (fragment_id, WorkerReschedule { worker_actor_diff }) in &*reschedule { let fragment = fragment_map @@ -796,6 +806,7 @@ impl ScaleController { // correspondence, so we need to clone the reschedule plan to the downstream of all // cascading relations. if no_shuffle_source_fragment_ids.contains(fragment_id) { + // This fragment is a NoShuffle's upstream. let mut queue: VecDeque<_> = fragment_dispatcher_map .get(fragment_id) .unwrap() @@ -885,6 +896,17 @@ impl ScaleController { "reschedule plan rewritten with NoShuffle reschedule {:?}", no_shuffle_reschedule ); + + for noshuffle_downstream in no_shuffle_reschedule.keys() { + let fragment = fragment_map.get(noshuffle_downstream).unwrap(); + // SourceScan is always a NoShuffle downstream, rescheduled together with the upstream Source. + if (fragment.get_fragment_type_mask() & FragmentTypeFlag::SourceScan as u32) != 0 { + let stream_node = fragment.actor_template.nodes.as_ref().unwrap(); + if stream_node.find_source_backfill().is_some() { + stream_source_backfill_fragment_ids.insert(fragment.fragment_id); + } + } + } } // Modifications for NoShuffle downstream. 
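// Because a NoShuffle edge requires a one-to-one actor pairing, the plan rewriting above
// copies the upstream fragment's worker diff to every fragment reachable through NoShuffle
// edges — which is also how SourceScan (backfill) fragments get rescheduled together with
// their upstream Source fragment. A simplified, illustrative sketch of that breadth-first
// propagation (not the actual `ScaleController` code; all names below are made up):

use std::collections::{HashMap, HashSet, VecDeque};

type FragmentId = u32;
type WorkerDiff = HashMap<u32, i32>; // worker id -> actor count change

/// Copy the reschedule of a NoShuffle upstream fragment to every fragment reachable
/// from it through NoShuffle edges, keeping 1:1 actor alignment with the upstream.
fn propagate_no_shuffle(
    plan: &mut HashMap<FragmentId, WorkerDiff>,
    no_shuffle_downstreams: &HashMap<FragmentId, Vec<FragmentId>>,
    upstream: FragmentId,
) {
    let Some(diff) = plan.get(&upstream).cloned() else {
        return;
    };
    let mut queue: VecDeque<FragmentId> = no_shuffle_downstreams
        .get(&upstream)
        .cloned()
        .unwrap_or_default()
        .into();
    let mut visited: HashSet<FragmentId> = HashSet::from([upstream]);
    while let Some(fragment) = queue.pop_front() {
        if !visited.insert(fragment) {
            continue;
        }
        // The downstream must move exactly like its NoShuffle upstream.
        plan.insert(fragment, diff.clone());
        if let Some(next) = no_shuffle_downstreams.get(&fragment) {
            queue.extend(next.iter().copied());
        }
    }
}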
@@ -896,6 +918,7 @@ impl ScaleController { fragment_map, upstream_dispatchers, stream_source_fragment_ids, + stream_source_backfill_fragment_ids, no_shuffle_target_fragment_ids, no_shuffle_source_fragment_ids, fragment_dispatcher_map, @@ -922,9 +945,11 @@ impl ScaleController { HashMap, HashMap>, )> { + tracing::debug!("build_reschedule_context, reschedules: {:#?}", reschedules); let ctx = self .build_reschedule_context(&mut reschedules, options, table_parallelisms) .await?; + tracing::debug!("reschedule context: {:#?}", ctx); let reschedules = reschedules; // Here, the plan for both upstream and downstream of the NO_SHUFFLE Fragment should already have been populated. @@ -1262,9 +1287,9 @@ impl ScaleController { } } - // For stream source fragments, we need to reallocate the splits. + // For stream source & source backfill fragments, we need to reallocate the splits. // Because we are in the Pause state, so it's no problem to reallocate - let mut fragment_stream_source_actor_splits = HashMap::new(); + let mut fragment_actor_splits = HashMap::new(); for fragment_id in reschedules.keys() { let actors_after_reschedule = fragment_actors_after_reschedule.get(fragment_id).unwrap(); @@ -1282,13 +1307,51 @@ impl ScaleController { let actor_splits = self .source_manager - .migrate_splits(*fragment_id, &prev_actor_ids, &curr_actor_ids) + .migrate_splits_for_source_actors( + *fragment_id, + &prev_actor_ids, + &curr_actor_ids, + ) .await?; - fragment_stream_source_actor_splits.insert(*fragment_id, actor_splits); + tracing::debug!( + "source actor splits: {:?}, fragment_id: {}", + actor_splits, + fragment_id + ); + fragment_actor_splits.insert(*fragment_id, actor_splits); + } + } + // We use 2 iterations to make sure source actors are migrated first, and then align backfill actors + if !ctx.stream_source_backfill_fragment_ids.is_empty() { + for fragment_id in reschedules.keys() { + let actors_after_reschedule = + fragment_actors_after_reschedule.get(fragment_id).unwrap(); + + if ctx + .stream_source_backfill_fragment_ids + .contains(fragment_id) + { + let fragment = ctx.fragment_map.get(fragment_id).unwrap(); + + let curr_actor_ids = actors_after_reschedule.keys().cloned().collect_vec(); + + let actor_splits = self.source_manager.migrate_splits_for_backfill_actors( + *fragment_id, + &fragment.upstream_fragment_ids, + &curr_actor_ids, + &fragment_actor_splits, + &no_shuffle_upstream_actor_map, + )?; + tracing::debug!( + "source backfill actor splits: {:?}, fragment_id: {}", + actor_splits, + fragment_id + ); + fragment_actor_splits.insert(*fragment_id, actor_splits); + } } } - // TODO: support migrate splits for SourceBackfill // Generate fragment reschedule plan let mut reschedule_fragment: HashMap = @@ -1426,7 +1489,7 @@ impl ScaleController { let upstream_fragment_dispatcher_ids = upstream_fragment_dispatcher_set.into_iter().collect_vec(); - let actor_splits = fragment_stream_source_actor_splits + let actor_splits = fragment_actor_splits .get(&fragment_id) .cloned() .unwrap_or_default(); @@ -1477,6 +1540,8 @@ impl ScaleController { .pre_apply_reschedules(fragment_created_actors) .await; + tracing::debug!("analyze_reschedule_plan result: {:#?}", reschedule_fragment); + Ok((reschedule_fragment, applied_reschedules)) } @@ -1865,12 +1930,12 @@ impl ScaleController { actor_location: &mut HashMap, table_fragment_id_map: &mut HashMap>, fragment_actor_id_map: &mut HashMap>, - table_fragments: &BTreeMap, + all_table_fragments: &BTreeMap, ) -> MetaResult<()> { // This is only for assertion purposes and will 
be removed once the dispatcher_id is guaranteed to always correspond to the downstream fragment_id, // such as through the foreign key constraints in the SQL backend. let mut actor_fragment_id_map_for_check = HashMap::new(); - for table_fragments in table_fragments.values() { + for table_fragments in all_table_fragments.values() { for (fragment_id, fragment) in &table_fragments.fragments { for actor in &fragment.actors { let prev = @@ -1881,7 +1946,7 @@ impl ScaleController { } } - for (table_id, table_fragments) in table_fragments { + for (table_id, table_fragments) in all_table_fragments { for (fragment_id, fragment) in &table_fragments.fragments { for actor in &fragment.actors { fragment_actor_id_map @@ -1909,8 +1974,15 @@ impl ScaleController { dispatcher.dispatcher_id as FragmentId ); } else { + tracing::error!( + "downstream actor id {} from actor {} (fragment {}) not found in actor_fragment_id_map_for_check: {actor_fragment_id_map_for_check:?}\n\ndispatchers: {:#?}", + downstream_actor_id, + actor.actor_id, + actor.fragment_id, + actor.dispatcher + ); bail!( - "downstream actor id {} from actor {} not found in fragment_actor_id_map", + "downstream actor id {} from actor {} not found", downstream_actor_id, actor.actor_id, ); @@ -2027,6 +2099,17 @@ impl ScaleController { .await?; } } + tracing::debug!( + ?worker_ids, + ?table_parallelisms, + ?no_shuffle_source_fragment_ids, + ?no_shuffle_target_fragment_ids, + ?fragment_distribution_map, + ?actor_location, + ?table_fragment_id_map, + ?fragment_actor_id_map, + "generate_table_resize_plan, after build_index" + ); let mut target_plan = HashMap::new(); @@ -2147,7 +2230,10 @@ impl ScaleController { } target_plan.retain(|_, plan| !plan.worker_actor_diff.is_empty()); - + tracing::debug!( + ?target_plan, + "generate_table_resize_plan finished target_plan" + ); Ok(target_plan) } @@ -2378,6 +2464,7 @@ impl ScaleController { /// At present, for table level scaling, we use the strategy `TableResizePolicy`. /// Currently, this is used as an internal interface, so it won’t be included in Protobuf. +#[derive(Debug)] pub struct TableResizePolicy { pub(crate) worker_ids: BTreeSet, pub(crate) table_parallelisms: HashMap, diff --git a/src/meta/src/stream/source_manager.rs b/src/meta/src/stream/source_manager.rs index a383bfee8e46a..751ee92beebc1 100644 --- a/src/meta/src/stream/source_manager.rs +++ b/src/meta/src/stream/source_manager.rs @@ -231,7 +231,8 @@ pub struct SourceManagerCore { /// `source_id` -> `(fragment_id, upstream_fragment_id)` backfill_fragments: HashMap>, - /// Splits assigned per actor + /// Splits assigned per actor, + /// incl. both `Source` and `SourceBackfill`. actor_splits: HashMap>, } @@ -468,13 +469,13 @@ impl Default for SplitDiffOptions { } /// Reassigns splits if there are new splits or dropped splits, -/// i.e., `actor_splits` and `discovered_splits` differ. +/// i.e., `actor_splits` and `discovered_splits` differ, or actors are rescheduled. /// /// The existing splits will remain unmoved in their currently assigned actor. /// /// If an actor has an upstream actor, it should be a backfill executor, -/// and its splits should be aligned with the upstream actor. `reassign_splits` should not be used in this case. -/// Use `align_backfill_splits` instead. +/// and its splits should be aligned with the upstream actor. **`reassign_splits` should not be used in this case. 
+/// Use `align_backfill_splits` instead.** /// /// - `fragment_id`: just for logging /// @@ -790,11 +791,10 @@ impl SourceManager { /// Migrates splits from previous actors to the new actors for a rescheduled fragment. /// - /// Very occasionally split removal may happen - /// during scaling, in which case we need to use the old splits for reallocation instead of the - /// latest splits (which may be missing), so that we can resolve the split removal in the next - /// command. - pub async fn migrate_splits( + /// Very occasionally split removal may happen during scaling, in which case we need to + /// use the old splits for reallocation instead of the latest splits (which may be missing), + /// so that we can resolve the split removal in the next command. + pub async fn migrate_splits_for_source_actors( &self, fragment_id: FragmentId, prev_actor_ids: &[ActorId], @@ -817,7 +817,7 @@ impl SourceManager { fragment_id, empty_actor_splits, &prev_splits, - // pre-allocate splits is the first time getting splits and it does not have scale in scene + // pre-allocate splits is the first time getting splits and it does not have scale-in scene SplitDiffOptions::default(), ) .unwrap_or_default(); @@ -825,6 +825,43 @@ impl SourceManager { Ok(diff) } + /// Migrates splits from previous actors to the new actors for a rescheduled fragment. + pub fn migrate_splits_for_backfill_actors( + &self, + fragment_id: FragmentId, + upstream_fragment_ids: &Vec, + curr_actor_ids: &[ActorId], + fragment_actor_splits: &HashMap>>, + no_shuffle_upstream_actor_map: &HashMap>, + ) -> MetaResult>> { + // align splits for backfill fragments with its upstream source fragment + debug_assert!(upstream_fragment_ids.len() == 1); + let upstream_fragment_id = upstream_fragment_ids[0]; + let actors = no_shuffle_upstream_actor_map + .iter() + .filter(|(id, _)| curr_actor_ids.contains(id)) + .map(|(id, upstream_fragment_actors)| { + debug_assert!(upstream_fragment_actors.len() == 1); + ( + *id, + vec![*upstream_fragment_actors.get(&upstream_fragment_id).unwrap()], + ) + }); + let upstream_assignment = fragment_actor_splits.get(&upstream_fragment_id).unwrap(); + tracing::info!( + fragment_id, + upstream_fragment_id, + ?upstream_assignment, + "migrate_splits_for_backfill_actors" + ); + Ok(align_backfill_splits( + actors, + upstream_assignment, + fragment_id, + upstream_fragment_id, + )?) + } + /// Allocates splits to actors for a newly created source executor. pub async fn allocate_splits(&self, table_id: &TableId) -> MetaResult { let core = self.core.lock().await; diff --git a/src/tests/simulation/Cargo.toml b/src/tests/simulation/Cargo.toml index 8729207c0d025..c82f2b7d5911e 100644 --- a/src/tests/simulation/Cargo.toml +++ b/src/tests/simulation/Cargo.toml @@ -25,6 +25,7 @@ glob = "0.3" itertools = { workspace = true } lru = { workspace = true } madsim = "0.2.30" +maplit = "1" paste = "1" pin-project = "1.1" pretty_assertions = "1" diff --git a/src/tests/simulation/src/cluster.rs b/src/tests/simulation/src/cluster.rs index 26fdc3a8757e1..a9ffba0063562 100644 --- a/src/tests/simulation/src/cluster.rs +++ b/src/tests/simulation/src/cluster.rs @@ -158,27 +158,16 @@ impl Configuration { /// Provides a configuration for scale test which ensures that the arrangement backfill is disabled, /// so table scan will use `no_shuffle`. pub fn for_scale_no_shuffle() -> Self { - // Embed the config file and create a temporary file at runtime. The file will be deleted - // automatically when it's dropped. 
- let config_path = { - let mut file = - tempfile::NamedTempFile::new().expect("failed to create temp config file"); - file.write_all(include_bytes!("risingwave-scale.toml")) - .expect("failed to write config file"); - file.into_temp_path() - }; + let mut conf = Self::for_scale(); + conf.per_session_queries = + vec!["SET STREAMING_USE_ARRANGEMENT_BACKFILL = false;".into()].into(); + conf + } - Configuration { - config_path: ConfigPath::Temp(config_path.into()), - frontend_nodes: 2, - compute_nodes: 3, - meta_nodes: 3, - compactor_nodes: 2, - compute_node_cores: 2, - per_session_queries: vec!["SET STREAMING_USE_ARRANGEMENT_BACKFILL = false;".into()] - .into(), - ..Default::default() - } + pub fn for_scale_shared_source() -> Self { + let mut conf = Self::for_scale(); + conf.per_session_queries = vec!["SET RW_ENABLE_SHARED_SOURCE = true;".into()].into(); + conf } pub fn for_auto_parallelism( diff --git a/src/tests/simulation/src/ctl_ext.rs b/src/tests/simulation/src/ctl_ext.rs index 9b57673e49c16..3986a826e21e7 100644 --- a/src/tests/simulation/src/ctl_ext.rs +++ b/src/tests/simulation/src/ctl_ext.rs @@ -12,9 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![cfg_attr(not(madsim), expect(unused_imports))] - -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::ffi::OsString; use std::fmt::Write; use std::sync::Arc; @@ -23,17 +21,17 @@ use anyhow::{anyhow, Result}; use cfg_or_panic::cfg_or_panic; use clap::Parser; use itertools::Itertools; -use rand::seq::{IteratorRandom, SliceRandom}; +use rand::seq::IteratorRandom; use rand::{thread_rng, Rng}; use risingwave_common::catalog::TableId; use risingwave_common::hash::WorkerSlotId; +use risingwave_connector::source::{SplitImpl, SplitMetaData}; use risingwave_hummock_sdk::{CompactionGroupId, HummockSstableId}; use risingwave_pb::meta::table_fragments::fragment::FragmentDistributionType; use risingwave_pb::meta::table_fragments::PbFragment; use risingwave_pb::meta::update_worker_node_schedulability_request::Schedulability; use risingwave_pb::meta::GetClusterInfoResponse; use risingwave_pb::stream_plan::StreamNode; -use serde::de::IntoDeserializer; use self::predicate::BoxedPredicate; use crate::cluster::Cluster; @@ -76,7 +74,7 @@ pub mod predicate { Box::new(p) } - /// There exists operators whose identity contains `s` in the fragment. + /// There exists operators whose identity contains `s` in the fragment (case insensitive). 
pub fn identity_contains(s: impl Into) -> BoxedPredicate { let s: String = s.into(); let p = move |f: &PbFragment| { @@ -363,6 +361,30 @@ impl Cluster { Ok(response) } + /// `table_id -> actor_id -> splits` + pub async fn list_source_splits(&self) -> Result>> { + let info = self.get_cluster_info().await?; + let mut res = BTreeMap::new(); + + for table in info.table_fragments { + let mut table_actor_splits = BTreeMap::new(); + + for (actor_id, splits) in table.actor_splits { + let splits = splits + .splits + .iter() + .map(|split| SplitImpl::try_from(split).unwrap()) + .map(|split| split.id()) + .collect_vec() + .join(","); + table_actor_splits.insert(actor_id, splits); + } + res.insert(table.table_id, table_actor_splits); + } + + Ok(res) + } + // update node schedulability #[cfg_or_panic(madsim)] async fn update_worker_node_schedulability( diff --git a/src/tests/simulation/tests/integration_tests/scale/mod.rs b/src/tests/simulation/tests/integration_tests/scale/mod.rs index f6940f072409e..3c7a702dc6290 100644 --- a/src/tests/simulation/tests/integration_tests/scale/mod.rs +++ b/src/tests/simulation/tests/integration_tests/scale/mod.rs @@ -20,6 +20,7 @@ mod nexmark_q4; mod nexmark_source; mod no_shuffle; mod schedulability; +mod shared_source; mod singleton_migration; mod sink; mod streaming_parallelism; diff --git a/src/tests/simulation/tests/integration_tests/scale/shared_source.rs b/src/tests/simulation/tests/integration_tests/scale/shared_source.rs new file mode 100644 index 0000000000000..175b3a043100c --- /dev/null +++ b/src/tests/simulation/tests/integration_tests/scale/shared_source.rs @@ -0,0 +1,192 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::BTreeMap; + +use anyhow::Result; +use itertools::Itertools; +use maplit::{convert_args, hashmap}; +use risingwave_common::hash::WorkerSlotId; +use risingwave_pb::meta::table_fragments::Fragment; +use risingwave_simulation::cluster::{Cluster, Configuration}; +use risingwave_simulation::ctl_ext::predicate::{identity_contains, no_identity_contains}; + +const CREATE_SOURCE: &str = r#" +CREATE SOURCE s(v1 int, v2 varchar) WITH ( + connector='kafka', + properties.bootstrap.server='192.168.11.1:29092', + topic='shared_source' +) FORMAT PLAIN ENCODE JSON;"#; + +fn actor_upstream(fragment: &Fragment) -> Vec<(u32, Vec)> { + fragment + .actors + .iter() + .map(|actor| (actor.actor_id, actor.upstream_actor_id.clone())) + .collect_vec() +} + +async fn validate_splits_aligned(cluster: &mut Cluster) -> Result<()> { + let source_backfill_fragment = cluster + .locate_one_fragment([identity_contains("StreamSourceScan")]) + .await?; + // The result of scaling is non-deterministic. + // So we just print the result here, instead of asserting with a fixed value. 
+ let actor_upstream = actor_upstream(&source_backfill_fragment.inner); + tracing::info!( + "{}", + actor_upstream + .iter() + .format_with("\n", |(actor_id, upstream), f| f(&format_args!( + "{} <- {:?}", + actor_id, upstream + ))) + ); + let splits = cluster.list_source_splits().await?; + tracing::info!("{:#?}", splits); + let actor_splits: BTreeMap = splits + .values() + .flat_map(|m| m.clone().into_iter()) + .collect(); + for (actor, upstream) in actor_upstream { + assert!(upstream.len() == 1, "invalid upstream: {:?}", upstream); + let upstream_actor = upstream[0]; + assert_eq!( + actor_splits.get(&actor).unwrap(), + actor_splits.get(&upstream_actor).unwrap() + ); + } + Ok(()) +} + +#[tokio::test] +async fn test_shared_source() -> Result<()> { + tracing_subscriber::fmt::Subscriber::builder() + .with_max_level(tracing::Level::ERROR) + .with_env_filter("risingwave_stream::executor::source::source_backfill_executor=DEBUG,integration_tests=DEBUG") + .init(); + + let mut cluster = Cluster::start(Configuration::for_scale_shared_source()).await?; + cluster.create_kafka_topics(convert_args!(hashmap!( + "shared_source" => 4, + ))); + let mut session = cluster.start_session(); + + session.run("set rw_implicit_flush = true;").await?; + + session.run(CREATE_SOURCE).await?; + session + .run("create materialized view mv as select count(*) from s group by v1;") + .await?; + let source_fragment = cluster + .locate_one_fragment([ + identity_contains("Source"), + no_identity_contains("StreamSourceScan"), + ]) + .await?; + let source_workers = source_fragment.all_worker_count().into_keys().collect_vec(); + let source_backfill_fragment = cluster + .locate_one_fragment([identity_contains("StreamSourceScan")]) + .await?; + let source_backfill_workers = source_backfill_fragment + .all_worker_count() + .into_keys() + .collect_vec(); + let hash_agg_fragment = cluster + .locate_one_fragment([identity_contains("hashagg")]) + .await?; + let hash_agg_workers = hash_agg_fragment + .all_worker_count() + .into_keys() + .collect_vec(); + validate_splits_aligned(&mut cluster).await?; + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 6 + 2 3 HASH {4,3} {3} {MVIEW} 6 + 3 3 HASH {5} {1} {SOURCE_SCAN} 6"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + expect_test::expect![[r#" + 1 CREATED ADAPTIVE + 3 CREATED ADAPTIVE"#]] + .assert_eq(&cluster.run("select * from rw_table_fragments;").await?); + + // SourceBackfill cannot be scaled because of NoShuffle. + assert!( + &cluster + .reschedule( + source_backfill_fragment + .reschedule([WorkerSlotId::new(source_backfill_workers[0], 0)], []), + ) + .await.unwrap_err().to_string().contains("rescheduling NoShuffle downstream fragment (maybe Chain fragment) is forbidden, please use NoShuffle upstream fragment (like Materialized fragment) to scale"), + ); + + // hash agg can be scaled independently + cluster + .reschedule(hash_agg_fragment.reschedule([WorkerSlotId::new(hash_agg_workers[0], 0)], [])) + .await + .unwrap(); + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 6 + 2 3 HASH {4,3} {3} {MVIEW} 5 + 3 3 HASH {5} {1} {SOURCE_SCAN} 6"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + + // source is the NoShuffle upstream. It can be scaled, and the downstream SourceBackfill will be scaled together. 
+ cluster + .reschedule(source_fragment.reschedule( + [ + WorkerSlotId::new(source_workers[0], 0), + WorkerSlotId::new(source_workers[0], 1), + WorkerSlotId::new(source_workers[2], 0), + ], + [], + )) + .await + .unwrap(); + validate_splits_aligned(&mut cluster).await?; + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 3 + 2 3 HASH {4,3} {3} {MVIEW} 5 + 3 3 HASH {5} {1} {SOURCE_SCAN} 3"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + expect_test::expect![[r#" + 1 CREATED CUSTOM + 3 CREATED CUSTOM"#]] + .assert_eq(&cluster.run("select * from rw_table_fragments;").await?); + + // resolve_no_shuffle for backfill fragment is OK, which will scale the upstream together. + cluster + .reschedule_resolve_no_shuffle(source_backfill_fragment.reschedule( + [], + [ + WorkerSlotId::new(source_workers[0], 0), + WorkerSlotId::new(source_workers[0], 1), + WorkerSlotId::new(source_workers[2], 0), + WorkerSlotId::new(source_workers[2], 1), + ], + )) + .await + .unwrap(); + validate_splits_aligned(&mut cluster).await?; + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 7 + 2 3 HASH {4,3} {3} {MVIEW} 5 + 3 3 HASH {5} {1} {SOURCE_SCAN} 7"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + expect_test::expect![[r#" +1 CREATED CUSTOM +3 CREATED CUSTOM"#]] + .assert_eq(&cluster.run("select * from rw_table_fragments;").await?); + Ok(()) +} From 71753f11223cd0b4d67ba6baf7ac636fa246e7fc Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Fri, 6 Sep 2024 17:02:32 +0800 Subject: [PATCH 17/32] fix(meta): correctly update serving parallelism mapping (#18439) --- src/meta/src/serving/mod.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/meta/src/serving/mod.rs b/src/meta/src/serving/mod.rs index 69e17a978212e..86277c0f2a501 100644 --- a/src/meta/src/serving/mod.rs +++ b/src/meta/src/serving/mod.rs @@ -192,7 +192,16 @@ pub async fn start_serving_vnode_mapping_worker( continue; } let (workers, streaming_parallelisms) = fetch_serving_infos(&metadata_manager).await; - let (upserted, failed) = serving_vnode_mapping.upsert(streaming_parallelisms, &workers); + let filtered_streaming_parallelisms = fragment_ids.iter().filter_map(|frag_id|{ + match streaming_parallelisms.get(frag_id) { + Some(parallelism) => Some((*frag_id, *parallelism)), + None => { + tracing::warn!(fragment_id = *frag_id, "streaming parallelism not found"); + None + } + } + }).collect(); + let (upserted, failed) = serving_vnode_mapping.upsert(filtered_streaming_parallelisms, &workers); if !upserted.is_empty() { tracing::debug!("Update serving vnode mapping for fragments {:?}.", upserted.keys()); notification_manager.notify_frontend_without_version(Operation::Update, Info::ServingWorkerSlotMappings(FragmentWorkerSlotMappings{ mappings: to_fragment_worker_slot_mapping(&upserted) })); From becb896acd442d32eab4c14dccf558a4b346c643 Mon Sep 17 00:00:00 2001 From: Li0k Date: Sun, 8 Sep 2024 01:24:11 +0800 Subject: [PATCH 18/32] fix(iceberg): fix select empty iceberg table (#18449) --- ci/scripts/e2e-iceberg-sink-v2-test.sh | 1 + .../test_case/iceberg_select_empty_table.slt | 60 +++++++++++++++++++ .../test_case/iceberg_select_empty_table.toml | 11 ++++ src/connector/src/source/iceberg/mod.rs | 19 ++++-- 4 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 e2e_test/iceberg/test_case/iceberg_select_empty_table.slt create mode 100644 e2e_test/iceberg/test_case/iceberg_select_empty_table.toml diff --git 
a/ci/scripts/e2e-iceberg-sink-v2-test.sh b/ci/scripts/e2e-iceberg-sink-v2-test.sh index dd2f78037a5f2..1a46f30682bdd 100755 --- a/ci/scripts/e2e-iceberg-sink-v2-test.sh +++ b/ci/scripts/e2e-iceberg-sink-v2-test.sh @@ -45,6 +45,7 @@ poetry run python main.py -t ./test_case/partition_upsert.toml poetry run python main.py -t ./test_case/range_partition_append_only.toml poetry run python main.py -t ./test_case/range_partition_upsert.toml poetry run python main.py -t ./test_case/append_only_with_checkpoint_interval.toml +poetry run python main.py -t ./test_case/iceberg_select_empty_table.toml echo "--- Kill cluster" diff --git a/e2e_test/iceberg/test_case/iceberg_select_empty_table.slt b/e2e_test/iceberg/test_case/iceberg_select_empty_table.slt new file mode 100644 index 0000000000000..832a7b781f7fb --- /dev/null +++ b/e2e_test/iceberg/test_case/iceberg_select_empty_table.slt @@ -0,0 +1,60 @@ +statement ok +set sink_decouple = false; + +statement ok +set streaming_parallelism=4; + +statement ok +CREATE TABLE s1 (i1 int, i2 varchar, i3 varchar); + +statement ok +CREATE MATERIALIZED VIEW mv1 AS SELECT * FROM s1; + +statement ok +CREATE SINK sink1 AS select * from mv1 WITH ( + connector = 'iceberg', + type = 'append-only', + force_append_only = 'true', + database.name = 'demo_db', + table.name = 't1', + catalog.name = 'demo', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + commit_checkpoint_interval = 1, + create_table_if_not_exists = 'true' +); + +statement ok +CREATE SOURCE iceberg_t1_source +WITH ( + connector = 'iceberg', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + database.name = 'demo_db', + table.name = 't1', +); + +statement ok +flush; + +query I +select count(*) from iceberg_t1_source; +---- +0 + +statement ok +DROP SINK sink1; + +statement ok +DROP SOURCE iceberg_t1_source; + +statement ok +DROP TABLE s1 cascade; diff --git a/e2e_test/iceberg/test_case/iceberg_select_empty_table.toml b/e2e_test/iceberg/test_case/iceberg_select_empty_table.toml new file mode 100644 index 0000000000000..fa6eeff134c26 --- /dev/null +++ b/e2e_test/iceberg/test_case/iceberg_select_empty_table.toml @@ -0,0 +1,11 @@ +init_sqls = [ + 'CREATE SCHEMA IF NOT EXISTS demo_db', + 'DROP TABLE IF EXISTS demo_db.t1', +] + +slt = 'test_case/iceberg_select_empty_table.slt' + +drop_sqls = [ + 'DROP TABLE IF EXISTS demo_db.t1', + 'DROP SCHEMA IF EXISTS demo_db', +] diff --git a/src/connector/src/source/iceberg/mod.rs b/src/connector/src/source/iceberg/mod.rs index f101ff9ed6d4b..d65929faafba1 100644 --- a/src/connector/src/source/iceberg/mod.rs +++ b/src/connector/src/source/iceberg/mod.rs @@ -206,6 +206,17 @@ impl IcebergSplitEnumerator { bail!("Batch parallelism is 0. Cannot split the iceberg files."); } let table = self.config.load_table_v2().await?; + let current_snapshot = table.metadata().current_snapshot(); + if current_snapshot.is_none() { + // If there is no snapshot, we will return a mock `IcebergSplit` with empty files. 
+            return Ok(vec![IcebergSplit {
+                split_id: 0,
+                snapshot_id: 0, // unused
+                table_meta: TableMetadataJsonStr::serialize(table.metadata()),
+                files: vec![],
+            }]);
+        }
+
         let snapshot_id = match time_traval_info {
             Some(IcebergTimeTravelInfo::Version(version)) => {
                 let Some(snapshot) = table.metadata().snapshot_by_id(version) else {
@@ -232,10 +243,10 @@ impl IcebergSplitEnumerator {
                     }
                 }
             }
-            None => match table.metadata().current_snapshot() {
-                Some(snapshot) => snapshot.snapshot_id(),
-                None => bail!("Cannot find the current snapshot id in the iceberg table."),
-            },
+            None => {
+                assert!(current_snapshot.is_some());
+                current_snapshot.unwrap().snapshot_id()
+            }
         };
 
         let mut files = vec![];

From 9a03718accb10e75cc92b0d27c7ec58cdf0b0c64 Mon Sep 17 00:00:00 2001
From: stonepage <40830455+st1page@users.noreply.github.com>
Date: Mon, 9 Sep 2024 13:49:32 +0800
Subject: [PATCH 19/32] doc(readme): update architecture image (#18418)

Co-authored-by: WanYixian <150207222+WanYixian@users.noreply.github.com>
---
 README.md                                     |   2 +-
 docs/dev/src/images/architecture_20240908.png | Bin 0 -> 348502 bytes
 2 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 docs/dev/src/images/architecture_20240908.png

diff --git a/README.md b/README.md
index 7128dccede28b..4c0e043b71513 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ RisingWave is a Postgres-compatible SQL database engineered to provide the
 ingest millions of events per second, continuously join and analyze live data streams with historical tables, serve ad-hoc queries in real-time, and deliver fresh, consistent results wherever needed.
 
-![RisingWave](./docs/dev/src/images/architecture_20240814.png)
+![RisingWave](./docs/dev/src/images/architecture_20240908.png)
 
 ## Try it out in 60 seconds
 
diff --git a/docs/dev/src/images/architecture_20240908.png b/docs/dev/src/images/architecture_20240908.png
new file mode 100644
index 0000000000000000000000000000000000000000..40ba8b8174c689afb53e1ce9101bbdb20b7bc853
GIT binary patch
literal 348502