From b251b4c5b2d2d90ab0f6fd1cafef0556191f0c46 Mon Sep 17 00:00:00 2001
From: Shanicky Chen
Date: Tue, 5 Mar 2024 18:25:36 +0800
Subject: [PATCH 1/2] feat: execute auto-scaling in batches (#15420)

---
 src/common/src/config.rs            |  24 ++++
 src/config/docs.md                  | 155 ++++++++++++++++++++++
 src/config/example.toml             |   3 +
 src/meta/node/src/lib.rs            |   7 ++
 src/meta/src/barrier/command.rs     |   2 +-
 src/meta/src/barrier/recovery.rs    |   4 +-
 src/meta/src/manager/env.rs         |   9 ++
 src/meta/src/stream/scale.rs        | 166 ++++++++++++++++------
 src/tests/simulation/src/cluster.rs |   3 +
 9 files changed, 312 insertions(+), 61 deletions(-)
 create mode 100644 src/config/docs.md

diff --git a/src/common/src/config.rs b/src/common/src/config.rs
index ea4354f9b23ad..2bae32def8d30 100644
--- a/src/common/src/config.rs
+++ b/src/common/src/config.rs
@@ -229,6 +229,18 @@ pub struct MetaConfig {
     #[serde(default)]
     pub disable_automatic_parallelism_control: bool,
 
+    /// The number of streaming jobs per scaling operation.
+    #[serde(default = "default::meta::parallelism_control_batch_size")]
+    pub parallelism_control_batch_size: usize,
+
+    /// The period of parallelism control trigger.
+    #[serde(default = "default::meta::parallelism_control_trigger_period_sec")]
+    pub parallelism_control_trigger_period_sec: u64,
+
+    /// The first delay of parallelism control.
+    #[serde(default = "default::meta::parallelism_control_trigger_first_delay_sec")]
+    pub parallelism_control_trigger_first_delay_sec: u64,
+
     #[serde(default = "default::meta::meta_leader_lease_secs")]
     pub meta_leader_lease_secs: u64,
 
@@ -1103,6 +1115,18 @@ pub mod default {
         pub fn event_log_channel_max_size() -> u32 {
             10
         }
+
+        pub fn parallelism_control_batch_size() -> usize {
+            10
+        }
+
+        pub fn parallelism_control_trigger_period_sec() -> u64 {
+            10
+        }
+
+        pub fn parallelism_control_trigger_first_delay_sec() -> u64 {
+            30
+        }
     }
 
     pub mod server {
diff --git a/src/config/docs.md b/src/config/docs.md
new file mode 100644
index 0000000000000..cf28f0ca9e2c6
--- /dev/null
+++ b/src/config/docs.md
@@ -0,0 +1,155 @@
+# RisingWave System Configurations
+
+This page is automatically generated by `./risedev generate-example-config`
+
+## batch
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| distributed_query_limit | | |
+| enable_barrier_read | | false |
+| frontend_compute_runtime_worker_threads | frontend compute runtime worker threads | 4 |
+| statement_timeout_in_sec | Timeout for a batch query in seconds. | 3600 |
+| worker_threads_num | The thread number of the batch task runtime in the compute node. The default value is decided by `tokio`. | |
+
+## meta
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| backend | | "Mem" |
+| collect_gc_watermark_spin_interval_sec | The spin interval when collecting global GC watermark in hummock. | 5 |
+| compaction_task_max_heartbeat_interval_secs | | 30 |
+| compaction_task_max_progress_interval_secs | | 600 |
+| cut_table_size_limit | | 1073741824 |
+| dangerous_max_idle_secs | After specified seconds of idle (no mview or flush), the process will be exited. It is mainly useful for playgrounds. | |
+| default_parallelism | The default global parallelism for all streaming jobs, if user doesn't specify the parallelism, this value will be used. `FULL` means use all available parallelism units, otherwise it's a number. | "Full" |
+| disable_automatic_parallelism_control | Whether to disable adaptive-scaling feature. | false |
+| disable_recovery | Whether to enable fail-on-recovery. Should only be used in e2e tests. | false |
+| do_not_config_object_storage_lifecycle | Whether config object storage bucket lifecycle to purge stale data. | false |
+| enable_committed_sst_sanity_check | Enable sanity check when SSTs are committed. | false |
+| enable_compaction_deterministic | Whether to enable deterministic compaction scheduling, which will disable all auto scheduling of compaction tasks. Should only be used in e2e tests. | false |
+| enable_hummock_data_archive | If enabled, SSTable object file and version delta will be retained. SSTable object file need to be deleted via full GC. version delta need to be manually deleted. | false |
+| event_log_channel_max_size | Keeps the latest N events per channel. | 10 |
+| event_log_enabled | | true |
+| full_gc_interval_sec | Interval of automatic hummock full GC. | 86400 |
+| hummock_version_checkpoint_interval_sec | Interval of hummock version checkpoint. | 30 |
+| hybird_partition_vnode_count | | 4 |
+| max_heartbeat_interval_secs | Maximum allowed heartbeat interval in seconds. | 300 |
+| meta_leader_lease_secs | | 30 |
+| min_delta_log_num_for_hummock_version_checkpoint | The minimum delta log number a new checkpoint should compact, otherwise the checkpoint attempt is rejected. | 10 |
+| min_sst_retention_time_sec | Objects within `min_sst_retention_time_sec` won't be deleted by hummock full GC, even they are dangling. | 86400 |
+| min_table_split_write_throughput | If the size of one table is smaller than `min_table_split_write_throughput`, we would not split it to an single group. | 4194304 |
+| move_table_size_limit | | 10737418240 |
+| node_num_monitor_interval_sec | | 10 |
+| parallelism_control_batch_size | The number of streaming jobs per scaling operation. | 10 |
+| parallelism_control_trigger_first_delay_sec | The first delay of parallelism control. | 30 |
+| parallelism_control_trigger_period_sec | The period of parallelism control trigger. | 10 |
+| partition_vnode_count | | 16 |
+| periodic_compaction_interval_sec | Schedule compaction for all compaction groups with this interval. | 60 |
+| periodic_space_reclaim_compaction_interval_sec | Schedule space_reclaim compaction for all compaction groups with this interval. | 3600 |
+| periodic_split_compact_group_interval_sec | | 10 |
+| periodic_tombstone_reclaim_compaction_interval_sec | | 600 |
+| periodic_ttl_reclaim_compaction_interval_sec | Schedule ttl_reclaim compaction for all compaction groups with this interval. | 1800 |
+| split_group_size_limit | | 68719476736 |
+| table_write_throughput_threshold | | 16777216 |
+| unrecognized | | |
+| vacuum_interval_sec | Interval of invoking a vacuum job, to remove stale metadata from meta store and objects from object store. | 30 |
+| vacuum_spin_interval_ms | The spin interval inside a vacuum job. It avoids the vacuum job monopolizing resources of meta node. | 10 |
+
+## meta.compaction_config
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| compaction_filter_mask | | 6 |
+| enable_emergency_picker | | true |
+| level0_max_compact_file_number | | 100 |
+| level0_overlapping_sub_level_compact_level_count | | 12 |
+| level0_stop_write_threshold_sub_level_number | | 300 |
+| level0_sub_level_compact_level_count | | 3 |
+| level0_tier_compact_file_number | | 12 |
+| max_bytes_for_level_base | | 536870912 |
+| max_bytes_for_level_multiplier | | 5 |
+| max_compaction_bytes | | 2147483648 |
+| max_space_reclaim_bytes | | 536870912 |
+| max_sub_compaction | | 4 |
+| sub_level_max_compaction_bytes | | 134217728 |
+| target_file_size_base | | 33554432 |
+| tombstone_reclaim_ratio | | 40 |
+
+## server
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| connection_pool_size | | 16 |
+| grpc_max_reset_stream | | 200 |
+| heap_profiling | Enable heap profile dump when memory usage is high. | |
+| heartbeat_interval_ms | The interval for periodic heartbeat from worker to the meta service. | 1000 |
+| metrics_level | Used for control the metrics level, similar to log level. | "Info" |
+| telemetry_enabled | | true |
+
+## storage
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| block_cache_capacity_mb | Capacity of sstable block cache. | |
+| cache_refill | | |
+| check_compaction_result | | false |
+| compact_iter_recreate_timeout_ms | | 600000 |
+| compactor_fast_max_compact_delete_ratio | | 40 |
+| compactor_fast_max_compact_task_size | | 2147483648 |
+| compactor_max_sst_key_count | | 2097152 |
+| compactor_max_sst_size | | 536870912 |
+| compactor_max_task_multiplier | Compactor calculates the maximum number of tasks that can be executed on the node based on worker_num and compactor_max_task_multiplier. max_pull_task_count = worker_num * compactor_max_task_multiplier | 2.5 |
+| compactor_memory_available_proportion | The percentage of memory available when compactor is deployed separately. non_reserved_memory_bytes = system_memory_available_bytes * compactor_memory_available_proportion | 0.8 |
+| compactor_memory_limit_mb | | |
+| data_file_cache | | |
+| disable_remote_compactor | | false |
+| enable_fast_compaction | | false |
+| high_priority_ratio_in_percent | | |
+| imm_merge_threshold | The threshold for the number of immutable memtables to merge to a new imm. | 0 |
+| max_concurrent_compaction_task_number | | 16 |
+| max_prefetch_block_number | max prefetch block number | 16 |
+| max_preload_io_retry_times | | 3 |
+| max_preload_wait_time_mill | | 0 |
+| max_sub_compaction | Max sub compaction task numbers | 4 |
+| max_version_pinning_duration_sec | | 10800 |
+| mem_table_spill_threshold | The spill threshold for mem table. | 4194304 |
+| meta_cache_capacity_mb | Capacity of sstable meta cache. | |
+| meta_file_cache | | |
+| min_sst_size_for_streaming_upload | Whether to enable streaming upload for sstable. | 33554432 |
+| object_store | | |
+| prefetch_buffer_capacity_mb | max memory usage for large query | |
+| share_buffer_compaction_worker_threads_number | Worker threads number of dedicated tokio runtime for share buffer compaction. 0 means use tokio's default value (number of CPU core). | 4 |
+| share_buffer_upload_concurrency | Number of tasks shared buffer can upload in parallel. | 8 |
+| share_buffers_sync_parallelism | parallelism while syncing share buffers into L0 SST. Should NOT be 0. | 1 |
+| shared_buffer_capacity_mb | Maximum shared buffer size, writes attempting to exceed the capacity will stall until there is enough space. | |
+| shared_buffer_flush_ratio | The shared buffer will start flushing data to object when the ratio of memory usage to the shared buffer capacity exceed such ratio. | 0.800000011920929 |
+| sstable_id_remote_fetch_number | Number of SST ids fetched from meta per RPC | 10 |
+| write_conflict_detection_enabled | Whether to enable write conflict detection | true |
+
+## streaming
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| actor_runtime_worker_threads_num | The thread number of the streaming actor runtime in the compute node. The default value is decided by `tokio`. | |
+| async_stack_trace | Enable async stack tracing through `await-tree` for risectl. | "ReleaseVerbose" |
+| in_flight_barrier_nums | The maximum number of barriers in-flight in the compute nodes. | 10000 |
+| unique_user_stream_errors | Max unique user stream errors per actor | 10 |
+
+## system
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| backup_storage_directory | Remote directory for storing snapshots. | |
+| backup_storage_url | Remote storage url for storing snapshots. | |
+| barrier_interval_ms | The interval of periodic barrier. | 1000 |
+| block_size_kb | Size of each block in bytes in SST. | 64 |
+| bloom_false_positive | False positive probability of bloom filter. | 0.001 |
+| checkpoint_frequency | There will be a checkpoint for every n barriers. | 1 |
+| data_directory | Remote directory for storing data and metadata objects. | |
+| enable_tracing | Whether to enable distributed tracing. | false |
+| max_concurrent_creating_streaming_jobs | Max number of concurrent creating streaming jobs. | 1 |
+| parallel_compact_size_mb | | 512 |
+| pause_on_next_bootstrap | Whether to pause all data sources on next bootstrap. | false |
+| sstable_size_mb | Target size of the Sstable. | 256 |
+| state_store | | |
diff --git a/src/config/example.toml b/src/config/example.toml
index 6a6314d7832d2..954a9c0f12e38 100644
--- a/src/config/example.toml
+++ b/src/config/example.toml
@@ -26,6 +26,9 @@ min_delta_log_num_for_hummock_version_checkpoint = 10
 max_heartbeat_interval_secs = 300
 disable_recovery = false
 disable_automatic_parallelism_control = false
+parallelism_control_batch_size = 10
+parallelism_control_trigger_period_sec = 10
+parallelism_control_trigger_first_delay_sec = 30
 meta_leader_lease_secs = 30
 default_parallelism = "Full"
 enable_compaction_deterministic = false
diff --git a/src/meta/node/src/lib.rs b/src/meta/node/src/lib.rs
index 867e2409c178f..b7f290ecff469 100644
--- a/src/meta/node/src/lib.rs
+++ b/src/meta/node/src/lib.rs
@@ -281,6 +281,13 @@ pub fn start(opts: MetaNodeOpts) -> Pin<Box<dyn Future<Output = ()> + Send>> {
                 disable_automatic_parallelism_control: config
                     .meta
                     .disable_automatic_parallelism_control,
+                parallelism_control_batch_size: config.meta.parallelism_control_batch_size,
+                parallelism_control_trigger_period_sec: config
+                    .meta
+                    .parallelism_control_trigger_period_sec,
+                parallelism_control_trigger_first_delay_sec: config
+                    .meta
+                    .parallelism_control_trigger_first_delay_sec,
                 in_flight_barrier_nums,
                 max_idle_ms,
                 compaction_deterministic_test: config.meta.enable_compaction_deterministic,
diff --git a/src/meta/src/barrier/command.rs b/src/meta/src/barrier/command.rs
index 6b1b73d6ca697..86cc3fa078784 100644
--- a/src/meta/src/barrier/command.rs
+++ b/src/meta/src/barrier/command.rs
@@ -615,7 +615,7 @@ impl CommandContext {
                     actor_splits,
                     actor_new_dispatchers,
                 });
-                tracing::debug!("update mutation: {mutation:#?}");
+                tracing::debug!("update mutation: {mutation:?}");
                 Some(mutation)
             }
         };
diff --git a/src/meta/src/barrier/recovery.rs b/src/meta/src/barrier/recovery.rs
index 1208d2c49b58d..824cc8b0c090e 100644
--- a/src/meta/src/barrier/recovery.rs
+++ b/src/meta/src/barrier/recovery.rs
@@ -695,7 +695,7 @@ impl GlobalBarrierManagerContext {
             return Err(e);
         }
 
-        debug!("scaling-in actors succeed.");
+        debug!("scaling actors succeed.");
         Ok(())
     }
 
@@ -833,7 +833,7 @@ impl GlobalBarrierManagerContext {
             return Err(e);
         }
 
-        debug!("scaling-in actors succeed.");
+        debug!("scaling actors succeed.");
         Ok(())
     }
 
diff --git a/src/meta/src/manager/env.rs b/src/meta/src/manager/env.rs
index f10138b56e81e..ea26fea83ebcc 100644
--- a/src/meta/src/manager/env.rs
+++ b/src/meta/src/manager/env.rs
@@ -93,6 +93,12 @@ pub struct MetaOpts {
     pub enable_recovery: bool,
     /// Whether to disable the auto-scaling feature.
     pub disable_automatic_parallelism_control: bool,
+    /// The number of streaming jobs per scaling operation.
+    pub parallelism_control_batch_size: usize,
+    /// The period of parallelism control trigger.
+    pub parallelism_control_trigger_period_sec: u64,
+    /// The first delay of parallelism control.
+    pub parallelism_control_trigger_first_delay_sec: u64,
     /// The maximum number of barriers in-flight in the compute nodes.
     pub in_flight_barrier_nums: usize,
     /// After specified seconds of idle (no mview or flush), the process will be exited.
@@ -221,6 +227,9 @@ impl MetaOpts {
         Self {
             enable_recovery,
             disable_automatic_parallelism_control: false,
+            parallelism_control_batch_size: 1,
+            parallelism_control_trigger_period_sec: 10,
+            parallelism_control_trigger_first_delay_sec: 30,
             in_flight_barrier_nums: 40,
             max_idle_ms: 0,
             compaction_deterministic_test: false,
diff --git a/src/meta/src/stream/scale.rs b/src/meta/src/stream/scale.rs
index 0e571a0afebf7..3b65c73d059cc 100644
--- a/src/meta/src/stream/scale.rs
+++ b/src/meta/src/stream/scale.rs
@@ -31,7 +31,9 @@ use risingwave_common::catalog::TableId;
 use risingwave_common::hash::{ActorMapping, ParallelUnitId, VirtualNode};
 use risingwave_common::util::iter_util::ZipEqDebug;
 use risingwave_meta_model_v2::StreamingParallelism;
-use risingwave_pb::common::{ActorInfo, Buffer, ParallelUnit, ParallelUnitMapping, WorkerNode};
+use risingwave_pb::common::{
+    ActorInfo, Buffer, ParallelUnit, ParallelUnitMapping, WorkerNode, WorkerType,
+};
 use risingwave_pb::meta::get_reschedule_plan_request::{Policy, StableResizePolicy};
 use risingwave_pb::meta::subscribe_response::{Info, Operation};
 use risingwave_pb::meta::table_fragments::actor_status::ActorState;
@@ -48,7 +50,7 @@ use thiserror_ext::AsReport;
 use tokio::sync::oneshot::Receiver;
 use tokio::sync::{oneshot, RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::task::JoinHandle;
-use tokio::time::MissedTickBehavior;
+use tokio::time::{Instant, MissedTickBehavior};
 
 use crate::barrier::{Command, Reschedule, StreamRpcManager};
 use crate::manager::{IdCategory, LocalNotification, MetaSrvEnv, MetadataManager, WorkerId};
@@ -2676,12 +2678,14 @@ impl GlobalStreamManager {
         Ok(())
     }
 
-    async fn trigger_parallelism_control(&self) -> MetaResult<()> {
+    async fn trigger_parallelism_control(&self) -> MetaResult<bool> {
+        tracing::info!("trigger parallelism control");
+
         let _reschedule_job_lock = self.reschedule_lock_write_guard().await;
 
-        match &self.metadata_manager {
+        let (schedulable_worker_ids, table_parallelisms) = match &self.metadata_manager {
             MetadataManager::V1(mgr) => {
-                let table_parallelisms = {
+                let table_parallelisms: HashMap<TableId, TableParallelism> = {
                     let guard = mgr.fragment_manager.get_fragment_read_guard().await;
 
                     guard
@@ -2697,7 +2701,7 @@ impl GlobalStreamManager {
                     .list_active_streaming_compute_nodes()
                     .await;
 
-                let schedulable_worker_ids = workers
+                let schedulable_worker_ids: BTreeSet<_> = workers
                     .iter()
                     .filter(|worker| {
                         !worker
@@ -2709,26 +2713,7 @@ impl GlobalStreamManager {
                     .map(|worker| worker.id)
                     .collect();
 
-                let reschedules = self
-                    .scale_controller
-                    .generate_table_resize_plan(TableResizePolicy {
-                        worker_ids: schedulable_worker_ids,
-                        table_parallelisms,
-                    })
-                    .await?;
-
-                if reschedules.is_empty() {
-                    return Ok(());
-                }
-
-                self.reschedule_actors(
-                    reschedules,
-                    RescheduleOptions {
-                        resolve_no_shuffle_upstream: true,
-                    },
-                    None,
-                )
-                .await?;
+                (schedulable_worker_ids, table_parallelisms)
             }
             MetadataManager::V2(mgr) => {
                 let table_parallelisms: HashMap<_, _> = {
@@ -2768,33 +2753,90 @@ impl GlobalStreamManager {
                    .map(|worker| worker.id)
                    .collect();
 
-                let reschedules = self
-                    .scale_controller
-                    .generate_table_resize_plan(TableResizePolicy {
-                        worker_ids: schedulable_worker_ids,
-                        table_parallelisms: table_parallelisms.clone(),
-                    })
-                    .await?;
+                (schedulable_worker_ids, table_parallelisms)
             }
         };
 
-        if reschedules.is_empty() {
-            return Ok(());
-        }
+        if table_parallelisms.is_empty() {
+            tracing::info!("no streaming jobs for scaling, maybe an empty cluster");
+            return Ok(false);
+        }
 
-        self.reschedule_actors(
-            reschedules,
-            RescheduleOptions {
-                resolve_no_shuffle_upstream: true,
-            },
-            None,
-        )
+        let batch_size = match self.env.opts.parallelism_control_batch_size {
+            0 => table_parallelisms.len(),
+            n => n,
+        };
+
+        tracing::info!(
+            "total {} streaming jobs, batch size {}, schedulable worker ids: {:?}",
+            table_parallelisms.len(),
+            batch_size,
+            schedulable_worker_ids
+        );
+
+        let batches: Vec<_> = table_parallelisms
+            .into_iter()
+            .chunks(batch_size)
+            .into_iter()
+            .map(|chunk| chunk.collect_vec())
+            .collect();
+
+        let mut reschedules = None;
+
+        for batch in batches {
+            let parallelisms: HashMap<_, _> = batch.into_iter().collect();
+
+            let plan = self
+                .scale_controller
+                .generate_table_resize_plan(TableResizePolicy {
+                    worker_ids: schedulable_worker_ids.clone(),
+                    table_parallelisms: parallelisms.clone(),
+                })
                .await?;
+
+            if !plan.is_empty() {
+                tracing::info!(
+                    "reschedule plan generated for streaming jobs {:?}",
+                    parallelisms
+                );
+                reschedules = Some(plan);
+                break;
             }
         }
 
-        Ok(())
+        let Some(reschedules) = reschedules else {
+            tracing::info!("no reschedule plan generated");
+            return Ok(false);
+        };
+
+        self.reschedule_actors(
+            reschedules,
+            RescheduleOptions {
+                resolve_no_shuffle_upstream: false,
+            },
+            None,
+        )
+        .await?;
+
+        Ok(true)
     }
 
     async fn run(&self, mut shutdown_rx: Receiver<()>) {
+        tracing::info!("starting automatic parallelism control monitor");
+
+        let check_period =
+            Duration::from_secs(self.env.opts.parallelism_control_trigger_period_sec);
+
+        let mut ticker = tokio::time::interval_at(
+            Instant::now()
+                + Duration::from_secs(self.env.opts.parallelism_control_trigger_first_delay_sec),
+            check_period,
+        );
+        ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
+
+        // waiting for first tick
+        ticker.tick().await;
+
         let (local_notification_tx, mut local_notification_rx) =
             tokio::sync::mpsc::unbounded_channel();
 
@@ -2803,11 +2845,6 @@ impl GlobalStreamManager {
             .insert_local_sender(local_notification_tx)
             .await;
 
-        let check_period = Duration::from_secs(10);
-        let mut ticker = tokio::time::interval(check_period);
-        ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
-        ticker.reset();
-
         let worker_nodes = self
             .metadata_manager
             .list_active_streaming_compute_nodes()
@@ -2819,7 +2856,7 @@ impl GlobalStreamManager {
             .map(|worker| (worker.id, worker))
             .collect();
 
-        let mut changed = true;
+        let mut should_trigger = false;
 
         loop {
            tokio::select! {
@@ -2830,18 +2867,18 @@ impl GlobalStreamManager {
                     break;
                 }
 
-                _ = ticker.tick(), if changed => {
+                _ = ticker.tick(), if should_trigger => {
                     let include_workers = worker_cache.keys().copied().collect_vec();
 
                     if include_workers.is_empty() {
                         tracing::debug!("no available worker nodes");
-                        changed = false;
+                        should_trigger = false;
                         continue;
                     }
 
                     match self.trigger_parallelism_control().await {
-                        Ok(_) => {
-                            changed = false;
+                        Ok(cont) => {
+                            should_trigger = cont;
                         }
                         Err(e) => {
                             tracing::warn!(error = %e.as_report(), "Failed to trigger scale out, waiting for next tick to retry after {}s", ticker.period().as_secs());
@@ -2855,13 +2892,26 @@ impl GlobalStreamManager {
 
                     match notification {
                         LocalNotification::WorkerNodeActivated(worker) => {
+                            match (worker.get_type(), worker.property.as_ref()) {
+                                (Ok(WorkerType::ComputeNode), Some(prop)) if prop.is_streaming => {
+                                    tracing::info!("worker {} activated notification received", worker.id);
+                                }
+                                _ => continue
+                            }
+
                             let prev_worker = worker_cache.insert(worker.id, worker.clone());
 
-                            if let Some(prev_worker) = prev_worker && prev_worker.parallel_units != worker.parallel_units {
-                                tracing::info!(worker = worker.id, "worker parallelism changed");
+                            match prev_worker {
+                                Some(prev_worker) if prev_worker.parallel_units != worker.parallel_units => {
+                                    tracing::info!(worker = worker.id, "worker parallelism changed");
+                                    should_trigger = true;
+                                }
+                                None => {
+                                    tracing::info!(worker = worker.id, "new worker joined");
+                                    should_trigger = true;
+                                }
+                                _ => {}
                             }
-
-                            changed = true;
                         }
 
                         // Since our logic for handling passive scale-in is within the barrier manager,
diff --git a/src/tests/simulation/src/cluster.rs b/src/tests/simulation/src/cluster.rs
index b4178e01f8786..f3d2f0559e998 100644
--- a/src/tests/simulation/src/cluster.rs
+++ b/src/tests/simulation/src/cluster.rs
@@ -189,6 +189,9 @@ impl Configuration {
             r#"[meta]
 max_heartbeat_interval_secs = {max_heartbeat_interval_secs}
 disable_automatic_parallelism_control = {disable_automatic_parallelism_control}
+parallelism_control_trigger_first_delay_sec = 0
+parallelism_control_batch_size = 0
+parallelism_control_trigger_period_sec = 10
 [system]
 barrier_interval_ms = 250

From df5b5e9b1efb645d77cb5cdd9719b4e5438c7891 Mon Sep 17 00:00:00 2001
From: Shanicky Chen
Date: Thu, 7 Mar 2024 13:18:25 +0800
Subject: [PATCH 2/2] remove doc

Signed-off-by: Shanicky Chen
---
 src/config/docs.md | 155 ---------------------------------------------
 1 file changed, 155 deletions(-)
 delete mode 100644 src/config/docs.md

diff --git a/src/config/docs.md b/src/config/docs.md
deleted file mode 100644
index cf28f0ca9e2c6..0000000000000
--- a/src/config/docs.md
+++ /dev/null
@@ -1,155 +0,0 @@
-# RisingWave System Configurations
-
-This page is automatically generated by `./risedev generate-example-config`
-
-## batch
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| distributed_query_limit | | |
-| enable_barrier_read | | false |
-| frontend_compute_runtime_worker_threads | frontend compute runtime worker threads | 4 |
-| statement_timeout_in_sec | Timeout for a batch query in seconds. | 3600 |
-| worker_threads_num | The thread number of the batch task runtime in the compute node. The default value is decided by `tokio`. | |
-
-## meta
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| backend | | "Mem" |
-| collect_gc_watermark_spin_interval_sec | The spin interval when collecting global GC watermark in hummock. | 5 |
-| compaction_task_max_heartbeat_interval_secs | | 30 |
-| compaction_task_max_progress_interval_secs | | 600 |
-| cut_table_size_limit | | 1073741824 |
-| dangerous_max_idle_secs | After specified seconds of idle (no mview or flush), the process will be exited. It is mainly useful for playgrounds. | |
-| default_parallelism | The default global parallelism for all streaming jobs, if user doesn't specify the parallelism, this value will be used. `FULL` means use all available parallelism units, otherwise it's a number. | "Full" |
-| disable_automatic_parallelism_control | Whether to disable adaptive-scaling feature. | false |
-| disable_recovery | Whether to enable fail-on-recovery. Should only be used in e2e tests. | false |
-| do_not_config_object_storage_lifecycle | Whether config object storage bucket lifecycle to purge stale data. | false |
-| enable_committed_sst_sanity_check | Enable sanity check when SSTs are committed. | false |
-| enable_compaction_deterministic | Whether to enable deterministic compaction scheduling, which will disable all auto scheduling of compaction tasks. Should only be used in e2e tests. | false |
-| enable_hummock_data_archive | If enabled, SSTable object file and version delta will be retained. SSTable object file need to be deleted via full GC. version delta need to be manually deleted. | false |
-| event_log_channel_max_size | Keeps the latest N events per channel. | 10 |
-| event_log_enabled | | true |
-| full_gc_interval_sec | Interval of automatic hummock full GC. | 86400 |
-| hummock_version_checkpoint_interval_sec | Interval of hummock version checkpoint. | 30 |
-| hybird_partition_vnode_count | | 4 |
-| max_heartbeat_interval_secs | Maximum allowed heartbeat interval in seconds. | 300 |
-| meta_leader_lease_secs | | 30 |
-| min_delta_log_num_for_hummock_version_checkpoint | The minimum delta log number a new checkpoint should compact, otherwise the checkpoint attempt is rejected. | 10 |
-| min_sst_retention_time_sec | Objects within `min_sst_retention_time_sec` won't be deleted by hummock full GC, even they are dangling. | 86400 |
-| min_table_split_write_throughput | If the size of one table is smaller than `min_table_split_write_throughput`, we would not split it to an single group. | 4194304 |
-| move_table_size_limit | | 10737418240 |
-| node_num_monitor_interval_sec | | 10 |
-| parallelism_control_batch_size | The number of streaming jobs per scaling operation. | 10 |
-| parallelism_control_trigger_first_delay_sec | The first delay of parallelism control. | 30 |
-| parallelism_control_trigger_period_sec | The period of parallelism control trigger. | 10 |
-| partition_vnode_count | | 16 |
-| periodic_compaction_interval_sec | Schedule compaction for all compaction groups with this interval. | 60 |
-| periodic_space_reclaim_compaction_interval_sec | Schedule space_reclaim compaction for all compaction groups with this interval. | 3600 |
-| periodic_split_compact_group_interval_sec | | 10 |
-| periodic_tombstone_reclaim_compaction_interval_sec | | 600 |
-| periodic_ttl_reclaim_compaction_interval_sec | Schedule ttl_reclaim compaction for all compaction groups with this interval. | 1800 |
-| split_group_size_limit | | 68719476736 |
-| table_write_throughput_threshold | | 16777216 |
-| unrecognized | | |
-| vacuum_interval_sec | Interval of invoking a vacuum job, to remove stale metadata from meta store and objects from object store. | 30 |
-| vacuum_spin_interval_ms | The spin interval inside a vacuum job. It avoids the vacuum job monopolizing resources of meta node. | 10 |
-
-## meta.compaction_config
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| compaction_filter_mask | | 6 |
-| enable_emergency_picker | | true |
-| level0_max_compact_file_number | | 100 |
-| level0_overlapping_sub_level_compact_level_count | | 12 |
-| level0_stop_write_threshold_sub_level_number | | 300 |
-| level0_sub_level_compact_level_count | | 3 |
-| level0_tier_compact_file_number | | 12 |
-| max_bytes_for_level_base | | 536870912 |
-| max_bytes_for_level_multiplier | | 5 |
-| max_compaction_bytes | | 2147483648 |
-| max_space_reclaim_bytes | | 536870912 |
-| max_sub_compaction | | 4 |
-| sub_level_max_compaction_bytes | | 134217728 |
-| target_file_size_base | | 33554432 |
-| tombstone_reclaim_ratio | | 40 |
-
-## server
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| connection_pool_size | | 16 |
-| grpc_max_reset_stream | | 200 |
-| heap_profiling | Enable heap profile dump when memory usage is high. | |
-| heartbeat_interval_ms | The interval for periodic heartbeat from worker to the meta service. | 1000 |
-| metrics_level | Used for control the metrics level, similar to log level. | "Info" |
-| telemetry_enabled | | true |
-
-## storage
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| block_cache_capacity_mb | Capacity of sstable block cache. | |
-| cache_refill | | |
-| check_compaction_result | | false |
-| compact_iter_recreate_timeout_ms | | 600000 |
-| compactor_fast_max_compact_delete_ratio | | 40 |
-| compactor_fast_max_compact_task_size | | 2147483648 |
-| compactor_max_sst_key_count | | 2097152 |
-| compactor_max_sst_size | | 536870912 |
-| compactor_max_task_multiplier | Compactor calculates the maximum number of tasks that can be executed on the node based on worker_num and compactor_max_task_multiplier. max_pull_task_count = worker_num * compactor_max_task_multiplier | 2.5 |
-| compactor_memory_available_proportion | The percentage of memory available when compactor is deployed separately. non_reserved_memory_bytes = system_memory_available_bytes * compactor_memory_available_proportion | 0.8 |
-| compactor_memory_limit_mb | | |
-| data_file_cache | | |
-| disable_remote_compactor | | false |
-| enable_fast_compaction | | false |
-| high_priority_ratio_in_percent | | |
-| imm_merge_threshold | The threshold for the number of immutable memtables to merge to a new imm. | 0 |
-| max_concurrent_compaction_task_number | | 16 |
-| max_prefetch_block_number | max prefetch block number | 16 |
-| max_preload_io_retry_times | | 3 |
-| max_preload_wait_time_mill | | 0 |
-| max_sub_compaction | Max sub compaction task numbers | 4 |
-| max_version_pinning_duration_sec | | 10800 |
-| mem_table_spill_threshold | The spill threshold for mem table. | 4194304 |
-| meta_cache_capacity_mb | Capacity of sstable meta cache. | |
-| meta_file_cache | | |
-| min_sst_size_for_streaming_upload | Whether to enable streaming upload for sstable. | 33554432 |
-| object_store | | |
-| prefetch_buffer_capacity_mb | max memory usage for large query | |
-| share_buffer_compaction_worker_threads_number | Worker threads number of dedicated tokio runtime for share buffer compaction. 0 means use tokio's default value (number of CPU core). | 4 |
-| share_buffer_upload_concurrency | Number of tasks shared buffer can upload in parallel. | 8 |
-| share_buffers_sync_parallelism | parallelism while syncing share buffers into L0 SST. Should NOT be 0. | 1 |
-| shared_buffer_capacity_mb | Maximum shared buffer size, writes attempting to exceed the capacity will stall until there is enough space. | |
-| shared_buffer_flush_ratio | The shared buffer will start flushing data to object when the ratio of memory usage to the shared buffer capacity exceed such ratio. | 0.800000011920929 |
-| sstable_id_remote_fetch_number | Number of SST ids fetched from meta per RPC | 10 |
-| write_conflict_detection_enabled | Whether to enable write conflict detection | true |
-
-## streaming
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| actor_runtime_worker_threads_num | The thread number of the streaming actor runtime in the compute node. The default value is decided by `tokio`. | |
-| async_stack_trace | Enable async stack tracing through `await-tree` for risectl. | "ReleaseVerbose" |
-| in_flight_barrier_nums | The maximum number of barriers in-flight in the compute nodes. | 10000 |
-| unique_user_stream_errors | Max unique user stream errors per actor | 10 |
-
-## system
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| backup_storage_directory | Remote directory for storing snapshots. | |
-| backup_storage_url | Remote storage url for storing snapshots. | |
-| barrier_interval_ms | The interval of periodic barrier. | 1000 |
-| block_size_kb | Size of each block in bytes in SST. | 64 |
-| bloom_false_positive | False positive probability of bloom filter. | 0.001 |
-| checkpoint_frequency | There will be a checkpoint for every n barriers. | 1 |
-| data_directory | Remote directory for storing data and metadata objects. | |
-| enable_tracing | Whether to enable distributed tracing. | false |
-| max_concurrent_creating_streaming_jobs | Max number of concurrent creating streaming jobs. | 1 |
-| parallel_compact_size_mb | | 512 |
-| pause_on_next_bootstrap | Whether to pause all data sources on next bootstrap. | false |
-| sstable_size_mb | Target size of the Sstable. | 256 |
-| state_store | | |
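
A few notes on the mechanics, with illustrative sketches that are not part of the patches themselves.

The heart of the change is the batched control flow in `trigger_parallelism_control` (`src/meta/src/stream/scale.rs`): the streaming jobs are chunked into batches of `parallelism_control_batch_size` (0 means a single batch covering every job, which is what the simulation tests configure), each batch is asked for a resize plan, and only the first non-empty plan is applied per trigger. Returning `Ok(true)` keeps the ticker armed, so the remaining batches are handled on subsequent ticks and each barrier-blocking reschedule stays small. A minimal, self-contained sketch of that control flow; `generate_plan` and `apply` are hypothetical stand-ins for `generate_table_resize_plan` and `reschedule_actors`, and the types are simplified:

```rust
use std::collections::HashMap;

use itertools::Itertools;

type TableId = u32;
type Parallelism = usize;

// Hypothetical stand-in for `generate_table_resize_plan`: an empty map means
// every job in the batch already runs at its target parallelism. Here we
// pretend only even table ids need moving.
fn generate_plan(batch: &HashMap<TableId, Parallelism>) -> HashMap<TableId, Parallelism> {
    batch
        .iter()
        .filter(|(table_id, _)| **table_id % 2 == 0)
        .map(|(t, p)| (*t, *p))
        .collect()
}

// Hypothetical stand-in for `reschedule_actors`.
fn apply(plan: &HashMap<TableId, Parallelism>) {
    println!("rescheduling {} streaming jobs", plan.len());
}

/// Mirrors the new `MetaResult<bool>` contract: `true` means a batch was
/// rescheduled and another tick should follow; `false` means nothing to do.
fn trigger_parallelism_control(
    table_parallelisms: HashMap<TableId, Parallelism>,
    batch_size: usize,
) -> bool {
    // As in the patch, a batch size of 0 means "all jobs in a single batch".
    let batch_size = match batch_size {
        0 => table_parallelisms.len().max(1),
        n => n,
    };

    // Same chunking idiom the patch uses (`Itertools::chunks`).
    let batches: Vec<Vec<_>> = table_parallelisms
        .into_iter()
        .chunks(batch_size)
        .into_iter()
        .map(|chunk| chunk.collect_vec())
        .collect();

    for batch in batches {
        let parallelisms: HashMap<_, _> = batch.into_iter().collect();
        let plan = generate_plan(&parallelisms);
        if !plan.is_empty() {
            // Apply only the first non-empty plan per trigger; later batches
            // wait for the next tick.
            apply(&plan);
            return true;
        }
    }
    false
}

fn main() {
    let jobs: HashMap<TableId, Parallelism> = (1..=25).map(|id| (id, 4)).collect();
    let keep_going = trigger_parallelism_control(jobs, 10);
    println!("another tick needed: {keep_going}");
}
```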
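The startup behavior of the monitor in `run` also changed: instead of a plain `interval(10s)`, it now builds the ticker with `tokio::time::interval_at`, so the first check fires only after `parallelism_control_trigger_first_delay_sec`; `MissedTickBehavior::Skip` drops any backlog accumulated while a reschedule is in flight, and the explicit `tick().await` before the loop consumes the delayed first tick (the patch's `// waiting for first tick`). A runnable toy of the same pattern, with shortened, hypothetical durations standing in for the 30 s / 10 s defaults:

```rust
use std::time::Duration;

use tokio::time::{interval_at, Instant, MissedTickBehavior};

#[tokio::main]
async fn main() {
    // Hypothetical short durations standing in for the 30s/10s defaults.
    let first_delay = Duration::from_millis(300);
    let period = Duration::from_millis(100);

    // The first tick completes at `now + first_delay`, so the monitor stays
    // quiet through startup/recovery instead of rescheduling immediately.
    let mut ticker = interval_at(Instant::now() + first_delay, period);
    // If a scaling round outlasts the period, skip the missed ticks rather
    // than firing a burst to catch up.
    ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);

    // Consume the delayed first tick up front, as the patch does.
    ticker.tick().await;
    println!("first delay elapsed, entering control loop");

    for round in 0..3 {
        ticker.tick().await;
        println!("parallelism check #{round}");
    }
}
```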
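The notification arm is similarly narrowed: a `WorkerNodeActivated` event arms the next tick only if the worker is a streaming compute node and is either brand new or reports different parallel units. A condensed sketch of that decision; the types are simplified stand-ins for the pb `WorkerNode`/`WorkerType` (the real code matches on `worker.get_type()` and `property.is_streaming`, and compares full parallel-unit lists rather than a count):

```rust
use std::collections::HashMap;

// Simplified stand-ins for `risingwave_pb::common::{WorkerType, WorkerNode}`.
#[derive(Clone, Copy, PartialEq)]
enum WorkerType {
    ComputeNode,
    Frontend,
}

struct WorkerNode {
    id: u32,
    r#type: WorkerType,
    is_streaming: bool,    // `property.is_streaming` in the real proto
    parallel_units: usize, // the real code compares the full unit list
}

/// Returns whether this activation should arm the next scaling tick.
fn on_worker_activated(worker: &WorkerNode, cache: &mut HashMap<u32, usize>) -> bool {
    // Frontends, compactors and batch-only compute nodes never host
    // streaming actors, so their activation is ignored.
    if worker.r#type != WorkerType::ComputeNode || !worker.is_streaming {
        return false;
    }

    match cache.insert(worker.id, worker.parallel_units) {
        // Known worker coming back with a different parallelism.
        Some(prev) if prev != worker.parallel_units => true,
        // Brand-new worker joining the cluster.
        None => true,
        // Re-activation with unchanged parallelism: nothing to do.
        _ => false,
    }
}

fn main() {
    let mut cache = HashMap::new();
    let cn = WorkerNode { id: 1, r#type: WorkerType::ComputeNode, is_streaming: true, parallel_units: 4 };
    assert!(on_worker_activated(&cn, &mut cache));  // new worker => trigger
    assert!(!on_worker_activated(&cn, &mut cache)); // unchanged => no trigger
}
```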
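For operators, the three `[meta]` knobs introduced by this patch tune how aggressively adaptive scaling is applied. A hypothetical `risingwave.toml` fragment; the knob names are the real ones added above, while the values here are illustrative (shipped defaults noted in the comments):

```toml
[meta]
# Scale at most 4 streaming jobs per round (default 10; 0 means "all jobs in
# one batch", which is what the simulation tests configure).
parallelism_control_batch_size = 4
# Re-check parallelism every 10 seconds (default 10).
parallelism_control_trigger_period_sec = 10
# Stay quiet for the first 30 seconds after the monitor starts (default 30).
parallelism_control_trigger_first_delay_sec = 30
```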