From b251b4c5b2d2d90ab0f6fd1cafef0556191f0c46 Mon Sep 17 00:00:00 2001
From: Shanicky Chen
Date: Tue, 5 Mar 2024 18:25:36 +0800
Subject: [PATCH 1/2] feat: execute auto-scaling in batches (#15420)

---
 src/common/src/config.rs            |  24 ++++
 src/config/docs.md                  | 155 ++++++++++++++++++++++
 src/config/example.toml             |   3 +
 src/meta/node/src/lib.rs            |   7 ++
 src/meta/src/barrier/command.rs     |   2 +-
 src/meta/src/barrier/recovery.rs    |   4 +-
 src/meta/src/manager/env.rs         |   9 ++
 src/meta/src/stream/scale.rs        | 166 ++++++++++++++++------
 src/tests/simulation/src/cluster.rs |   3 +
 9 files changed, 312 insertions(+), 61 deletions(-)
 create mode 100644 src/config/docs.md

diff --git a/src/common/src/config.rs b/src/common/src/config.rs
index ea4354f9b23ad..2bae32def8d30 100644
--- a/src/common/src/config.rs
+++ b/src/common/src/config.rs
@@ -229,6 +229,18 @@ pub struct MetaConfig {
     #[serde(default)]
     pub disable_automatic_parallelism_control: bool,
 
+    /// The number of streaming jobs per scaling operation.
+    #[serde(default = "default::meta::parallelism_control_batch_size")]
+    pub parallelism_control_batch_size: usize,
+
+    /// The period of parallelism control trigger.
+    #[serde(default = "default::meta::parallelism_control_trigger_period_sec")]
+    pub parallelism_control_trigger_period_sec: u64,
+
+    /// The first delay of parallelism control.
+    #[serde(default = "default::meta::parallelism_control_trigger_first_delay_sec")]
+    pub parallelism_control_trigger_first_delay_sec: u64,
+
     #[serde(default = "default::meta::meta_leader_lease_secs")]
     pub meta_leader_lease_secs: u64,
 
@@ -1103,6 +1115,18 @@ pub mod default {
         pub fn event_log_channel_max_size() -> u32 {
             10
         }
+
+        pub fn parallelism_control_batch_size() -> usize {
+            10
+        }
+
+        pub fn parallelism_control_trigger_period_sec() -> u64 {
+            10
+        }
+
+        pub fn parallelism_control_trigger_first_delay_sec() -> u64 {
+            30
+        }
     }
 
     pub mod server {
diff --git a/src/config/docs.md b/src/config/docs.md
new file mode 100644
index 0000000000000..cf28f0ca9e2c6
--- /dev/null
+++ b/src/config/docs.md
@@ -0,0 +1,155 @@
+# RisingWave System Configurations
+
+This page is automatically generated by `./risedev generate-example-config`
+
+## batch
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| distributed_query_limit | | |
+| enable_barrier_read | | false |
+| frontend_compute_runtime_worker_threads | frontend compute runtime worker threads | 4 |
+| statement_timeout_in_sec | Timeout for a batch query in seconds. | 3600 |
+| worker_threads_num | The thread number of the batch task runtime in the compute node. The default value is decided by `tokio`. | |
+
+## meta
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| backend | | "Mem" |
+| collect_gc_watermark_spin_interval_sec | The spin interval when collecting global GC watermark in hummock. | 5 |
+| compaction_task_max_heartbeat_interval_secs | | 30 |
+| compaction_task_max_progress_interval_secs | | 600 |
+| cut_table_size_limit | | 1073741824 |
+| dangerous_max_idle_secs | After specified seconds of idle (no mview or flush), the process will be exited. It is mainly useful for playgrounds. | |
+| default_parallelism | The default global parallelism for all streaming jobs, if user doesn't specify the parallelism, this value will be used. `FULL` means use all available parallelism units, otherwise it's a number. | "Full" |
+| disable_automatic_parallelism_control | Whether to disable adaptive-scaling feature. | false |
+| disable_recovery | Whether to enable fail-on-recovery. Should only be used in e2e tests. | false |
+| do_not_config_object_storage_lifecycle | Whether config object storage bucket lifecycle to purge stale data. | false |
+| enable_committed_sst_sanity_check | Enable sanity check when SSTs are committed. | false |
+| enable_compaction_deterministic | Whether to enable deterministic compaction scheduling, which will disable all auto scheduling of compaction tasks. Should only be used in e2e tests. | false |
+| enable_hummock_data_archive | If enabled, SSTable object file and version delta will be retained. SSTable object file need to be deleted via full GC. version delta need to be manually deleted. | false |
+| event_log_channel_max_size | Keeps the latest N events per channel. | 10 |
+| event_log_enabled | | true |
+| full_gc_interval_sec | Interval of automatic hummock full GC. | 86400 |
+| hummock_version_checkpoint_interval_sec | Interval of hummock version checkpoint. | 30 |
+| hybird_partition_vnode_count | | 4 |
+| max_heartbeat_interval_secs | Maximum allowed heartbeat interval in seconds. | 300 |
+| meta_leader_lease_secs | | 30 |
+| min_delta_log_num_for_hummock_version_checkpoint | The minimum delta log number a new checkpoint should compact, otherwise the checkpoint attempt is rejected. | 10 |
+| min_sst_retention_time_sec | Objects within `min_sst_retention_time_sec` won't be deleted by hummock full GC, even they are dangling. | 86400 |
+| min_table_split_write_throughput | If the size of one table is smaller than `min_table_split_write_throughput`, we would not split it to an single group. | 4194304 |
+| move_table_size_limit | | 10737418240 |
+| node_num_monitor_interval_sec | | 10 |
+| parallelism_control_batch_size | The number of streaming jobs per scaling operation. | 10 |
+| parallelism_control_trigger_first_delay_sec | The first delay of parallelism control. | 30 |
+| parallelism_control_trigger_period_sec | The period of parallelism control trigger. | 10 |
+| partition_vnode_count | | 16 |
+| periodic_compaction_interval_sec | Schedule compaction for all compaction groups with this interval. | 60 |
+| periodic_space_reclaim_compaction_interval_sec | Schedule space_reclaim compaction for all compaction groups with this interval. | 3600 |
+| periodic_split_compact_group_interval_sec | | 10 |
+| periodic_tombstone_reclaim_compaction_interval_sec | | 600 |
+| periodic_ttl_reclaim_compaction_interval_sec | Schedule ttl_reclaim compaction for all compaction groups with this interval. | 1800 |
+| split_group_size_limit | | 68719476736 |
+| table_write_throughput_threshold | | 16777216 |
+| unrecognized | | |
+| vacuum_interval_sec | Interval of invoking a vacuum job, to remove stale metadata from meta store and objects from object store. | 30 |
+| vacuum_spin_interval_ms | The spin interval inside a vacuum job. It avoids the vacuum job monopolizing resources of meta node. | 10 |
+
+## meta.compaction_config
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| compaction_filter_mask | | 6 |
+| enable_emergency_picker | | true |
+| level0_max_compact_file_number | | 100 |
+| level0_overlapping_sub_level_compact_level_count | | 12 |
+| level0_stop_write_threshold_sub_level_number | | 300 |
+| level0_sub_level_compact_level_count | | 3 |
+| level0_tier_compact_file_number | | 12 |
+| max_bytes_for_level_base | | 536870912 |
+| max_bytes_for_level_multiplier | | 5 |
+| max_compaction_bytes | | 2147483648 |
+| max_space_reclaim_bytes | | 536870912 |
+| max_sub_compaction | | 4 |
+| sub_level_max_compaction_bytes | | 134217728 |
+| target_file_size_base | | 33554432 |
+| tombstone_reclaim_ratio | | 40 |
+
+## server
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| connection_pool_size | | 16 |
+| grpc_max_reset_stream | | 200 |
+| heap_profiling | Enable heap profile dump when memory usage is high. | |
+| heartbeat_interval_ms | The interval for periodic heartbeat from worker to the meta service. | 1000 |
+| metrics_level | Used for control the metrics level, similar to log level. | "Info" |
+| telemetry_enabled | | true |
+
+## storage
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| block_cache_capacity_mb | Capacity of sstable block cache. | |
+| cache_refill | | |
+| check_compaction_result | | false |
+| compact_iter_recreate_timeout_ms | | 600000 |
+| compactor_fast_max_compact_delete_ratio | | 40 |
+| compactor_fast_max_compact_task_size | | 2147483648 |
+| compactor_max_sst_key_count | | 2097152 |
+| compactor_max_sst_size | | 536870912 |
+| compactor_max_task_multiplier | Compactor calculates the maximum number of tasks that can be executed on the node based on worker_num and compactor_max_task_multiplier. max_pull_task_count = worker_num * compactor_max_task_multiplier | 2.5 |
+| compactor_memory_available_proportion | The percentage of memory available when compactor is deployed separately. non_reserved_memory_bytes = system_memory_available_bytes * compactor_memory_available_proportion | 0.8 |
+| compactor_memory_limit_mb | | |
+| data_file_cache | | |
+| disable_remote_compactor | | false |
+| enable_fast_compaction | | false |
+| high_priority_ratio_in_percent | | |
+| imm_merge_threshold | The threshold for the number of immutable memtables to merge to a new imm. | 0 |
+| max_concurrent_compaction_task_number | | 16 |
+| max_prefetch_block_number | max prefetch block number | 16 |
+| max_preload_io_retry_times | | 3 |
+| max_preload_wait_time_mill | | 0 |
+| max_sub_compaction | Max sub compaction task numbers | 4 |
+| max_version_pinning_duration_sec | | 10800 |
+| mem_table_spill_threshold | The spill threshold for mem table. | 4194304 |
+| meta_cache_capacity_mb | Capacity of sstable meta cache. | |
+| meta_file_cache | | |
+| min_sst_size_for_streaming_upload | Whether to enable streaming upload for sstable. | 33554432 |
+| object_store | | |
+| prefetch_buffer_capacity_mb | max memory usage for large query | |
+| share_buffer_compaction_worker_threads_number | Worker threads number of dedicated tokio runtime for share buffer compaction. 0 means use tokio's default value (number of CPU core). | 4 |
+| share_buffer_upload_concurrency | Number of tasks shared buffer can upload in parallel. | 8 |
+| share_buffers_sync_parallelism | parallelism while syncing share buffers into L0 SST. Should NOT be 0. | 1 |
+| shared_buffer_capacity_mb | Maximum shared buffer size, writes attempting to exceed the capacity will stall until there is enough space. | |
+| shared_buffer_flush_ratio | The shared buffer will start flushing data to object when the ratio of memory usage to the shared buffer capacity exceed such ratio. | 0.800000011920929 |
+| sstable_id_remote_fetch_number | Number of SST ids fetched from meta per RPC | 10 |
+| write_conflict_detection_enabled | Whether to enable write conflict detection | true |
+
+## streaming
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| actor_runtime_worker_threads_num | The thread number of the streaming actor runtime in the compute node. The default value is decided by `tokio`. | |
+| async_stack_trace | Enable async stack tracing through `await-tree` for risectl. | "ReleaseVerbose" |
+| in_flight_barrier_nums | The maximum number of barriers in-flight in the compute nodes. | 10000 |
+| unique_user_stream_errors | Max unique user stream errors per actor | 10 |
+
+## system
+
+| Config | Description | Default |
+|--------|-------------|---------|
+| backup_storage_directory | Remote directory for storing snapshots. | |
+| backup_storage_url | Remote storage url for storing snapshots. | |
+| barrier_interval_ms | The interval of periodic barrier. | 1000 |
+| block_size_kb | Size of each block in bytes in SST. | 64 |
+| bloom_false_positive | False positive probability of bloom filter. | 0.001 |
+| checkpoint_frequency | There will be a checkpoint for every n barriers. | 1 |
+| data_directory | Remote directory for storing data and metadata objects. | |
+| enable_tracing | Whether to enable distributed tracing. | false |
+| max_concurrent_creating_streaming_jobs | Max number of concurrent creating streaming jobs. | 1 |
+| parallel_compact_size_mb | | 512 |
+| pause_on_next_bootstrap | Whether to pause all data sources on next bootstrap. | false |
+| sstable_size_mb | Target size of the Sstable. | 256 |
+| state_store | | |
diff --git a/src/config/example.toml b/src/config/example.toml
index 6a6314d7832d2..954a9c0f12e38 100644
--- a/src/config/example.toml
+++ b/src/config/example.toml
@@ -26,6 +26,9 @@ min_delta_log_num_for_hummock_version_checkpoint = 10
 max_heartbeat_interval_secs = 300
 disable_recovery = false
 disable_automatic_parallelism_control = false
+parallelism_control_batch_size = 10
+parallelism_control_trigger_period_sec = 10
+parallelism_control_trigger_first_delay_sec = 30
 meta_leader_lease_secs = 30
 default_parallelism = "Full"
 enable_compaction_deterministic = false
diff --git a/src/meta/node/src/lib.rs b/src/meta/node/src/lib.rs
index 867e2409c178f..b7f290ecff469 100644
--- a/src/meta/node/src/lib.rs
+++ b/src/meta/node/src/lib.rs
@@ -281,6 +281,13 @@ pub fn start(opts: MetaNodeOpts) -> Pin<Box<dyn Future<Output = ()> + Send>> {
                 disable_automatic_parallelism_control: config
                     .meta
                     .disable_automatic_parallelism_control,
+                parallelism_control_batch_size: config.meta.parallelism_control_batch_size,
+                parallelism_control_trigger_period_sec: config
+                    .meta
+                    .parallelism_control_trigger_period_sec,
+                parallelism_control_trigger_first_delay_sec: config
+                    .meta
+                    .parallelism_control_trigger_first_delay_sec,
                 in_flight_barrier_nums,
                 max_idle_ms,
                 compaction_deterministic_test: config.meta.enable_compaction_deterministic,
diff --git a/src/meta/src/barrier/command.rs b/src/meta/src/barrier/command.rs
index 6b1b73d6ca697..86cc3fa078784 100644
--- a/src/meta/src/barrier/command.rs
+++ b/src/meta/src/barrier/command.rs
@@ -615,7 +615,7 @@ impl CommandContext {
                     actor_splits,
                     actor_new_dispatchers,
                 });
-                tracing::debug!("update mutation: {mutation:#?}");
+                tracing::debug!("update mutation: {mutation:?}");
                 Some(mutation)
             }
         };
diff --git a/src/meta/src/barrier/recovery.rs b/src/meta/src/barrier/recovery.rs
index 1208d2c49b58d..824cc8b0c090e 100644
--- a/src/meta/src/barrier/recovery.rs
+++ b/src/meta/src/barrier/recovery.rs
@@ -695,7 +695,7 @@ impl GlobalBarrierManagerContext {
             return Err(e);
         }
 
-        debug!("scaling-in actors succeed.");
+        debug!("scaling actors succeed.");
         Ok(())
     }
 
@@ -833,7 +833,7 @@ impl GlobalBarrierManagerContext {
             return Err(e);
         }
 
-        debug!("scaling-in actors succeed.");
+        debug!("scaling actors succeed.");
         Ok(())
     }
 
diff --git a/src/meta/src/manager/env.rs b/src/meta/src/manager/env.rs
index f10138b56e81e..ea26fea83ebcc 100644
--- a/src/meta/src/manager/env.rs
+++ b/src/meta/src/manager/env.rs
@@ -93,6 +93,12 @@ pub struct MetaOpts {
     pub enable_recovery: bool,
     /// Whether to disable the auto-scaling feature.
     pub disable_automatic_parallelism_control: bool,
+    /// The number of streaming jobs per scaling operation.
+    pub parallelism_control_batch_size: usize,
+    /// The period of parallelism control trigger.
+    pub parallelism_control_trigger_period_sec: u64,
+    /// The first delay of parallelism control.
+    pub parallelism_control_trigger_first_delay_sec: u64,
     /// The maximum number of barriers in-flight in the compute nodes.
     pub in_flight_barrier_nums: usize,
     /// After specified seconds of idle (no mview or flush), the process will be exited.
@@ -221,6 +227,9 @@ impl MetaOpts {
         Self {
             enable_recovery,
             disable_automatic_parallelism_control: false,
+            parallelism_control_batch_size: 1,
+            parallelism_control_trigger_period_sec: 10,
+            parallelism_control_trigger_first_delay_sec: 30,
             in_flight_barrier_nums: 40,
             max_idle_ms: 0,
             compaction_deterministic_test: false,
diff --git a/src/meta/src/stream/scale.rs b/src/meta/src/stream/scale.rs
index 0e571a0afebf7..3b65c73d059cc 100644
--- a/src/meta/src/stream/scale.rs
+++ b/src/meta/src/stream/scale.rs
@@ -31,7 +31,9 @@ use risingwave_common::catalog::TableId;
 use risingwave_common::hash::{ActorMapping, ParallelUnitId, VirtualNode};
 use risingwave_common::util::iter_util::ZipEqDebug;
 use risingwave_meta_model_v2::StreamingParallelism;
-use risingwave_pb::common::{ActorInfo, Buffer, ParallelUnit, ParallelUnitMapping, WorkerNode};
+use risingwave_pb::common::{
+    ActorInfo, Buffer, ParallelUnit, ParallelUnitMapping, WorkerNode, WorkerType,
+};
 use risingwave_pb::meta::get_reschedule_plan_request::{Policy, StableResizePolicy};
 use risingwave_pb::meta::subscribe_response::{Info, Operation};
 use risingwave_pb::meta::table_fragments::actor_status::ActorState;
@@ -48,7 +50,7 @@ use thiserror_ext::AsReport;
 use tokio::sync::oneshot::Receiver;
 use tokio::sync::{oneshot, RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::task::JoinHandle;
-use tokio::time::MissedTickBehavior;
+use tokio::time::{Instant, MissedTickBehavior};
 
 use crate::barrier::{Command, Reschedule, StreamRpcManager};
 use crate::manager::{IdCategory, LocalNotification, MetaSrvEnv, MetadataManager, WorkerId};
@@ -2676,12 +2678,14 @@ impl GlobalStreamManager {
         Ok(())
     }
 
-    async fn trigger_parallelism_control(&self) -> MetaResult<()> {
+    async fn trigger_parallelism_control(&self) -> MetaResult<bool> {
+        tracing::info!("trigger parallelism control");
+
         let _reschedule_job_lock = self.reschedule_lock_write_guard().await;
 
-        match &self.metadata_manager {
+        let (schedulable_worker_ids, table_parallelisms) = match &self.metadata_manager {
             MetadataManager::V1(mgr) => {
-                let table_parallelisms = {
+                let table_parallelisms: HashMap<TableId, TableParallelism> = {
                     let guard = mgr.fragment_manager.get_fragment_read_guard().await;
 
                     guard
@@ -2697,7 +2701,7 @@ impl GlobalStreamManager {
                     .list_active_streaming_compute_nodes()
                     .await;
 
-                let schedulable_worker_ids = workers
+                let schedulable_worker_ids: BTreeSet<_> = workers
                     .iter()
                     .filter(|worker| {
                         !worker
@@ -2709,26 +2713,7 @@ impl GlobalStreamManager {
                     .map(|worker| worker.id)
                     .collect();
 
-                let reschedules = self
-                    .scale_controller
-                    .generate_table_resize_plan(TableResizePolicy {
-                        worker_ids: schedulable_worker_ids,
-                        table_parallelisms,
-                    })
-                    .await?;
-
-                if reschedules.is_empty() {
-                    return Ok(());
-                }
-
-                self.reschedule_actors(
-                    reschedules,
-                    RescheduleOptions {
-                        resolve_no_shuffle_upstream: true,
-                    },
-                    None,
-                )
-                .await?;
+                (schedulable_worker_ids, table_parallelisms)
             }
             MetadataManager::V2(mgr) => {
                 let table_parallelisms: HashMap<_, _> = {
@@ -2768,33 +2753,90 @@ impl GlobalStreamManager {
                    .map(|worker| worker.id)
                    .collect();
 
-                let reschedules = self
-                    .scale_controller
-                    .generate_table_resize_plan(TableResizePolicy {
-                        worker_ids: schedulable_worker_ids,
-                        table_parallelisms: table_parallelisms.clone(),
-                    })
-                    .await?;
+                (schedulable_worker_ids, table_parallelisms)
             }
         };
 
-        if reschedules.is_empty() {
-            return Ok(());
-        }
+        if table_parallelisms.is_empty() {
+            tracing::info!("no streaming jobs for scaling, maybe an empty cluster");
+            return Ok(false);
+        }
 
-        self.reschedule_actors(
-            reschedules,
-            RescheduleOptions {
-                resolve_no_shuffle_upstream: true,
-            },
-            None,
-        )
+        let batch_size = match self.env.opts.parallelism_control_batch_size {
+            0 => table_parallelisms.len(),
+            n => n,
+        };
+
+        tracing::info!(
+            "total {} streaming jobs, batch size {}, schedulable worker ids: {:?}",
+            table_parallelisms.len(),
+            batch_size,
+            schedulable_worker_ids
+        );
+
+        let batches: Vec<_> = table_parallelisms
+            .into_iter()
+            .chunks(batch_size)
+            .into_iter()
+            .map(|chunk| chunk.collect_vec())
+            .collect();
+
+        let mut reschedules = None;
+
+        for batch in batches {
+            let parallelisms: HashMap<_, _> = batch.into_iter().collect();
+
+            let plan = self
+                .scale_controller
+                .generate_table_resize_plan(TableResizePolicy {
+                    worker_ids: schedulable_worker_ids.clone(),
+                    table_parallelisms: parallelisms.clone(),
+                })
                .await?;
+
+            if !plan.is_empty() {
+                tracing::info!(
+                    "reschedule plan generated for streaming jobs {:?}",
+                    parallelisms
+                );
+                reschedules = Some(plan);
+                break;
             }
         }
 
-        Ok(())
+        let Some(reschedules) = reschedules else {
+            tracing::info!("no reschedule plan generated");
+            return Ok(false);
+        };
+
+        self.reschedule_actors(
+            reschedules,
+            RescheduleOptions {
+                resolve_no_shuffle_upstream: false,
+            },
+            None,
+        )
+        .await?;
+
+        Ok(true)
     }
 
     async fn run(&self, mut shutdown_rx: Receiver<()>) {
+        tracing::info!("starting automatic parallelism control monitor");
+
+        let check_period =
+            Duration::from_secs(self.env.opts.parallelism_control_trigger_period_sec);
+
+        let mut ticker = tokio::time::interval_at(
+            Instant::now()
+                + Duration::from_secs(self.env.opts.parallelism_control_trigger_first_delay_sec),
+            check_period,
+        );
+        ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
+
+        // waiting for first tick
+        ticker.tick().await;
+
         let (local_notification_tx, mut local_notification_rx) =
             tokio::sync::mpsc::unbounded_channel();
 
@@ -2803,11 +2845,6 @@ impl GlobalStreamManager {
             .insert_local_sender(local_notification_tx)
             .await;
 
-        let check_period = Duration::from_secs(10);
-        let mut ticker = tokio::time::interval(check_period);
-        ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
-        ticker.reset();
-
         let worker_nodes = self
             .metadata_manager
             .list_active_streaming_compute_nodes()
@@ -2819,7 +2856,7 @@ impl GlobalStreamManager {
             .map(|worker| (worker.id, worker))
             .collect();
 
-        let mut changed = true;
+        let mut should_trigger = false;
 
         loop {
            tokio::select! {
@@ -2830,18 +2867,18 @@ impl GlobalStreamManager {
                     break;
                 }
 
-                _ = ticker.tick(), if changed => {
+                _ = ticker.tick(), if should_trigger => {
                     let include_workers = worker_cache.keys().copied().collect_vec();
 
                     if include_workers.is_empty() {
                         tracing::debug!("no available worker nodes");
-                        changed = false;
+                        should_trigger = false;
                         continue;
                     }
 
                     match self.trigger_parallelism_control().await {
-                        Ok(_) => {
-                            changed = false;
+                        Ok(cont) => {
+                            should_trigger = cont;
                         }
                         Err(e) => {
                             tracing::warn!(error = %e.as_report(), "Failed to trigger scale out, waiting for next tick to retry after {}s", ticker.period().as_secs());
@@ -2855,13 +2892,26 @@ impl GlobalStreamManager {
 
                     match notification {
                         LocalNotification::WorkerNodeActivated(worker) => {
+                            match (worker.get_type(), worker.property.as_ref()) {
+                                (Ok(WorkerType::ComputeNode), Some(prop)) if prop.is_streaming => {
+                                    tracing::info!("worker {} activated notification received", worker.id);
+                                }
+                                _ => continue
+                            }
+
                             let prev_worker = worker_cache.insert(worker.id, worker.clone());
 
-                            if let Some(prev_worker) = prev_worker && prev_worker.parallel_units != worker.parallel_units {
-                                tracing::info!(worker = worker.id, "worker parallelism changed");
+                            match prev_worker {
+                                Some(prev_worker) if prev_worker.parallel_units != worker.parallel_units => {
+                                    tracing::info!(worker = worker.id, "worker parallelism changed");
+                                    should_trigger = true;
+                                }
+                                None => {
+                                    tracing::info!(worker = worker.id, "new worker joined");
+                                    should_trigger = true;
+                                }
+                                _ => {}
                             }
-
-                            changed = true;
                         }
 
                         // Since our logic for handling passive scale-in is within the barrier manager,
diff --git a/src/tests/simulation/src/cluster.rs b/src/tests/simulation/src/cluster.rs
index b4178e01f8786..f3d2f0559e998 100644
--- a/src/tests/simulation/src/cluster.rs
+++ b/src/tests/simulation/src/cluster.rs
@@ -189,6 +189,9 @@ impl Configuration {
             r#"[meta]
 max_heartbeat_interval_secs = {max_heartbeat_interval_secs}
 disable_automatic_parallelism_control = {disable_automatic_parallelism_control}
+parallelism_control_trigger_first_delay_sec = 0
+parallelism_control_batch_size = 0
+parallelism_control_trigger_period_sec = 10
 [system]
 barrier_interval_ms = 250

From df5b5e9b1efb645d77cb5cdd9719b4e5438c7891 Mon Sep 17 00:00:00 2001
From: Shanicky Chen
Date: Thu, 7 Mar 2024 13:18:25 +0800
Subject: [PATCH 2/2] remove doc

Signed-off-by: Shanicky Chen
---
 src/config/docs.md | 155 ---------------------------------------------
 1 file changed, 155 deletions(-)
 delete mode 100644 src/config/docs.md

diff --git a/src/config/docs.md b/src/config/docs.md
deleted file mode 100644
index cf28f0ca9e2c6..0000000000000
--- a/src/config/docs.md
+++ /dev/null
@@ -1,155 +0,0 @@
-# RisingWave System Configurations
-
-This page is automatically generated by `./risedev generate-example-config`
-
-## batch
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| distributed_query_limit | | |
-| enable_barrier_read | | false |
-| frontend_compute_runtime_worker_threads | frontend compute runtime worker threads | 4 |
-| statement_timeout_in_sec | Timeout for a batch query in seconds. | 3600 |
-| worker_threads_num | The thread number of the batch task runtime in the compute node. The default value is decided by `tokio`. | |
-
-## meta
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| backend | | "Mem" |
-| collect_gc_watermark_spin_interval_sec | The spin interval when collecting global GC watermark in hummock. | 5 |
-| compaction_task_max_heartbeat_interval_secs | | 30 |
-| compaction_task_max_progress_interval_secs | | 600 |
-| cut_table_size_limit | | 1073741824 |
-| dangerous_max_idle_secs | After specified seconds of idle (no mview or flush), the process will be exited. It is mainly useful for playgrounds. | |
-| default_parallelism | The default global parallelism for all streaming jobs, if user doesn't specify the parallelism, this value will be used. `FULL` means use all available parallelism units, otherwise it's a number. | "Full" |
-| disable_automatic_parallelism_control | Whether to disable adaptive-scaling feature. | false |
-| disable_recovery | Whether to enable fail-on-recovery. Should only be used in e2e tests. | false |
-| do_not_config_object_storage_lifecycle | Whether config object storage bucket lifecycle to purge stale data. | false |
-| enable_committed_sst_sanity_check | Enable sanity check when SSTs are committed. | false |
-| enable_compaction_deterministic | Whether to enable deterministic compaction scheduling, which will disable all auto scheduling of compaction tasks. Should only be used in e2e tests. | false |
-| enable_hummock_data_archive | If enabled, SSTable object file and version delta will be retained. SSTable object file need to be deleted via full GC. version delta need to be manually deleted. | false |
-| event_log_channel_max_size | Keeps the latest N events per channel. | 10 |
-| event_log_enabled | | true |
-| full_gc_interval_sec | Interval of automatic hummock full GC. | 86400 |
-| hummock_version_checkpoint_interval_sec | Interval of hummock version checkpoint. | 30 |
-| hybird_partition_vnode_count | | 4 |
-| max_heartbeat_interval_secs | Maximum allowed heartbeat interval in seconds. | 300 |
-| meta_leader_lease_secs | | 30 |
-| min_delta_log_num_for_hummock_version_checkpoint | The minimum delta log number a new checkpoint should compact, otherwise the checkpoint attempt is rejected. | 10 |
-| min_sst_retention_time_sec | Objects within `min_sst_retention_time_sec` won't be deleted by hummock full GC, even they are dangling. | 86400 |
-| min_table_split_write_throughput | If the size of one table is smaller than `min_table_split_write_throughput`, we would not split it to an single group. | 4194304 |
-| move_table_size_limit | | 10737418240 |
-| node_num_monitor_interval_sec | | 10 |
-| parallelism_control_batch_size | The number of streaming jobs per scaling operation. | 10 |
-| parallelism_control_trigger_first_delay_sec | The first delay of parallelism control. | 30 |
-| parallelism_control_trigger_period_sec | The period of parallelism control trigger. | 10 |
-| partition_vnode_count | | 16 |
-| periodic_compaction_interval_sec | Schedule compaction for all compaction groups with this interval. | 60 |
-| periodic_space_reclaim_compaction_interval_sec | Schedule space_reclaim compaction for all compaction groups with this interval. | 3600 |
-| periodic_split_compact_group_interval_sec | | 10 |
-| periodic_tombstone_reclaim_compaction_interval_sec | | 600 |
-| periodic_ttl_reclaim_compaction_interval_sec | Schedule ttl_reclaim compaction for all compaction groups with this interval. | 1800 |
-| split_group_size_limit | | 68719476736 |
-| table_write_throughput_threshold | | 16777216 |
-| unrecognized | | |
-| vacuum_interval_sec | Interval of invoking a vacuum job, to remove stale metadata from meta store and objects from object store. | 30 |
-| vacuum_spin_interval_ms | The spin interval inside a vacuum job. It avoids the vacuum job monopolizing resources of meta node. | 10 |
-
-## meta.compaction_config
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| compaction_filter_mask | | 6 |
-| enable_emergency_picker | | true |
-| level0_max_compact_file_number | | 100 |
-| level0_overlapping_sub_level_compact_level_count | | 12 |
-| level0_stop_write_threshold_sub_level_number | | 300 |
-| level0_sub_level_compact_level_count | | 3 |
-| level0_tier_compact_file_number | | 12 |
-| max_bytes_for_level_base | | 536870912 |
-| max_bytes_for_level_multiplier | | 5 |
-| max_compaction_bytes | | 2147483648 |
-| max_space_reclaim_bytes | | 536870912 |
-| max_sub_compaction | | 4 |
-| sub_level_max_compaction_bytes | | 134217728 |
-| target_file_size_base | | 33554432 |
-| tombstone_reclaim_ratio | | 40 |
-
-## server
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| connection_pool_size | | 16 |
-| grpc_max_reset_stream | | 200 |
-| heap_profiling | Enable heap profile dump when memory usage is high. | |
-| heartbeat_interval_ms | The interval for periodic heartbeat from worker to the meta service. | 1000 |
-| metrics_level | Used for control the metrics level, similar to log level. | "Info" |
-| telemetry_enabled | | true |
-
-## storage
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| block_cache_capacity_mb | Capacity of sstable block cache. | |
-| cache_refill | | |
-| check_compaction_result | | false |
-| compact_iter_recreate_timeout_ms | | 600000 |
-| compactor_fast_max_compact_delete_ratio | | 40 |
-| compactor_fast_max_compact_task_size | | 2147483648 |
-| compactor_max_sst_key_count | | 2097152 |
-| compactor_max_sst_size | | 536870912 |
-| compactor_max_task_multiplier | Compactor calculates the maximum number of tasks that can be executed on the node based on worker_num and compactor_max_task_multiplier. max_pull_task_count = worker_num * compactor_max_task_multiplier | 2.5 |
-| compactor_memory_available_proportion | The percentage of memory available when compactor is deployed separately. non_reserved_memory_bytes = system_memory_available_bytes * compactor_memory_available_proportion | 0.8 |
-| compactor_memory_limit_mb | | |
-| data_file_cache | | |
-| disable_remote_compactor | | false |
-| enable_fast_compaction | | false |
-| high_priority_ratio_in_percent | | |
-| imm_merge_threshold | The threshold for the number of immutable memtables to merge to a new imm. | 0 |
-| max_concurrent_compaction_task_number | | 16 |
-| max_prefetch_block_number | max prefetch block number | 16 |
-| max_preload_io_retry_times | | 3 |
-| max_preload_wait_time_mill | | 0 |
-| max_sub_compaction | Max sub compaction task numbers | 4 |
-| max_version_pinning_duration_sec | | 10800 |
-| mem_table_spill_threshold | The spill threshold for mem table. | 4194304 |
-| meta_cache_capacity_mb | Capacity of sstable meta cache. | |
-| meta_file_cache | | |
-| min_sst_size_for_streaming_upload | Whether to enable streaming upload for sstable. | 33554432 |
-| object_store | | |
-| prefetch_buffer_capacity_mb | max memory usage for large query | |
-| share_buffer_compaction_worker_threads_number | Worker threads number of dedicated tokio runtime for share buffer compaction. 0 means use tokio's default value (number of CPU core). | 4 |
-| share_buffer_upload_concurrency | Number of tasks shared buffer can upload in parallel. | 8 |
-| share_buffers_sync_parallelism | parallelism while syncing share buffers into L0 SST. Should NOT be 0. | 1 |
-| shared_buffer_capacity_mb | Maximum shared buffer size, writes attempting to exceed the capacity will stall until there is enough space. | |
-| shared_buffer_flush_ratio | The shared buffer will start flushing data to object when the ratio of memory usage to the shared buffer capacity exceed such ratio. | 0.800000011920929 |
-| sstable_id_remote_fetch_number | Number of SST ids fetched from meta per RPC | 10 |
-| write_conflict_detection_enabled | Whether to enable write conflict detection | true |
-
-## streaming
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| actor_runtime_worker_threads_num | The thread number of the streaming actor runtime in the compute node. The default value is decided by `tokio`. | |
-| async_stack_trace | Enable async stack tracing through `await-tree` for risectl. | "ReleaseVerbose" |
-| in_flight_barrier_nums | The maximum number of barriers in-flight in the compute nodes. | 10000 |
-| unique_user_stream_errors | Max unique user stream errors per actor | 10 |
-
-## system
-
-| Config | Description | Default |
-|--------|-------------|---------|
-| backup_storage_directory | Remote directory for storing snapshots. | |
-| backup_storage_url | Remote storage url for storing snapshots. | |
-| barrier_interval_ms | The interval of periodic barrier. | 1000 |
-| block_size_kb | Size of each block in bytes in SST. | 64 |
-| bloom_false_positive | False positive probability of bloom filter. | 0.001 |
-| checkpoint_frequency | There will be a checkpoint for every n barriers. | 1 |
-| data_directory | Remote directory for storing data and metadata objects. | |
-| enable_tracing | Whether to enable distributed tracing. | false |
-| max_concurrent_creating_streaming_jobs | Max number of concurrent creating streaming jobs. | 1 |
-| parallel_compact_size_mb | | 512 |
-| pause_on_next_bootstrap | Whether to pause all data sources on next bootstrap. | false |
-| sstable_size_mb | Target size of the Sstable. | 256 |
-| state_store | | |
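
A few notes on the mechanics, with illustrative sketches that are not part of the patches themselves.

The heart of the change is the batched control flow in `trigger_parallelism_control` (`src/meta/src/stream/scale.rs`): the streaming jobs are chunked into batches of `parallelism_control_batch_size` (0 means a single batch covering every job, which is what the simulation tests configure), each batch is asked for a resize plan, and only the first non-empty plan is applied per trigger. Returning `Ok(true)` keeps the ticker armed, so the remaining batches are handled on subsequent ticks and each barrier-blocking reschedule stays small. A minimal, self-contained sketch of that control flow; `generate_plan` and `apply` are hypothetical stand-ins for `generate_table_resize_plan` and `reschedule_actors`, and the types are simplified:

```rust
use std::collections::HashMap;

use itertools::Itertools;

type TableId = u32;
type Parallelism = usize;

// Hypothetical stand-in for `generate_table_resize_plan`: an empty map means
// every job in the batch already runs at its target parallelism. Here we
// pretend only even table ids need moving.
fn generate_plan(batch: &HashMap<TableId, Parallelism>) -> HashMap<TableId, Parallelism> {
    batch
        .iter()
        .filter(|(table_id, _)| **table_id % 2 == 0)
        .map(|(t, p)| (*t, *p))
        .collect()
}

// Hypothetical stand-in for `reschedule_actors`.
fn apply(plan: &HashMap<TableId, Parallelism>) {
    println!("rescheduling {} streaming jobs", plan.len());
}

/// Mirrors the new `MetaResult<bool>` contract: `true` means a batch was
/// rescheduled and another tick should follow; `false` means nothing to do.
fn trigger_parallelism_control(
    table_parallelisms: HashMap<TableId, Parallelism>,
    batch_size: usize,
) -> bool {
    // As in the patch, a batch size of 0 means "all jobs in a single batch".
    let batch_size = match batch_size {
        0 => table_parallelisms.len().max(1),
        n => n,
    };

    // Same chunking idiom the patch uses (`Itertools::chunks`).
    let batches: Vec<Vec<_>> = table_parallelisms
        .into_iter()
        .chunks(batch_size)
        .into_iter()
        .map(|chunk| chunk.collect_vec())
        .collect();

    for batch in batches {
        let parallelisms: HashMap<_, _> = batch.into_iter().collect();
        let plan = generate_plan(&parallelisms);
        if !plan.is_empty() {
            // Apply only the first non-empty plan per trigger; later batches
            // wait for the next tick.
            apply(&plan);
            return true;
        }
    }
    false
}

fn main() {
    let jobs: HashMap<TableId, Parallelism> = (1..=25).map(|id| (id, 4)).collect();
    let keep_going = trigger_parallelism_control(jobs, 10);
    println!("another tick needed: {keep_going}");
}
```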
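The startup behavior of the monitor in `run` also changed: instead of a plain `interval(10s)`, it now builds the ticker with `tokio::time::interval_at`, so the first check fires only after `parallelism_control_trigger_first_delay_sec`; `MissedTickBehavior::Skip` drops any backlog accumulated while a reschedule is in flight, and the explicit `tick().await` before the loop consumes the delayed first tick (the patch's `// waiting for first tick`). A runnable toy of the same pattern, with shortened, hypothetical durations standing in for the 30 s / 10 s defaults:

```rust
use std::time::Duration;

use tokio::time::{interval_at, Instant, MissedTickBehavior};

#[tokio::main]
async fn main() {
    // Hypothetical short durations standing in for the 30s/10s defaults.
    let first_delay = Duration::from_millis(300);
    let period = Duration::from_millis(100);

    // The first tick completes at `now + first_delay`, so the monitor stays
    // quiet through startup/recovery instead of rescheduling immediately.
    let mut ticker = interval_at(Instant::now() + first_delay, period);
    // If a scaling round outlasts the period, skip the missed ticks rather
    // than firing a burst to catch up.
    ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);

    // Consume the delayed first tick up front, as the patch does.
    ticker.tick().await;
    println!("first delay elapsed, entering control loop");

    for round in 0..3 {
        ticker.tick().await;
        println!("parallelism check #{round}");
    }
}
```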
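The notification arm is similarly narrowed: a `WorkerNodeActivated` event arms the next tick only if the worker is a streaming compute node and is either brand new or reports different parallel units. A condensed sketch of that decision; the types are simplified stand-ins for the pb `WorkerNode`/`WorkerType` (the real code matches on `worker.get_type()` and `property.is_streaming`, and compares full parallel-unit lists rather than a count):

```rust
use std::collections::HashMap;

// Simplified stand-ins for `risingwave_pb::common::{WorkerType, WorkerNode}`.
#[derive(Clone, Copy, PartialEq)]
enum WorkerType {
    ComputeNode,
    Frontend,
}

struct WorkerNode {
    id: u32,
    r#type: WorkerType,
    is_streaming: bool,    // `property.is_streaming` in the real proto
    parallel_units: usize, // the real code compares the full unit list
}

/// Returns whether this activation should arm the next scaling tick.
fn on_worker_activated(worker: &WorkerNode, cache: &mut HashMap<u32, usize>) -> bool {
    // Frontends, compactors and batch-only compute nodes never host
    // streaming actors, so their activation is ignored.
    if worker.r#type != WorkerType::ComputeNode || !worker.is_streaming {
        return false;
    }

    match cache.insert(worker.id, worker.parallel_units) {
        // Known worker coming back with a different parallelism.
        Some(prev) if prev != worker.parallel_units => true,
        // Brand-new worker joining the cluster.
        None => true,
        // Re-activation with unchanged parallelism: nothing to do.
        _ => false,
    }
}

fn main() {
    let mut cache = HashMap::new();
    let cn = WorkerNode { id: 1, r#type: WorkerType::ComputeNode, is_streaming: true, parallel_units: 4 };
    assert!(on_worker_activated(&cn, &mut cache));  // new worker => trigger
    assert!(!on_worker_activated(&cn, &mut cache)); // unchanged => no trigger
}
```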
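For operators, the three `[meta]` knobs introduced by this patch tune how aggressively adaptive scaling is applied. A hypothetical `risingwave.toml` fragment; the knob names are the real ones added above, while the values here are illustrative (shipped defaults noted in the comments):

```toml
[meta]
# Scale at most 4 streaming jobs per round (default 10; 0 means "all jobs in
# one batch", which is what the simulation tests configure).
parallelism_control_batch_size = 4
# Re-check parallelism every 10 seconds (default 10).
parallelism_control_trigger_period_sec = 10
# Stay quiet for the first 30 seconds after the monitor starts (default 30).
parallelism_control_trigger_first_delay_sec = 30
```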