From 69ea900a976ecf2d9f206e9d90163c4f88a43e88 Mon Sep 17 00:00:00 2001 From: Eric Fu Date: Mon, 11 Sep 2023 11:54:31 +0800 Subject: [PATCH] feat: auto heap dump by default if `MALLOC_CONF=prof:true` (#12186) --- Cargo.lock | 6 ++-- src/batch/Cargo.toml | 2 +- src/cmd/Cargo.toml | 6 +++- src/cmd_all/Cargo.toml | 6 +++- src/common/src/config.rs | 25 ++++++------- src/compute/Cargo.toml | 2 +- .../src/memory_management/memory_manager.rs | 2 +- src/compute/src/memory_management/mod.rs | 15 ++++---- src/compute/src/memory_management/policy.rs | 35 +++++++++++++++---- src/config/example.toml | 1 + src/tests/simulation/Cargo.toml | 4 ++- 11 files changed, 69 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 279c4c1c2d563..d90809e747c76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8979,7 +8979,7 @@ dependencies = [ [[package]] name = "tikv-jemalloc-ctl" version = "0.5.4" -source = "git+https://github.com/yuhao-su/jemallocator.git?rev=a0911601bb7bb263ca55c7ea161ef308fdc623f8#a0911601bb7bb263ca55c7ea161ef308fdc623f8" +source = "git+https://github.com/risingwavelabs/jemallocator.git?rev=b7f9f3#b7f9f34664dcfea190e64bef64587e23f9f2710c" dependencies = [ "libc", "paste", @@ -8989,7 +8989,7 @@ dependencies = [ [[package]] name = "tikv-jemalloc-sys" version = "0.5.4+5.3.0-patched" -source = "git+https://github.com/yuhao-su/jemallocator.git?rev=a0911601bb7bb263ca55c7ea161ef308fdc623f8#a0911601bb7bb263ca55c7ea161ef308fdc623f8" +source = "git+https://github.com/risingwavelabs/jemallocator.git?rev=b7f9f3#b7f9f34664dcfea190e64bef64587e23f9f2710c" dependencies = [ "cc", "libc", @@ -8998,7 +8998,7 @@ dependencies = [ [[package]] name = "tikv-jemallocator" version = "0.5.4" -source = "git+https://github.com/yuhao-su/jemallocator.git?rev=a0911601bb7bb263ca55c7ea161ef308fdc623f8#a0911601bb7bb263ca55c7ea161ef308fdc623f8" +source = "git+https://github.com/risingwavelabs/jemallocator.git?rev=b7f9f3#b7f9f34664dcfea190e64bef64587e23f9f2710c" dependencies = [ "libc", "tikv-jemalloc-sys", diff --git a/src/batch/Cargo.toml b/src/batch/Cargo.toml index f3c5032d5619a..5f90151a400ab 100644 --- a/src/batch/Cargo.toml +++ b/src/batch/Cargo.toml @@ -67,7 +67,7 @@ rand = "0.8" tempfile = "3" [target.'cfg(unix)'.dev-dependencies] -tikv-jemallocator = { git = "https://github.com/yuhao-su/jemallocator.git", rev = "a0911601bb7bb263ca55c7ea161ef308fdc623f8" } +tikv-jemallocator = { git = "https://github.com/risingwavelabs/jemallocator.git", rev = "b7f9f3" } [[bench]] name = "filter" diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index 45b715ebcb206..46a33654d3545 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -46,7 +46,11 @@ workspace-hack = { path = "../workspace-hack" } task_stats_alloc = { path = "../utils/task_stats_alloc" } [target.'cfg(unix)'.dependencies] -tikv-jemallocator = { git = "https://github.com/yuhao-su/jemallocator.git", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"], rev = "a0911601bb7bb263ca55c7ea161ef308fdc623f8" } +tikv-jemallocator = { git = "https://github.com/risingwavelabs/jemallocator.git", features = [ + "profiling", + "stats", + "unprefixed_malloc_on_supported_platforms", +], rev = "b7f9f3" } [[bin]] name = "frontend" diff --git a/src/cmd_all/Cargo.toml b/src/cmd_all/Cargo.toml index 3c3b207637b10..b6907cdebaeff 100644 --- a/src/cmd_all/Cargo.toml +++ b/src/cmd_all/Cargo.toml @@ -59,7 +59,11 @@ vergen = { version = "8", default-features = false, features = ["build", "git", task_stats_alloc = { path = "../utils/task_stats_alloc" } [target.'cfg(unix)'.dependencies] -tikv-jemallocator = { git = "https://github.com/yuhao-su/jemallocator.git", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"], rev = "a0911601bb7bb263ca55c7ea161ef308fdc623f8" } +tikv-jemallocator = { git = "https://github.com/risingwavelabs/jemallocator.git", features = [ + "profiling", + "stats", + "unprefixed_malloc_on_supported_platforms", +], rev = "b7f9f3" } [[bin]] name = "risingwave" diff --git a/src/common/src/config.rs b/src/common/src/config.rs index 6381fc7ac4342..8c32a1a0b80c8 100644 --- a/src/common/src/config.rs +++ b/src/common/src/config.rs @@ -370,7 +370,7 @@ pub struct ServerConfig { pub unrecognized: Unrecognized, /// Enable heap profile dump when memory usage is high. - #[serde(default = "default::server::auto_dump_heap_profile")] + #[serde(default)] pub auto_dump_heap_profile: AutoDumpHeapProfileConfig, } @@ -658,18 +658,19 @@ impl AsyncStackTraceOption { #[derive(Clone, Debug, Serialize, Deserialize, DefaultFromSerde)] pub struct AutoDumpHeapProfileConfig { + /// Enable to auto dump heap profile when memory usage is high + #[serde(default = "default::auto_dump_heap_profile::enabled")] + pub enabled: bool, + + /// The directory to dump heap profile. If empty, the prefix in `MALLOC_CONF` will be used #[serde(default = "default::auto_dump_heap_profile::dir")] pub dir: String, + + /// The proportion (number between 0 and 1) of memory usage to trigger heap profile dump #[serde(default = "default::auto_dump_heap_profile::threshold")] pub threshold: f32, } -impl AutoDumpHeapProfileConfig { - pub fn enabled(&self) -> bool { - !self.dir.is_empty() - } -} - serde_with::with_prefix!(streaming_prefix "stream_"); serde_with::with_prefix!(batch_prefix "batch_"); @@ -907,7 +908,7 @@ pub mod default { } pub mod server { - use crate::config::{AutoDumpHeapProfileConfig, MetricLevel}; + use crate::config::MetricLevel; pub fn heartbeat_interval_ms() -> u32 { 1000 @@ -924,10 +925,6 @@ pub mod default { pub fn telemetry_enabled() -> bool { true } - - pub fn auto_dump_heap_profile() -> AutoDumpHeapProfileConfig { - Default::default() - } } pub mod storage { @@ -1129,6 +1126,10 @@ pub mod default { } pub mod auto_dump_heap_profile { + pub fn enabled() -> bool { + true + } + pub fn dir() -> String { "".to_string() } diff --git a/src/compute/Cargo.toml b/src/compute/Cargo.toml index 8276f93db8ae7..7774a2d917731 100644 --- a/src/compute/Cargo.toml +++ b/src/compute/Cargo.toml @@ -55,7 +55,7 @@ tower = { version = "0.4", features = ["util", "load-shed"] } tracing = "0.1" [target.'cfg(target_os = "linux")'.dependencies] -tikv-jemalloc-ctl = { git = "https://github.com/yuhao-su/jemallocator.git", rev = "a0911601bb7bb263ca55c7ea161ef308fdc623f8" } +tikv-jemalloc-ctl = { git = "https://github.com/risingwavelabs/jemallocator.git", rev = "b7f9f3" } [target.'cfg(not(madsim))'.dependencies] workspace-hack = { path = "../workspace-hack" } diff --git a/src/compute/src/memory_management/memory_manager.rs b/src/compute/src/memory_management/memory_manager.rs index 1c43cdb29fdab..9481ef0ebef71 100644 --- a/src/compute/src/memory_management/memory_manager.rs +++ b/src/compute/src/memory_management/memory_manager.rs @@ -54,7 +54,7 @@ impl GlobalMemoryManager { .unwrap(); tracing::info!("memory control policy: {:?}", &memory_control_policy); - if auto_dump_heap_profile_config.enabled() { + if auto_dump_heap_profile_config.enabled { fs::create_dir_all(&auto_dump_heap_profile_config.dir).unwrap(); } Arc::new(Self { diff --git a/src/compute/src/memory_management/mod.rs b/src/compute/src/memory_management/mod.rs index 23f5303eabc72..f9553f860ae41 100644 --- a/src/compute/src/memory_management/mod.rs +++ b/src/compute/src/memory_management/mod.rs @@ -74,15 +74,8 @@ pub fn build_memory_control_policy( total_memory_bytes: usize, auto_dump_heap_profile_config: AutoDumpHeapProfileConfig, ) -> Result { - use risingwave_common::bail; - use tikv_jemalloc_ctl::opt; - use self::policy::JemallocMemoryControl; - if !opt::prof::read().unwrap() && auto_dump_heap_profile_config.enabled() { - bail!("Auto heap profile dump should not be enabled with Jemalloc profile disable"); - } - Ok(Box::new(JemallocMemoryControl::new( total_memory_bytes, auto_dump_heap_profile_config, @@ -122,6 +115,14 @@ impl MemoryControl for DummyPolicy { /// overhead, network buffer, etc. based on `SYSTEM_RESERVED_MEMORY_PROPORTION`. The reserve memory /// size must be larger than `MIN_SYSTEM_RESERVED_MEMORY_MB` pub fn reserve_memory_bytes(total_memory_bytes: usize) -> (usize, usize) { + if total_memory_bytes < MIN_COMPUTE_MEMORY_MB << 20 { + panic!( + "The total memory size ({}) is too small. It must be at least {} MB.", + convert(total_memory_bytes as _), + MIN_COMPUTE_MEMORY_MB + ); + } + let reserved = std::cmp::max( (total_memory_bytes as f64 * SYSTEM_RESERVED_MEMORY_PROPORTION).ceil() as usize, MIN_SYSTEM_RESERVED_MEMORY_MB << 20, diff --git a/src/compute/src/memory_management/policy.rs b/src/compute/src/memory_management/policy.rs index 085d7cfcf98a5..5c3602a46fe68 100644 --- a/src/compute/src/memory_management/policy.rs +++ b/src/compute/src/memory_management/policy.rs @@ -22,7 +22,9 @@ use risingwave_batch::task::BatchManager; use risingwave_common::config::AutoDumpHeapProfileConfig; use risingwave_common::util::epoch::Epoch; use risingwave_stream::task::LocalStreamManager; -use tikv_jemalloc_ctl::{epoch as jemalloc_epoch, prof as jemalloc_prof, stats as jemalloc_stats}; +use tikv_jemalloc_ctl::{ + epoch as jemalloc_epoch, opt as jemalloc_opt, prof as jemalloc_prof, stats as jemalloc_stats, +}; use super::{MemoryControl, MemoryControlStats}; @@ -100,22 +102,39 @@ impl JemallocMemoryControl { } fn dump_heap_prof(&self, cur_used_memory_bytes: usize, prev_used_memory_bytes: usize) { - if !self.auto_dump_heap_profile_config.enabled() { + if !self.auto_dump_heap_profile_config.enabled { return; } + if cur_used_memory_bytes > self.threshold_auto_dump_heap_profile && prev_used_memory_bytes <= self.threshold_auto_dump_heap_profile { + let opt_prof = jemalloc_opt::prof::read().unwrap(); + if !opt_prof { + tracing::info!("Cannot dump heap profile because Jemalloc prof is not enabled"); + return; + } + let time_prefix = chrono::Local::now().format("%Y-%m-%d-%H-%M-%S").to_string(); let file_name = format!( "{}.exceed-threshold-aggressive-heap-prof.compute.dump.{}\0", time_prefix, self.dump_seq, ); - let file_path = Path::new(&self.auto_dump_heap_profile_config.dir) - .join(Path::new(&file_name)) - .to_str() - .unwrap() - .to_string(); + + let file_path = if !self.auto_dump_heap_profile_config.dir.is_empty() { + Path::new(&self.auto_dump_heap_profile_config.dir) + .join(Path::new(&file_name)) + .to_str() + .unwrap() + .to_string() + } else { + let prof_prefix_mib = jemalloc_prof::prefix::mib().unwrap(); + let prof_prefix = prof_prefix_mib.read().unwrap(); + let mut file_path = prof_prefix.to_string_lossy().to_string(); + file_path.push_str(&file_name); + file_path + }; + let file_path_str = Box::leak(file_path.into_boxed_str()); let file_path_bytes = unsafe { file_path_str.as_bytes_mut() }; let file_path_ptr = file_path_bytes.as_mut_ptr(); @@ -124,6 +143,8 @@ impl JemallocMemoryControl { .write(CStr::from_bytes_with_nul(file_path_bytes).unwrap()) { tracing::warn!("Auto Jemalloc dump heap file failed! {:?}", e); + } else { + tracing::info!("Successfully dumped heap profile to {}", file_name); } let _ = unsafe { Box::from_raw(file_path_ptr) }; } diff --git a/src/config/example.toml b/src/config/example.toml index 0e6d50e8f3f1e..b088843c66c65 100644 --- a/src/config/example.toml +++ b/src/config/example.toml @@ -7,6 +7,7 @@ metrics_level = "Info" telemetry_enabled = true [server.auto_dump_heap_profile] +enabled = true dir = "" threshold = 0.8999999761581421 diff --git a/src/tests/simulation/Cargo.toml b/src/tests/simulation/Cargo.toml index e4f0d5c99bc82..bb6f841907f57 100644 --- a/src/tests/simulation/Cargo.toml +++ b/src/tests/simulation/Cargo.toml @@ -46,7 +46,9 @@ serde_derive = "1.0.183" serde_json = "1.0.105" sqllogictest = "0.15.2" tempfile = "3" -tikv-jemallocator = { git = "https://github.com/yuhao-su/jemallocator.git", features = ["profiling"], rev = "a0911601bb7bb263ca55c7ea161ef308fdc623f8" } +tikv-jemallocator = { git = "https://github.com/risingwavelabs/jemallocator.git", features = [ + "profiling", +], rev = "b7f9f3" } tokio = { version = "0.2.23", package = "madsim-tokio" } tokio-postgres = "0.7" tracing = "0.1"