From 9c25008fe39a1e4d0aed907b31feaa861f2918a3 Mon Sep 17 00:00:00 2001 From: WenyXu Date: Sat, 2 Dec 2023 03:35:04 +0000 Subject: [PATCH 1/6] chore: add logs and metrics --- src/datanode/src/alive_keeper.rs | 6 ++++- src/datanode/src/heartbeat.rs | 2 +- .../src/handler/region_lease_handler.rs | 13 +++++++++++ src/meta-srv/src/metrics.rs | 10 +++++++++ src/meta-srv/src/region/lease_keeper.rs | 22 +++++++++++++++++-- 5 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/datanode/src/alive_keeper.rs b/src/datanode/src/alive_keeper.rs index d8b0b7c50d67..e8bb653d6968 100644 --- a/src/datanode/src/alive_keeper.rs +++ b/src/datanode/src/alive_keeper.rs @@ -129,8 +129,12 @@ impl RegionAliveKeeper { let (role, region_id) = (region.role().into(), RegionId::from(region.region_id)); if let Some(handle) = self.find_handle(region_id).await { handle.reset_deadline(role, deadline).await; + } else { + warn!( + "Trying to renew the lease for region {region_id}, the keeper handler is not found!" + ); + // Else the region alive keeper might be triggered by lagging messages, we can safely ignore it. } - // Else the region alive keeper might be triggered by lagging messages, we can safely ignore it. } } diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index 4e2242288c05..1764662930a1 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -72,9 +72,9 @@ impl HeartbeatTask { opts.heartbeat.interval.as_millis() as u64, )); let resp_handler_executor = Arc::new(HandlerGroupExecutor::new(vec![ + region_alive_keeper.clone(), Arc::new(ParseMailboxMessageHandler), Arc::new(RegionHeartbeatResponseHandler::new(region_server.clone())), - region_alive_keeper.clone(), ])); Ok(Self { diff --git a/src/meta-srv/src/handler/region_lease_handler.rs b/src/meta-srv/src/handler/region_lease_handler.rs index 67fcba4519e4..8e72ad098dda 100644 --- a/src/meta-srv/src/handler/region_lease_handler.rs +++ b/src/meta-srv/src/handler/region_lease_handler.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use api::v1::meta::{HeartbeatRequest, RegionLease, Role}; use async_trait::async_trait; use common_meta::key::TableMetadataManagerRef; +use common_telemetry::info; use store_api::region_engine::{GrantedRegion, RegionRole}; use store_api::storage::RegionId; @@ -123,6 +124,12 @@ impl HeartbeatHandler for RegionLeaseHandler { &leaders, RegionRole::Leader, ); + if !closable.is_empty() { + info!( + "Granting region lease, found closable leader regions: {:?} on datanode {}", + closable, datanode_id + ); + } inactive_regions.extend(closable); let followers = followers.into_iter().flatten().collect::>(); @@ -144,6 +151,12 @@ impl HeartbeatHandler for RegionLeaseHandler { &followers, RegionRole::Follower, ); + if !closable.is_empty() { + info!( + "Granting region lease, found closable follower regions {:?} on datanode {}", + closable, datanode_id + ); + } inactive_regions.extend(closable); acc.inactive_region_ids = inactive_regions; diff --git a/src/meta-srv/src/metrics.rs b/src/meta-srv/src/metrics.rs index 8c73c9a7c171..3bd50722d7ca 100644 --- a/src/meta-srv/src/metrics.rs +++ b/src/meta-srv/src/metrics.rs @@ -34,4 +34,14 @@ lazy_static! { pub static ref METRIC_META_LEADER_CACHED_KV_LOAD: HistogramVec = register_histogram_vec!("meta_leader_cache_kv_load", "meta load cache", &["prefix"]) .unwrap(); + pub static ref METRIC_META_LOAD_FOLLOWER_METADATA: Histogram = register_histogram!( + "meta_load_follower_metadata", + "meta load follower regions metadata elapsed" + ) + .unwrap(); + pub static ref METRIC_META_LOAD_LEADER_METADATA: Histogram = register_histogram!( + "meta_load_leader_metadata", + "meta load leader regions metadata elapsed" + ) + .unwrap(); } diff --git a/src/meta-srv/src/region/lease_keeper.rs b/src/meta-srv/src/region/lease_keeper.rs index f9471f7e07e1..06bc4cf382d2 100644 --- a/src/meta-srv/src/region/lease_keeper.rs +++ b/src/meta-srv/src/region/lease_keeper.rs @@ -21,11 +21,13 @@ use std::sync::{Arc, RwLock}; use common_meta::key::table_route::TableRouteValue; use common_meta::key::TableMetadataManagerRef; use common_meta::DatanodeId; +use common_telemetry::warn; use snafu::ResultExt; use store_api::storage::{RegionId, TableId}; use self::mito::find_staled_leader_regions; use crate::error::{self, Result}; +use crate::metrics; use crate::region::lease_keeper::utils::find_staled_follower_regions; pub type RegionLeaseKeeperRef = Arc; @@ -89,7 +91,11 @@ impl RegionLeaseKeeper { ) -> Result<(HashSet, HashSet)> { let tables = self.collect_tables(datanode_regions); let table_ids = tables.keys().copied().collect::>(); - let metadata_subset = self.collect_tables_metadata(&table_ids).await?; + + let metadata_subset = { + let _timer = metrics::METRIC_META_LOAD_LEADER_METADATA.start_timer(); + self.collect_tables_metadata(&table_ids).await? + }; let mut closable_set = HashSet::new(); let mut downgradable_set = HashSet::new(); @@ -104,6 +110,10 @@ impl RegionLeaseKeeper { downgradable_set.extend(downgradable); closable_set.extend(closable); } else { + warn!( + "The table {} metadata is not found, appends closable leader regions: {:?}", + table_id, regions + ); // If table metadata is not found. closable_set.extend(regions); } @@ -128,7 +138,11 @@ impl RegionLeaseKeeper { ) -> Result<(HashSet, HashSet)> { let tables = self.collect_tables(datanode_regions); let table_ids = tables.keys().copied().collect::>(); - let metadata_subset = self.collect_tables_metadata(&table_ids).await?; + + let metadata_subset = { + let _timer = metrics::METRIC_META_LOAD_FOLLOWER_METADATA.start_timer(); + self.collect_tables_metadata(&table_ids).await? + }; let mut upgradable_set = HashSet::new(); let mut closable_set = HashSet::new(); @@ -143,6 +157,10 @@ impl RegionLeaseKeeper { upgradable_set.extend(upgradable); closable_set.extend(closable); } else { + warn!( + "The table {} metadata is not found, appends closable followers regions: {:?}", + table_id, regions + ); // If table metadata is not found. closable_set.extend(regions); } From bb5aa6d45cc8c2db8c449078517bc6ad49f86a7a Mon Sep 17 00:00:00 2001 From: WenyXu Date: Sat, 2 Dec 2023 08:27:24 +0000 Subject: [PATCH 2/6] feat: add the timer to track heartbeat intervel --- src/datanode/src/heartbeat.rs | 13 +++++++++++++ src/datanode/src/metrics.rs | 6 ++++++ 2 files changed, 19 insertions(+) diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index 1764662930a1..03c61891b5bf 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -37,6 +37,7 @@ use crate::alive_keeper::RegionAliveKeeper; use crate::config::DatanodeOptions; use crate::error::{self, MetaClientInitSnafu, Result}; use crate::event_listener::RegionServerEventReceiver; +use crate::metrics; use crate::region_server::RegionServer; pub(crate) mod handler; @@ -102,7 +103,13 @@ impl HeartbeatTask { ) -> Result { let client_id = meta_client.id(); + let datanode = format!("datanode-{}", client_id.1); let (tx, mut rx) = meta_client.heartbeat().await.context(MetaClientInitSnafu)?; + + let mut _last_received_lease = metrics::LAST_RECEIVED_HEARTBEAT_ELAPSED + .with_label_values(&[&datanode]) + .start_timer(); + let _handle = common_runtime::spawn_bg(async move { while let Some(res) = match rx.message().await { Ok(m) => m, @@ -114,6 +121,12 @@ impl HeartbeatTask { if let Some(msg) = res.mailbox_message.as_ref() { info!("Received mailbox message: {msg:?}, meta_client id: {client_id:?}"); } + if res.region_lease.as_ref().is_some() { + // Resets the timer. + _last_received_lease = metrics::LAST_RECEIVED_HEARTBEAT_ELAPSED + .with_label_values(&[&datanode]) + .start_timer(); + } let ctx = HeartbeatResponseHandlerContext::new(mailbox.clone(), res); if let Err(e) = Self::handle_response(ctx, handler_executor.clone()).await { error!(e; "Error while handling heartbeat response"); diff --git a/src/datanode/src/metrics.rs b/src/datanode/src/metrics.rs index f4a2621ecc5d..fdcb7ff61de6 100644 --- a/src/datanode/src/metrics.rs +++ b/src/datanode/src/metrics.rs @@ -26,4 +26,10 @@ lazy_static! { &[REGION_REQUEST_TYPE] ) .unwrap(); + pub static ref LAST_RECEIVED_HEARTBEAT_ELAPSED: HistogramVec = register_histogram_vec!( + "last_received_heartbeat_lease_elapsed", + "last received heartbeat lease elapsed", + &["datanode"] + ) + .unwrap(); } From be9f33840356d626fe964a0ced4fc020054ea7af Mon Sep 17 00:00:00 2001 From: WenyXu Date: Sat, 2 Dec 2023 08:50:56 +0000 Subject: [PATCH 3/6] feat: add the gauge to track region leases --- src/datanode/src/heartbeat.rs | 18 +++++++++++++++++- src/datanode/src/metrics.rs | 14 +++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index 03c61891b5bf..e764144e8c32 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -121,11 +121,27 @@ impl HeartbeatTask { if let Some(msg) = res.mailbox_message.as_ref() { info!("Received mailbox message: {msg:?}, meta_client id: {client_id:?}"); } - if res.region_lease.as_ref().is_some() { + if let Some(lease) = res.region_lease.as_ref() { // Resets the timer. _last_received_lease = metrics::LAST_RECEIVED_HEARTBEAT_ELAPSED .with_label_values(&[&datanode]) .start_timer(); + + let mut leader_region_lease_count = 0; + let mut follower_region_lease_count = 0; + for lease in &lease.regions { + match lease.role() { + RegionRole::Leader => leader_region_lease_count += 1, + RegionRole::Follower => follower_region_lease_count += 1, + } + } + + metrics::HEARTBEAT_REGION_LEASES + .with_label_values(&[&datanode, "leader"]) + .set(leader_region_lease_count); + metrics::HEARTBEAT_REGION_LEASES + .with_label_values(&[&datanode, "follower"]) + .set(follower_region_lease_count); } let ctx = HeartbeatResponseHandlerContext::new(mailbox.clone(), res); if let Err(e) = Self::handle_response(ctx, handler_executor.clone()).await { diff --git a/src/datanode/src/metrics.rs b/src/datanode/src/metrics.rs index fdcb7ff61de6..db3921e9f45d 100644 --- a/src/datanode/src/metrics.rs +++ b/src/datanode/src/metrics.rs @@ -18,6 +18,10 @@ use prometheus::*; /// Region request type label. pub const REGION_REQUEST_TYPE: &str = "datanode_region_request_type"; +pub const DATANODE_ID: &str = "datanode"; + +pub const REGION_ROLE: &str = "region_role"; + lazy_static! { /// The elapsed time of handling a request in the region_server. pub static ref HANDLE_REGION_REQUEST_ELAPSED: HistogramVec = register_histogram_vec!( @@ -26,10 +30,18 @@ lazy_static! { &[REGION_REQUEST_TYPE] ) .unwrap(); + /// The elapsed time since the last received heartbeat. pub static ref LAST_RECEIVED_HEARTBEAT_ELAPSED: HistogramVec = register_histogram_vec!( "last_received_heartbeat_lease_elapsed", "last received heartbeat lease elapsed", - &["datanode"] + &[DATANODE_ID] + ) + .unwrap(); + /// The received region leases via heartbeat. + pub static ref HEARTBEAT_REGION_LEASES: IntGaugeVec = register_int_gauge_vec!( + "heartbeat_region_leases", + "received region leases via heartbeat", + &[DATANODE_ID, REGION_ROLE] ) .unwrap(); } From edb9ac737c15a20dbaddaa09bd0f45add45c8cfb Mon Sep 17 00:00:00 2001 From: WenyXu Date: Sat, 2 Dec 2023 10:52:49 +0000 Subject: [PATCH 4/6] refactor: use gauge instead of the timer --- src/datanode/src/heartbeat.rs | 16 ++++++---------- src/datanode/src/metrics.rs | 7 ++----- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index e764144e8c32..3d9cb4251f33 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -102,13 +102,9 @@ impl HeartbeatTask { quit_signal: Arc, ) -> Result { let client_id = meta_client.id(); - - let datanode = format!("datanode-{}", client_id.1); let (tx, mut rx) = meta_client.heartbeat().await.context(MetaClientInitSnafu)?; - let mut _last_received_lease = metrics::LAST_RECEIVED_HEARTBEAT_ELAPSED - .with_label_values(&[&datanode]) - .start_timer(); + let mut _last_received_lease = Instant::now(); let _handle = common_runtime::spawn_bg(async move { while let Some(res) = match rx.message().await { @@ -122,10 +118,10 @@ impl HeartbeatTask { info!("Received mailbox message: {msg:?}, meta_client id: {client_id:?}"); } if let Some(lease) = res.region_lease.as_ref() { + metrics::LAST_RECEIVED_HEARTBEAT_ELAPSED + .set(_last_received_lease.elapsed().as_millis() as i64); // Resets the timer. - _last_received_lease = metrics::LAST_RECEIVED_HEARTBEAT_ELAPSED - .with_label_values(&[&datanode]) - .start_timer(); + _last_received_lease = Instant::now(); let mut leader_region_lease_count = 0; let mut follower_region_lease_count = 0; @@ -137,10 +133,10 @@ impl HeartbeatTask { } metrics::HEARTBEAT_REGION_LEASES - .with_label_values(&[&datanode, "leader"]) + .with_label_values(&["leader"]) .set(leader_region_lease_count); metrics::HEARTBEAT_REGION_LEASES - .with_label_values(&[&datanode, "follower"]) + .with_label_values(&["follower"]) .set(follower_region_lease_count); } let ctx = HeartbeatResponseHandlerContext::new(mailbox.clone(), res); diff --git a/src/datanode/src/metrics.rs b/src/datanode/src/metrics.rs index db3921e9f45d..54e2619d7855 100644 --- a/src/datanode/src/metrics.rs +++ b/src/datanode/src/metrics.rs @@ -18,8 +18,6 @@ use prometheus::*; /// Region request type label. pub const REGION_REQUEST_TYPE: &str = "datanode_region_request_type"; -pub const DATANODE_ID: &str = "datanode"; - pub const REGION_ROLE: &str = "region_role"; lazy_static! { @@ -31,17 +29,16 @@ lazy_static! { ) .unwrap(); /// The elapsed time since the last received heartbeat. - pub static ref LAST_RECEIVED_HEARTBEAT_ELAPSED: HistogramVec = register_histogram_vec!( + pub static ref LAST_RECEIVED_HEARTBEAT_ELAPSED: IntGauge = register_int_gauge!( "last_received_heartbeat_lease_elapsed", "last received heartbeat lease elapsed", - &[DATANODE_ID] ) .unwrap(); /// The received region leases via heartbeat. pub static ref HEARTBEAT_REGION_LEASES: IntGaugeVec = register_int_gauge_vec!( "heartbeat_region_leases", "received region leases via heartbeat", - &[DATANODE_ID, REGION_ROLE] + &[REGION_ROLE] ) .unwrap(); } From 5fcd0503cfe178291dda63926d8e332369954a79 Mon Sep 17 00:00:00 2001 From: WenyXu Date: Sat, 2 Dec 2023 13:19:01 +0000 Subject: [PATCH 5/6] chore: apply suggestions from CR --- src/datanode/src/heartbeat.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index 3d9cb4251f33..5a4502beb5ef 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -104,7 +104,7 @@ impl HeartbeatTask { let client_id = meta_client.id(); let (tx, mut rx) = meta_client.heartbeat().await.context(MetaClientInitSnafu)?; - let mut _last_received_lease = Instant::now(); + let mut last_received_lease = Instant::now(); let _handle = common_runtime::spawn_bg(async move { while let Some(res) = match rx.message().await { @@ -119,9 +119,9 @@ impl HeartbeatTask { } if let Some(lease) = res.region_lease.as_ref() { metrics::LAST_RECEIVED_HEARTBEAT_ELAPSED - .set(_last_received_lease.elapsed().as_millis() as i64); + .set(last_received_lease.elapsed().as_millis() as i64); // Resets the timer. - _last_received_lease = Instant::now(); + last_received_lease = Instant::now(); let mut leader_region_lease_count = 0; let mut follower_region_lease_count = 0; From 4981860743fc7f16ad57406210dc54ee95313032 Mon Sep 17 00:00:00 2001 From: WenyXu Date: Sun, 3 Dec 2023 14:06:47 +0000 Subject: [PATCH 6/6] feat: add hit rate and etcd txn metrics --- src/common/meta/src/kv_backend/etcd.rs | 6 ++++++ src/meta-srv/src/metrics.rs | 5 +++++ src/meta-srv/src/service/store/cached_kv.rs | 4 ++++ 3 files changed, 15 insertions(+) diff --git a/src/common/meta/src/kv_backend/etcd.rs b/src/common/meta/src/kv_backend/etcd.rs index 5becb4b332b1..806b90150b8e 100644 --- a/src/common/meta/src/kv_backend/etcd.rs +++ b/src/common/meta/src/kv_backend/etcd.rs @@ -68,6 +68,9 @@ impl EtcdStore { async fn do_multi_txn(&self, txn_ops: Vec) -> Result> { if txn_ops.len() < MAX_TXN_SIZE { // fast path + let _timer = METRIC_META_TXN_REQUEST + .with_label_values(&["etcd", "txn"]) + .start_timer(); let txn = Txn::new().and_then(txn_ops); let txn_res = self .client @@ -81,6 +84,9 @@ impl EtcdStore { let txns = txn_ops .chunks(MAX_TXN_SIZE) .map(|part| async move { + let _timer = METRIC_META_TXN_REQUEST + .with_label_values(&["etcd", "txn"]) + .start_timer(); let txn = Txn::new().and_then(part); self.client.kv_client().txn(txn).await }) diff --git a/src/meta-srv/src/metrics.rs b/src/meta-srv/src/metrics.rs index 3bd50722d7ca..493c2bcc7462 100644 --- a/src/meta-srv/src/metrics.rs +++ b/src/meta-srv/src/metrics.rs @@ -44,4 +44,9 @@ lazy_static! { "meta load leader regions metadata elapsed" ) .unwrap(); + pub static ref METRIC_META_KV_CACHE_BATCH_GET_HIT_RATE: Gauge = register_gauge!( + "meta_kv_cache_batch_get_hit_rate", + "meta kv cache batch get hit rate" + ) + .unwrap(); } diff --git a/src/meta-srv/src/service/store/cached_kv.rs b/src/meta-srv/src/service/store/cached_kv.rs index 450a967df112..a1d251971c29 100644 --- a/src/meta-srv/src/service/store/cached_kv.rs +++ b/src/meta-srv/src/service/store/cached_kv.rs @@ -260,6 +260,10 @@ impl KvBackend for LeaderCachedKvBackend { .iter() .map(|kv| kv.key.clone()) .collect::>(); + + let hit_rate = hit_keys.len() as f64 / req.keys.len() as f64; + metrics::METRIC_META_KV_CACHE_BATCH_GET_HIT_RATE.set(hit_rate); + let missed_keys = req .keys .iter()