Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(metrics): add metrics for streaming actor lru cache eviction #15858

Merged
merged 8 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ arrow-schema-deltalake = { package = "arrow-schema", version = "48.0.1" }
deltalake = { git = "https://github.com/risingwavelabs/delta-rs", rev = "5c2dccd4640490202ffe98adbd13b09cef8e007b", features = [
"s3-no-concurrent-write",
] }
lru = { git = "https://github.com/risingwavelabs/lru-rs.git", rev = "95f347b" }
lru = { git = "https://github.com/risingwavelabs/lru-rs.git", rev = "2682b85" }
parquet = "50"
thiserror-ext = "0.0.11"
tikv-jemalloc-ctl = { git = "https://github.com/risingwavelabs/jemallocator.git", rev = "64a2d9" }
Expand Down
2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-user-dashboard.json

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions grafana/risingwave-dev-dashboard.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -1067,6 +1067,20 @@ def section_streaming_actors(outer_panels):
),
],
),
panels.timeseries_bytes(
"Executor Evicted Cache Memory",
"The operator-level evicted memory statistics collected by each LRU cache",
[
panels.target(
f"sum({metric('stream_memory_evicted')}) by (table_id, epoch)",
"table {{table_id}} epoch: {{epoch}}",
),
panels.target_hidden(
f"{metric('stream_memory_evicted')}",
"table {{table_id}} actor {{actor_id}} epoch: {{epoch}}",
),
],
),
panels.timeseries_bytes(
"Executor Cache Memory Usage of Materialized Views",
"Memory usage aggregated by materialized views",
Expand Down
2 changes: 1 addition & 1 deletion grafana/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion grafana/risingwave-user-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/common/estimate_size/src/collections/lru.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ impl<K: Hash + Eq + EstimateSize, V: EstimateSize, S: BuildHasher, A: Clone + Al

/// Evict epochs lower than the watermark
pub fn evict_by_epoch(&mut self, epoch: u64) {
while let Some((key, value)) = self.inner.pop_lru_by_epoch(epoch) {
while let Some((key, value, _)) = self.inner.pop_lru_by_epoch(epoch) {
self.kv_heap_size.sub(&key, &value);
}
}
Expand Down
36 changes: 32 additions & 4 deletions src/stream/src/cache/managed_lru.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

use lru::{DefaultHasher, LruCache};
use risingwave_common::metrics::LabelGuardedIntGauge;
use risingwave_common::metrics::{LabelGuardedIntGauge, LabelGuardedIntGaugeVec};
use risingwave_common::util::epoch::Epoch;
use risingwave_common_estimate_size::EstimateSize;

Expand All @@ -40,10 +40,11 @@ pub struct ManagedLruCache<K, V, S = DefaultHasher, A: Clone + Allocator = Globa
kv_heap_size: usize,
/// The metrics of memory usage
memory_usage_metrics: LabelGuardedIntGauge<3>,
/// The metrics of evicted memory by epoch.
memory_evicted_metrics: LabelGuardedIntGaugeVec<3>,
// The metrics of evicted watermark time
lru_evicted_watermark_time_ms: LabelGuardedIntGauge<3>,
// Metrics info
#[expect(dead_code)]
metrics_info: MetricsInfo,
/// The size reported last time
last_reported_size_bytes: usize,
Expand Down Expand Up @@ -73,6 +74,8 @@ impl<K: Hash + Eq + EstimateSize, V: EstimateSize, S: BuildHasher, A: Clone + Al
]);
memory_usage_metrics.set(0.into());

let memory_evicted_metrics = metrics_info.metrics.stream_memory_evicted.clone();

let lru_evicted_watermark_time_ms = metrics_info
.metrics
.lru_evicted_watermark_time_ms
Expand All @@ -87,6 +90,7 @@ impl<K: Hash + Eq + EstimateSize, V: EstimateSize, S: BuildHasher, A: Clone + Al
watermark_epoch,
kv_heap_size: 0,
memory_usage_metrics,
memory_evicted_metrics,
lru_evicted_watermark_time_ms,
metrics_info,
last_reported_size_bytes: 0,
Expand All @@ -106,8 +110,32 @@ impl<K: Hash + Eq + EstimateSize, V: EstimateSize, S: BuildHasher, A: Clone + Al

/// Evict epochs lower than the watermark
fn evict_by_epoch(&mut self, epoch: u64) {
while let Some((key, value)) = self.inner.pop_lru_by_epoch(epoch) {
self.kv_heap_size_dec(key.estimated_size() + value.estimated_size());
let report = |lru: &Self, epoch: u64, evicted: usize| {
lru.memory_evicted_metrics
.with_guarded_label_values(&[
&lru.metrics_info.table_id,
&lru.metrics_info.actor_id,
&epoch.to_string(),
])
.set(evicted as _);
};

let mut last_epoch = 0; // real epoch must be greater than 0
let mut evicted = 0;

while let Some((key, value, e)) = self.inner.pop_lru_by_epoch(epoch) {
let charge = key.estimated_size() + value.estimated_size();
self.kv_heap_size_dec(charge);
// The popped epoch must be monotonically decreasing.
if e != last_epoch {
report(self, last_epoch, evicted);
last_epoch = e;
evicted = 0;
}
evicted += charge;
}
if evicted > 0 {
report(self, last_epoch, evicted);
}
self.report_evicted_watermark_time(epoch);
}
Expand Down
10 changes: 10 additions & 0 deletions src/stream/src/executor/monitor/streaming_stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ pub struct StreamingMetrics {

// Memory
pub stream_memory_usage: LabelGuardedIntGaugeVec<3>,
pub stream_memory_evicted: LabelGuardedIntGaugeVec<3>,
}

pub static GLOBAL_STREAMING_METRICS: OnceLock<StreamingMetrics> = OnceLock::new();
Expand Down Expand Up @@ -998,6 +999,14 @@ impl StreamingMetrics {
)
.unwrap();

let stream_memory_evicted = register_guarded_int_gauge_vec_with_registry!(
"stream_memory_evicted",
"Memory evicted for stream executors",
&["table_id", "actor_id", "epoch"],
registry
)
.unwrap();

let iceberg_write_qps = register_guarded_int_counter_vec_with_registry!(
"iceberg_write_qps",
"The qps of iceberg writer",
Expand Down Expand Up @@ -1139,6 +1148,7 @@ impl StreamingMetrics {
materialize_cache_hit_count,
materialize_cache_total_count,
stream_memory_usage,
stream_memory_evicted,
}
}

Expand Down
Loading