Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(metrics): clean & restructure some metrics #13195

Merged
merged 3 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

107 changes: 10 additions & 97 deletions grafana/risingwave-dev-dashboard.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -971,36 +971,6 @@ def section_streaming_actors(outer_panels):
),
],
),
panels.timeseries_actor_latency(
"Actor Barrier Latency",
"",
[
panels.target(
f"rate({metric('stream_actor_barrier_time')}[$__rate_interval]) > 0",
"{{actor_id}}",
),
],
),
panels.timeseries_actor_latency(
"Actor Processing Time",
"",
[
panels.target(
f"rate({metric('stream_actor_processing_time')}[$__rate_interval]) > 0",
"{{actor_id}}",
),
],
),
panels.timeseries_actor_latency(
"Actor Execution Time",
"",
[
panels.target(
f"rate({metric('stream_actor_actor_execution_time')}[$__rate_interval]) > 0",
"{{actor_id}}",
),
],
),
panels.timeseries_row(
"Actor Input Row",
"",
Expand All @@ -1021,25 +991,6 @@ def section_streaming_actors(outer_panels):
),
],
),
panels.timeseries_actor_ops(
"Join Executor Cache",
"",
[
panels.target(
f"rate({metric('stream_join_lookup_miss_count')}[$__rate_interval])",
"cache miss - {{side}} side, join_table_id {{join_table_id}} degree_table_id {{degree_table_id}} actor {{actor_id}} ",
),
panels.target(
f"rate({metric('stream_join_lookup_total_count')}[$__rate_interval])",
"total lookups {{side}} side, join_table_id {{join_table_id}} degree_table_id {{degree_table_id}} actor {{actor_id}}",
),
panels.target(
f"rate({metric('stream_join_insert_cache_miss_count')}[$__rate_interval])",
"cache miss when insert {{side}} side, join_table_id {{join_table_id}} degree_table_id {{degree_table_id}} actor {{actor_id}}",
),
],
),

panels.timeseries_actor_ops(
"Temporal Join Executor Cache",
"",
Expand Down Expand Up @@ -1183,54 +1134,6 @@ def section_streaming_actors(outer_panels):
),
],
),
panels.timeseries_actor_ops(
"Aggregation Executor Cache Statistics For Each Key/State",
"Lookup miss count counts the number of aggregation key's cache miss per second."
"Lookup total count counts the number of rows processed per second."
"By diving these two metrics, one can derive the cache miss rate per second.",
[
panels.target(
f"rate({metric('stream_agg_lookup_miss_count')}[$__rate_interval])",
"stream agg cache miss - table {{table_id}} actor {{actor_id}}",
),
panels.target(
f"rate({metric('stream_agg_lookup_total_count')}[$__rate_interval])",
"stream agg total lookups - table {{table_id}} actor {{actor_id}}",
),
panels.target(
f"rate({metric('stream_agg_distinct_cache_miss_count')}[$__rate_interval])",
"distinct agg cache miss - table {{table_id}} actor {{actor_id}}",
),
panels.target(
f"rate({metric('stream_agg_distinct_total_cache_count')}[$__rate_interval])",
"distinct agg total lookups - table {{table_id}} actor {{actor_id}}",
),
panels.target(
f"rate({metric('stream_group_top_n_cache_miss_count')}[$__rate_interval])",
"group top n cache miss - table {{table_id}} actor {{actor_id}}",
),
panels.target(
f"rate({metric('stream_group_top_n_total_query_cache_count')}[$__rate_interval])",
"group top n total lookups - table {{table_id}} actor {{actor_id}}",
),
panels.target(
f"rate({metric('stream_group_top_n_appendonly_cache_miss_count')}[$__rate_interval])",
"group top n appendonly cache miss - table {{table_id}} actor {{actor_id}}",
),
panels.target(
f"rate({metric('stream_group_top_n_appendonly_total_query_cache_count')}[$__rate_interval])",
"group top n appendonly total lookups - table {{table_id}} actor {{actor_id}}",
),
panels.target(
f"rate({metric('stream_lookup_cache_miss_count')}[$__rate_interval])",
"lookup executor cache miss - table {{table_id}} actor {{actor_id}}",
),
panels.target(
f"rate({metric('stream_lookup_total_query_cache_count')}[$__rate_interval])",
"lookup executor total lookups - table {{table_id}} actor {{actor_id}}",
),
],
),
panels.timeseries_actor_ops(
"Aggregation Executor Cache Statistics For Each StreamChunk",
"",
Expand Down Expand Up @@ -1318,6 +1221,16 @@ def section_streaming_actors_tokio(outer_panels):
outer_panels.row_collapsed(
"Streaming Actors (Tokio)",
[
panels.timeseries_actor_latency(
"Actor Execution Time",
"",
[
panels.target(
f"rate({metric('stream_actor_actor_execution_time')}[$__rate_interval]) > 0",
"{{actor_id}}",
),
],
),
panels.timeseries_actor_latency_small(
"Tokio: Actor Fast Poll Time",
"",
Expand Down
2 changes: 1 addition & 1 deletion grafana/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions src/stream/src/executor/monitor/streaming_stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ pub struct StreamingMetrics {

// Streaming actor metrics from tokio (disabled by default)
pub actor_execution_time: GenericGaugeVec<AtomicF64>,
pub actor_output_buffer_blocking_duration_ns: LabelGuardedIntCounterVec<3>,
pub actor_input_buffer_blocking_duration_ns: LabelGuardedIntCounterVec<3>,
pub actor_scheduled_duration: GenericGaugeVec<AtomicF64>,
pub actor_scheduled_cnt: GenericGaugeVec<AtomicI64>,
pub actor_fast_poll_duration: GenericGaugeVec<AtomicF64>,
Expand Down Expand Up @@ -72,6 +70,10 @@ pub struct StreamingMetrics {
// Exchange (see also `compute::ExchangeServiceMetrics`)
pub exchange_frag_recv_size: GenericCounterVec<AtomicU64>,

// Backpressure
pub actor_output_buffer_blocking_duration_ns: LabelGuardedIntCounterVec<3>,
pub actor_input_buffer_blocking_duration_ns: LabelGuardedIntCounterVec<3>,

// Streaming Join
pub join_lookup_miss_count: LabelGuardedIntCounterVec<5>,
pub join_lookup_total_count: LabelGuardedIntCounterVec<5>,
Expand Down Expand Up @@ -903,8 +905,6 @@ impl StreamingMetrics {
level,
executor_row_count,
actor_execution_time,
actor_output_buffer_blocking_duration_ns,
actor_input_buffer_blocking_duration_ns,
actor_scheduled_duration,
actor_scheduled_cnt,
actor_fast_poll_duration,
Expand All @@ -924,6 +924,8 @@ impl StreamingMetrics {
sink_input_row_count,
mview_input_row_count,
exchange_frag_recv_size,
actor_output_buffer_blocking_duration_ns,
actor_input_buffer_blocking_duration_ns,
join_lookup_miss_count,
join_lookup_total_count,
join_insert_cache_miss_count,
Expand Down
Loading