fix: fix s3 streaming upload metrics #16929

Merged · 5 commits · May 25, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-user-dashboard.json

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions grafana/risingwave-dev-dashboard.dashboard.py
@@ -532,8 +532,8 @@ def section_object_storage(outer_panels):
     operation_duration_blacklist = "type!~'streaming_upload_write_bytes|streaming_read'"
     write_op_filter = "type=~'upload|delete'"
     read_op_filter = "type=~'read|readv|list|metadata'"
-    request_cost_op1 = "type=~'read|streaming_read_start|delete'"
-    request_cost_op2 = "type=~'upload|streaming_upload_start|s3_upload_part|streaming_upload_finish|delete_objects|list'"
+    s3_request_cost_op1 = "type=~'read|streaming_read_start|streaming_read_init'"
+    s3_request_cost_op2 = "type=~'upload|streaming_upload|streaming_upload_start|s3_upload_part|streaming_upload_finish|list'"
     return [
         outer_panels.row_collapsed(
             "Object Storage",
@@ -641,11 +641,11 @@ def section_object_storage(outer_panels):
                         True,
                     ),
                     panels.target(
-                        f"sum({metric('object_store_operation_latency_count', request_cost_op1)}) * 0.0004 / 1000",
+                        f"sum({metric('object_store_operation_latency_count', s3_request_cost_op1)}) * 0.0004 / 1000",
                         "GET, SELECT, and all other Requests Cost",
                     ),
                     panels.target(
-                        f"sum({metric('object_store_operation_latency_count', request_cost_op2)}) * 0.005 / 1000",
+                        f"sum({metric('object_store_operation_latency_count', s3_request_cost_op2)}) * 0.005 / 1000",
                         "PUT, COPY, POST, LIST Requests Cost",
                     ),
                 ],
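For context on the constants in these panel targets: they encode S3's published list prices, roughly $0.0004 per 1,000 GET/SELECT-class requests and $0.005 per 1,000 PUT/COPY/POST/LIST requests, so each expression multiplies a cumulative request count by the per-request price. A minimal sketch of the same arithmetic, with an illustrative function name and prices that vary by region:

```rust
/// Estimated S3 request cost in dollars. The constants mirror the list
/// prices the dashboard assumes ($0.0004 and $0.005 per 1,000 requests);
/// check current AWS pricing for your region before relying on them.
fn estimated_request_cost(get_class_requests: u64, put_class_requests: u64) -> f64 {
    const GET_PRICE_PER_1K: f64 = 0.0004;
    const PUT_PRICE_PER_1K: f64 = 0.005;
    get_class_requests as f64 * GET_PRICE_PER_1K / 1000.0
        + put_class_requests as f64 * PUT_PRICE_PER_1K / 1000.0
}
```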
2 changes: 1 addition & 1 deletion grafana/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion grafana/risingwave-user-dashboard.json

Large diffs are not rendered by default.

58 changes: 38 additions & 20 deletions src/object_store/src/object/mod.rs
@@ -382,23 +382,32 @@ impl<U: StreamingUploader> MonitoredStreamingUploader<U> {
     }
 }
 
+/// NOTICE: after #16231, streaming uploader implemented via aws-sdk-s3 will maintain metrics internally in s3.rs
+/// so MonitoredStreamingUploader will only be used when the inner object store is opendal.
 impl<U: StreamingUploader> MonitoredStreamingUploader<U> {
     async fn write_bytes(&mut self, data: Bytes) -> ObjectResult<()> {
         let operation_type = OperationType::StreamingUpload;
         let operation_type_str = operation_type.as_str();
         let data_len = data.len();
 
-        let _timer = self
-            .object_store_metrics
-            .operation_latency
-            .with_label_values(&[self.media_type, operation_type_str])
-            .start_timer();
+        let res = if self.media_type == "s3" {
+            // TODO: we should avoid this special case after fully migrating to opendal for s3.
+            self.inner
+                .write_bytes(data)
+                .verbose_instrument_await(operation_type_str)
+                .await
+        } else {
+            let _timer = self
+                .object_store_metrics
+                .operation_latency
+                .with_label_values(&[self.media_type, operation_type_str])
+                .start_timer();
 
-        let res = self
-            .inner
-            .write_bytes(data)
-            .verbose_instrument_await(operation_type_str)
-            .await;
+            self.inner
+                .write_bytes(data)
+                .verbose_instrument_await(operation_type_str)
+                .await
+        };
 
         try_update_failure_metric(&self.object_store_metrics, &res, operation_type_str);
 
@@ -417,17 +426,25 @@ impl<U: StreamingUploader> MonitoredStreamingUploader<U> {
     async fn finish(self) -> ObjectResult<()> {
         let operation_type = OperationType::StreamingUploadFinish;
         let operation_type_str = operation_type.as_str();
-        let _timer = self
-            .object_store_metrics
-            .operation_latency
-            .with_label_values(&[self.media_type, operation_type_str])
-            .start_timer();
 
-        let res = self
-            .inner
-            .finish()
-            .verbose_instrument_await(operation_type_str)
-            .await;
+        let res = if self.media_type == "s3" {
+            // TODO: we should avoid this special case after fully migrating to opendal for s3.
+            self.inner
+                .finish()
+                .verbose_instrument_await(operation_type_str)
+                .await
+        } else {
+            let _timer = self
+                .object_store_metrics
+                .operation_latency
+                .with_label_values(&[self.media_type, operation_type_str])
+                .start_timer();
+
+            self.inner
+                .finish()
+                .verbose_instrument_await(operation_type_str)
+                .await
+        };
 
         try_update_failure_metric(&self.object_store_metrics, &res, operation_type_str);
         self.object_store_metrics
@@ -620,6 +637,7 @@ impl<OS: ObjectStore> MonitoredObjectStore<OS> {
             .await;
 
         try_update_failure_metric(&self.object_store_metrics, &res, operation_type_str);
+
         Ok(MonitoredStreamingUploader::new(
             media_type,
             res?,
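The shape of this fix is easy to lose in the diff: when the inner uploader already tracks its own metrics (the aws-sdk-s3 path after #16231, which also covers retries inside s3.rs), the wrapper must not start its own latency timer, or every request would be recorded twice. A minimal sketch of that single-counting dispatch, using hypothetical stand-in types rather than the actual MonitoredStreamingUploader:

```rust
// Hypothetical stand-ins for this sketch: a trait flag replaces the real
// code's `media_type == "s3"` check, and a Vec replaces the Prometheus
// histogram.
trait Uploader {
    /// True if the implementation records its own latency metrics.
    fn records_own_metrics(&self) -> bool;
    fn upload(&mut self, data: &[u8]);
}

struct MonitoredUploader<U: Uploader> {
    inner: U,
    samples: Vec<std::time::Duration>, // stand-in for a latency histogram
}

impl<U: Uploader> MonitoredUploader<U> {
    fn upload(&mut self, data: &[u8]) {
        if self.inner.records_own_metrics() {
            // e.g. the aws-sdk-s3 uploader: record nothing here, so each
            // request is counted exactly once.
            self.inner.upload(data);
        } else {
            let start = std::time::Instant::now();
            self.inner.upload(data);
            self.samples.push(start.elapsed());
        }
    }
}
```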
46 changes: 33 additions & 13 deletions src/object_store/src/object/s3.rs
@@ -21,6 +21,7 @@ use std::sync::Arc;
 use std::task::{ready, Context, Poll};
 use std::time::Duration;
 
+use await_tree::InstrumentAwait;
 use aws_sdk_s3::config::{Credentials, Region};
 use aws_sdk_s3::error::BoxError;
 use aws_sdk_s3::operation::abort_multipart_upload::AbortMultipartUploadError;
@@ -311,7 +312,9 @@ impl StreamingUploader for S3StreamingUploader {
         self.buf.push(data);
 
         if self.not_uploaded_len >= self.part_size {
-            self.upload_next_part().await?;
+            self.upload_next_part()
+                .verbose_instrument_await("s3_upload_next_part")
+                .await?;
             self.not_uploaded_len = 0;
         }
         Ok(())
@@ -331,20 +334,37 @@
                 debug_assert_eq!(self.not_uploaded_len, 0);
                 Err(ObjectError::internal("upload empty object"))
             } else {
-                self.client
-                    .put_object()
-                    .bucket(&self.bucket)
-                    .body(get_upload_body(self.buf))
-                    .content_length(self.not_uploaded_len as i64)
-                    .key(&self.key)
-                    .send()
-                    .await
-                    .map_err(|err| {
-                        set_error_should_retry::<PutObjectError>(self.config.clone(), err.into())
-                    })?;
+                let operation_type = OperationType::Upload;
+                let builder = || async {
+                    self.client
+                        .put_object()
+                        .bucket(&self.bucket)
+                        .body(get_upload_body(self.buf.clone()))
+                        .content_length(self.not_uploaded_len as i64)
+                        .key(&self.key)
+                        .send()
+                        .verbose_instrument_await("s3_put_object")
+                        .await
+                        .map_err(|err| {
+                            set_error_should_retry::<PutObjectError>(
+                                self.config.clone(),
+                                err.into(),
+                            )
+                        })
+                };
+
+                let res =
+                    retry_request(builder, &self.config, operation_type, self.metrics.clone())
+                        .await;
+                try_update_failure_metric(&self.metrics, &res, operation_type.as_str());
+                res?;
                 Ok(())
             }
-        } else if let Err(e) = self.flush_multipart_and_complete().await {
+        } else if let Err(e) = self
+            .flush_multipart_and_complete()
+            .verbose_instrument_await("s3_flush_multipart_and_complete")
+            .await
+        {
             tracing::warn!(key = self.key, error = %e.as_report(), "Failed to upload object");
             self.abort_multipart_upload().await?;
             Err(e)
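The switch from awaiting put_object directly to handing a builder closure to retry_request is what makes the small-object upload retryable: a future can only be awaited once, so each attempt must construct a fresh request, which is also why the body is now built from self.buf.clone(). A minimal sketch of this retry-by-factory shape, with hypothetical names (retry_with, max_attempts) in place of the crate's actual retry_request and its config-driven backoff:

```rust
use std::future::Future;

/// Hypothetical helper for this sketch: retry an operation by rebuilding its
/// future for every attempt. The real retry_request also applies backoff,
/// retry-condition checks, and retry-count metrics.
async fn retry_with<B, F, T, E>(mut build: B, max_attempts: usize) -> Result<T, E>
where
    B: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
{
    let mut attempts = 0;
    loop {
        // Each retry calls the factory again, producing a fresh future;
        // a single future could not be awaited a second time.
        match build().await {
            Ok(v) => return Ok(v),
            Err(e) => {
                attempts += 1;
                if attempts >= max_attempts {
                    return Err(e);
                }
            }
        }
    }
}
```

Mirroring the diff, each attempt's request body would be rebuilt from cloned input inside the factory, so no attempt consumes the buffer that later attempts still need.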
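Separately, the new verbose_instrument_await calls label the S3 await points ("s3_upload_next_part", "s3_put_object", "s3_flush_multipart_and_complete") so that a stuck upload shows up by name in an await-tree dump. As a rough illustration of what such instrumentation does, here is a conceptual future wrapper (not the await-tree API) that remembers a label while the inner future is pending:

```rust
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};

/// Conceptual sketch only: a labeled future. The real await-tree crate
/// records the label in a per-task span tree instead of printing it.
struct Instrumented<F> {
    label: &'static str,
    inner: F,
}

impl<F: Future> Future for Instrumented<F> {
    type Output = F::Output;

    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
        // SAFETY: `inner` is structurally pinned; we never move it out.
        let this = unsafe { self.get_unchecked_mut() };
        let inner = unsafe { Pin::new_unchecked(&mut this.inner) };
        match inner.poll(cx) {
            Poll::Pending => {
                // A real implementation records `label` in the task's span
                // tree here, so dumps show which await point is blocked.
                eprintln!("pending at await point: {}", this.label);
                Poll::Pending
            }
            ready => ready,
        }
    }
}
```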