Skip to content

Commit

Permalink
feat: Call out the "RESOURCE_EXHAUSTED" error metric (#808)
Browse files Browse the repository at this point in the history
Introduces notification.bridge.error.resource_exhausted

Closes: SYNC-4551
  • Loading branch information
jrconlin authored Dec 17, 2024
1 parent f876304 commit ebc5704
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 71 deletions.
4 changes: 1 addition & 3 deletions .cargo/audit.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
[advisories]
ignore = [
"RUSTSEC-2021-0124", # Bound by tokio restrictions, cargo, reqwest, hyper...
]
ignore = []
5 changes: 1 addition & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ tokio-core = "0.1"
tokio-io = "0.1"
tokio-openssl = "0.6"
uuid = { version = "1.1", features = ["serde", "v4"] }
url = "2.2"
url = "2.5"

autoconnect = { path = "./autoconnect" }
autoconnect_common = { path = "./autoconnect/autoconnect-common" }
Expand Down
41 changes: 19 additions & 22 deletions autoendpoint/src/routers/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ use cadence::{Counted, CountedExt, StatsdClient, Timed};
use std::collections::HashMap;
use uuid::Uuid;

use super::fcm::error::FcmError;

/// Convert a notification into a WebPush message
pub fn build_message_data(notification: &Notification) -> ApiResult<HashMap<&'static str, String>> {
let mut message_data = HashMap::new();
Expand Down Expand Up @@ -44,6 +46,12 @@ pub fn message_size_check(data: &[u8], max_data: usize) -> Result<(), RouterErro
}

/// Handle a bridge error by logging, updating metrics, etc
/// This function uses the standard `slog` recording mechanisms and
/// optionally calls a generic metric recording function for some
/// types of errors. The error is returned by this function for later
/// processing. This can include being called by the sentry middleware,
/// which uses the `RecordableError` trait to optionally record metrics.
/// see [autopush_common::middleware::sentry::SentryWrapperMiddleware].`call()` method
pub async fn handle_error(
error: RouterError,
metrics: &StatsdClient,
Expand All @@ -65,17 +73,6 @@ pub async fn handle_error(
error.errno(),
);
}
RouterError::GCMAuthentication => {
warn!("GCM Authentication error");
incr_error_metric(
metrics,
platform,
app_id,
"gcm authentication",
error.status(),
error.errno(),
);
}
RouterError::RequestTimeout => {
// Bridge timeouts are common.
info!("Bridge timeout");
Expand Down Expand Up @@ -114,17 +111,6 @@ pub async fn handle_error(
warn!("Error while removing user due to bridge not_found: {}", e);
}
}
RouterError::Upstream { .. } => {
warn!("{}", error.to_string());
incr_error_metric(
metrics,
platform,
app_id,
"server_error",
error.status(),
error.errno(),
);
}
RouterError::TooMuchData(_) => {
// Do not log this error since it's fairly common.
incr_error_metric(
Expand All @@ -136,6 +122,17 @@ pub async fn handle_error(
error.errno(),
);
}
RouterError::Fcm(FcmError::Upstream {
error_code: status, ..
}) => incr_error_metric(
metrics,
platform,
app_id,
&format!("upstream_{}", status),
error.status(),
error.errno(),
),

_ => {
warn!("Unknown error while sending bridge request: {}", error);
incr_error_metric(
Expand Down
36 changes: 26 additions & 10 deletions autoendpoint/src/routers/fcm/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,15 +153,28 @@ impl FcmClient {
(StatusCode::NOT_FOUND, _) => RouterError::NotFound,
(_, Some(error)) => {
info!("🌉Bridge Error: {:?}, {:?}", error.message, &self.endpoint);
RouterError::Upstream {
status: error.status,
FcmError::Upstream {
error_code: error.status, // Note: this is the FCM error status enum value
message: error.message,
}
.into()
}
(status, None) => RouterError::Upstream {
status: status.to_string(),
message: "Unknown reason".to_string(),
},
// In this case, we've gotten an error, but FCM hasn't returned a body.
// (This may happen in the case where FCM terminates the connection abruptly
// or a similar event.) Treat that as an INTERNAL error.
(_, None) => {
warn!(
"🌉Unknown Bridge Error: {:?}, <{:?}>, [{:?}]",
status.to_string(),
&self.endpoint,
raw_data,
);
FcmError::Upstream {
error_code: "UNKNOWN".to_string(),
message: format!("Unknown reason: {:?}", status.to_string()),
}
}
.into(),
});
}

Expand All @@ -174,15 +187,18 @@ struct FcmResponse {
error: Option<FcmErrorResponse>,
}

/// Response message from FCM in the case of an error.
#[derive(Deserialize)]
struct FcmErrorResponse {
/// The ErrorCode enum as string from https://firebase.google.com/docs/reference/fcm/rest/v1/ErrorCode
status: String,
message: String,
}

#[cfg(test)]
pub mod tests {
use crate::routers::fcm::client::FcmClient;
use crate::routers::fcm::error::FcmError;
use crate::routers::fcm::settings::{FcmServerCredential, FcmSettings};
use crate::routers::RouterError;
use std::collections::HashMap;
Expand Down Expand Up @@ -368,8 +384,8 @@ pub mod tests {
assert!(
matches!(
result.as_ref().unwrap_err(),
RouterError::Upstream { status, message }
if status == "TEST_ERROR" && message == "test-message"
RouterError::Fcm(FcmError::Upstream{ error_code, message })
if error_code == "TEST_ERROR" && message == "test-message"
),
"result = {result:?}"
);
Expand Down Expand Up @@ -403,8 +419,8 @@ pub mod tests {
assert!(
matches!(
result.as_ref().unwrap_err(),
RouterError::Upstream { status, message }
if status == "400 Bad Request" && message == "Unknown reason"
RouterError::Fcm(FcmError::Upstream { error_code, message })
if error_code == "UNKNOWN" && message.starts_with("Unknown reason")
),
"result = {result:?}"
);
Expand Down
26 changes: 14 additions & 12 deletions autoendpoint/src/routers/fcm/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ pub enum FcmError {

#[error("User has invalid app ID {0}")]
InvalidAppId(String),

#[error("Upstream error, {error_code}: {message}")]
Upstream { error_code: String, message: String },
}

impl FcmError {
Expand All @@ -53,7 +56,8 @@ impl FcmError {

FcmError::DeserializeResponse(_)
| FcmError::EmptyResponse(_)
| FcmError::InvalidResponse(_, _, _) => StatusCode::BAD_GATEWAY,
| FcmError::InvalidResponse(_, _, _)
| FcmError::Upstream { .. } => StatusCode::BAD_GATEWAY,
}
}

Expand All @@ -64,13 +68,7 @@ impl FcmError {
Some(106)
}

FcmError::CredentialDecode(_)
| FcmError::OAuthClientBuild(_)
| FcmError::OAuthToken(_)
| FcmError::DeserializeResponse(_)
| FcmError::EmptyResponse(_)
| FcmError::InvalidResponse(_, _, _)
| FcmError::NoOAuthToken => None,
_ => None,
}
}
}
Expand All @@ -88,24 +86,28 @@ impl ReportableError for FcmError {

fn metric_label(&self) -> Option<&'static str> {
match &self {
FcmError::InvalidAppId(_) | FcmError::NoAppId => {
Some("notification.bridge.error.fcm.badappid")
}
FcmError::InvalidAppId(_) | FcmError::NoAppId => Some("notification.bridge.error"),
_ => None,
}
}

fn extras(&self) -> Vec<(&str, String)> {
match self {
FcmError::InvalidAppId(appid) => {
vec![("app_id", appid.to_string())]
vec![
("status", "bad_appid".to_owned()),
("app_id", appid.to_string()),
]
}
FcmError::EmptyResponse(status) => {
vec![("status", status.to_string())]
}
FcmError::InvalidResponse(_, body, status) => {
vec![("status", status.to_string()), ("body", body.to_owned())]
}
FcmError::Upstream { error_code, .. } => {
vec![("status", error_code.clone())]
}
_ => vec![],
}
}
Expand Down
24 changes: 5 additions & 19 deletions autoendpoint/src/routers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,6 @@ pub enum RouterError {
#[error("Bridge authentication error")]
Authentication,

#[error("GCM Bridge authentication error")]
GCMAuthentication,

#[error("Bridge request timeout")]
RequestTimeout,

Expand All @@ -113,9 +110,6 @@ pub enum RouterError {

#[error("Bridge reports user was not found")]
NotFound,

#[error("Bridge error, {status}: {message}")]
Upstream { status: String, message: String },
}

impl RouterError {
Expand All @@ -133,11 +127,9 @@ impl RouterError {

RouterError::TooMuchData(_) => StatusCode::PAYLOAD_TOO_LARGE,

RouterError::Authentication
| RouterError::GCMAuthentication
| RouterError::RequestTimeout
| RouterError::Connect(_)
| RouterError::Upstream { .. } => StatusCode::BAD_GATEWAY,
RouterError::Authentication | RouterError::RequestTimeout | RouterError::Connect(_) => {
StatusCode::BAD_GATEWAY
}
}
}

Expand All @@ -163,10 +155,6 @@ impl RouterError {
RouterError::Connect(_) => Some(902),

RouterError::RequestTimeout => Some(903),

RouterError::GCMAuthentication => Some(904),

RouterError::Upstream { .. } => None,
}
}
}
Expand All @@ -188,12 +176,10 @@ impl ReportableError for RouterError {
RouterError::Fcm(e) => e.is_sentry_event(),
// common handle_error emits metrics for these
RouterError::Authentication
| RouterError::GCMAuthentication
| RouterError::Connect(_)
| RouterError::NotFound
| RouterError::RequestTimeout
| RouterError::TooMuchData(_)
| RouterError::Upstream { .. } => false,
| RouterError::TooMuchData(_) => false,
RouterError::SaveDb(e, _) => e.is_sentry_event(),
_ => true,
}
Expand All @@ -202,7 +188,7 @@ impl ReportableError for RouterError {
fn metric_label(&self) -> Option<&'static str> {
// NOTE: Some metrics are emitted for other Errors via handle_error
// callbacks, whereas some are emitted via this method. These 2 should
// be consoliated: https://mozilla-hub.atlassian.net/browse/SYNC-3695
// be consolidated: https://mozilla-hub.atlassian.net/browse/SYNC-3695
match self {
RouterError::Apns(e) => e.metric_label(),
RouterError::Fcm(e) => e.metric_label(),
Expand Down

0 comments on commit ebc5704

Please sign in to comment.