From 5d0c385d77e041b5c1a711a57e9c2e274ed40fcc Mon Sep 17 00:00:00 2001
From: yangxiao13
Date: Sun, 20 Aug 2023 13:58:04 +0800
Subject: [PATCH] feat: add maintenance mode (#1586)

---
 src/common/meta/src/key.rs                    | 14 +++-
 src/meta-srv/src/error.rs                     |  8 ++
 src/meta-srv/src/procedure/region_failover.rs | 10 ++-
 src/meta-srv/src/service/admin.rs             |  8 ++
 src/meta-srv/src/service/admin/maintenance.rs | 82 +++++++++++++++++++
 5 files changed, 119 insertions(+), 3 deletions(-)
 create mode 100644 src/meta-srv/src/service/admin/maintenance.rs

diff --git a/src/common/meta/src/key.rs b/src/common/meta/src/key.rs
index 6326ced7a607..3c8851dacfdd 100644
--- a/src/common/meta/src/key.rs
+++ b/src/common/meta/src/key.rs
@@ -114,6 +114,10 @@ pub fn to_removed_key(key: &str) -> String {
     format!("{REMOVED_PREFIX}-{key}")
 }
 
+pub fn to_maintenance_key(key: &str) -> String {
+    format!("__maintenance-{key}")
+}
+
 pub trait TableMetaKey {
     fn as_raw_key(&self) -> Vec<u8>;
 }
@@ -205,12 +209,18 @@ impl_table_meta_value! {
 
 #[cfg(test)]
 mod tests {
-    use crate::key::to_removed_key;
-
+    use crate::key::{to_maintenance_key, to_removed_key};
     #[test]
     fn test_to_removed_key() {
         let key = "test_key";
         let removed = "__removed-test_key";
         assert_eq!(removed, to_removed_key(key));
     }
+
+    #[test]
+    fn test_to_maintenance_key() {
+        let key = "test_key";
+        let maintenance = "__maintenance-test_key";
+        assert_eq!(maintenance, to_maintenance_key(key));
+    }
 }
diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs
index 41b41d96d156..ef49d1838750 100644
--- a/src/meta-srv/src/error.rs
+++ b/src/meta-srv/src/error.rs
@@ -194,6 +194,13 @@ pub enum Error {
         location: Location,
     },
 
+    #[snafu(display("Failed to parse bool: {}, source: {}", err_msg, source))]
+    ParseBool {
+        err_msg: String,
+        source: std::str::ParseBoolError,
+        location: Location,
+    },
+
     #[snafu(display("Invalid arguments: {}", err_msg))]
     InvalidArguments { err_msg: String, location: Location },
 
@@ -526,6 +533,7 @@ impl ErrorExt for Error {
             | Error::InvalidLeaseKey { .. }
             | Error::InvalidStatKey { .. }
             | Error::ParseNum { .. }
+            | Error::ParseBool { .. }
             | Error::UnsupportedSelectorType { .. }
             | Error::InvalidArguments { .. }
             | Error::InvalidHeartbeatRequest { .. }
diff --git a/src/meta-srv/src/procedure/region_failover.rs b/src/meta-srv/src/procedure/region_failover.rs
index 06b0049460ee..a147166087f9 100644
--- a/src/meta-srv/src/procedure/region_failover.rs
+++ b/src/meta-srv/src/procedure/region_failover.rs
@@ -26,7 +26,7 @@ use std::time::Duration;
 
 use async_trait::async_trait;
 use common_meta::ident::TableIdent;
-use common_meta::key::TableMetadataManagerRef;
+use common_meta::key::{to_maintenance_key, TableMetadataManagerRef};
 use common_meta::{ClusterId, RegionIdent};
 use common_procedure::error::{
     Error as ProcedureError, FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu,
@@ -155,6 +155,14 @@ impl RegionFailoverManager {
     }
 
     pub(crate) async fn do_region_failover(&self, failed_region: &RegionIdent) -> Result<()> {
+        let key = to_maintenance_key(
+            format!("{}-{}", failed_region.cluster_id, failed_region.datanode_id).as_str(),
+        )
+        .into_bytes();
+        if self.in_memory.exists(key.as_slice()).await? {
+            return Ok(());
+        }
+
         let Some(guard) = self.insert_running_procedures(failed_region) else {
             warn!("Region failover procedure for region {failed_region} is already running!");
             return Ok(());
diff --git a/src/meta-srv/src/service/admin.rs b/src/meta-srv/src/service/admin.rs
index 0aae00b26064..b8b779b21487 100644
--- a/src/meta-srv/src/service/admin.rs
+++ b/src/meta-srv/src/service/admin.rs
@@ -15,6 +15,7 @@
 mod health;
 mod heartbeat;
 mod leader;
+mod maintenance;
 mod meta;
 mod node_lease;
 mod route;
@@ -89,6 +90,13 @@ pub fn make_admin_service(meta_srv: MetaSrv) -> Admin {
         },
     );
 
+    let router = router.route(
+        "set-maintenance",
+        maintenance::MaintenanceHandler {
+            store: meta_srv.in_memory().clone(),
+        },
+    );
+
     let router = Router::nest("/admin", router);
 
     Admin::new(router)
diff --git a/src/meta-srv/src/service/admin/maintenance.rs b/src/meta-srv/src/service/admin/maintenance.rs
new file mode 100644
index 000000000000..4692e28d7a78
--- /dev/null
+++ b/src/meta-srv/src/service/admin/maintenance.rs
@@ -0,0 +1,82 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+
+use common_meta::key::to_maintenance_key;
+use common_meta::rpc::store::PutRequest;
+use snafu::{OptionExt, ResultExt};
+use tonic::codegen::http;
+
+use crate::error::{self, Result};
+use crate::service::admin::HttpHandler;
+use crate::service::store::kv::ResettableKvStoreRef;
+pub struct MaintenanceHandler {
+    pub store: ResettableKvStoreRef,
+}
+
+#[async_trait::async_trait]
+impl HttpHandler for MaintenanceHandler {
+    async fn handle(
+        &self,
+        _: &str,
+        params: &HashMap<String, String>,
+    ) -> Result<http::Response<String>> {
+        let cluster_id = params
+            .get("cluster_id")
+            .map(|id| id.parse::<u64>())
+            .context(error::MissingRequiredParameterSnafu {
+                param: "cluster_id",
+            })?
+            .context(error::ParseNumSnafu {
+                err_msg: "`cluster_id` is not a valid number",
+            })?;
+
+        let node_id = params
+            .get("node_id")
+            .map(|id| id.parse::<u64>())
+            .context(error::MissingRequiredParameterSnafu { param: "node_id" })?
+            .context(error::ParseNumSnafu {
+                err_msg: "`node_id` is not a valid number",
+            })?;
+
+        let switch_on = params
+            .get("switch_on")
+            .map(|on| on.parse::<bool>())
+            .context(error::MissingRequiredParameterSnafu { param: "switch_on" })?
+            .context(error::ParseBoolSnafu {
+                err_msg: "`switch_on` is not a valid bool",
+            })?;
+
+        let req = PutRequest {
+            key: to_maintenance_key(format!("{}-{}", cluster_id, node_id).as_str()).into_bytes(),
+            value: vec![],
+            prev_kv: false,
+        };
+
+        if switch_on {
+            self.store.put(req).await?;
+        } else {
+            self.store.delete(req.key.as_slice(), false).await?;
+        }
+
+        http::Response::builder()
+            .status(http::StatusCode::OK)
+            .body(format!(
+                "Datanode {}-{} maintenance mode is switched {}",
+                cluster_id, node_id, if switch_on { "on" } else { "off" }
+            ))
+            .context(error::InvalidHttpBodySnafu)
+    }
+}