-
Notifications
You must be signed in to change notification settings - Fork 332
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add maintenance mode (#1586) #2211
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -106,6 +106,9 @@ impl HeartbeatHandler for RegionFailureHandler { | |
|
||
#[cfg(test)] | ||
mod tests { | ||
|
||
use common_meta::key::MAINTENANCE_KEY; | ||
|
||
use super::*; | ||
use crate::handler::node_stat::{RegionStat, Stat}; | ||
use crate::metasrv::builder::MetaSrvBuilder; | ||
|
@@ -155,4 +158,63 @@ mod tests { | |
let dump = handler.failure_detect_runner.dump().await; | ||
assert_eq!(dump.iter().collect::<Vec<_>>().len(), 0); | ||
} | ||
|
||
#[tokio::test(flavor = "multi_thread")] | ||
async fn test_maintenance_mode() { | ||
let region_failover_manager = create_region_failover_manager(); | ||
let in_memory = region_failover_manager.create_context().in_memory.clone(); | ||
let handler = RegionFailureHandler::try_new(None, region_failover_manager.clone()) | ||
.await | ||
.unwrap(); | ||
|
||
let req = &HeartbeatRequest::default(); | ||
|
||
let builder = MetaSrvBuilder::new(); | ||
let metasrv = builder.build().await.unwrap(); | ||
let mut ctx = metasrv.new_ctx(); | ||
ctx.is_infancy = false; | ||
|
||
let acc = &mut HeartbeatAccumulator::default(); | ||
fn new_region_stat(region_id: u64) -> RegionStat { | ||
RegionStat { | ||
id: region_id, | ||
rcus: 0, | ||
wcus: 0, | ||
approximate_bytes: 0, | ||
approximate_rows: 0, | ||
} | ||
} | ||
acc.stat = Some(Stat { | ||
cluster_id: 1, | ||
id: 42, | ||
region_stats: vec![new_region_stat(1), new_region_stat(2), new_region_stat(3)], | ||
timestamp_millis: 1000, | ||
..Default::default() | ||
}); | ||
|
||
handler.handle(req, &mut ctx, acc).await.unwrap(); | ||
let dump = handler.failure_detect_runner.dump().await; | ||
assert_eq!(dump.iter().collect::<Vec<_>>().len(), 3); | ||
|
||
let kv_req = common_meta::rpc::store::PutRequest { | ||
key: Vec::from(MAINTENANCE_KEY), | ||
value: vec![], | ||
prev_kv: false, | ||
}; | ||
|
||
let _ = in_memory.put(kv_req.clone()).await.unwrap(); | ||
|
||
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we test this without sleep? @fengjiachun @MichaelScofield There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't think it's needed. |
||
let dump = handler.failure_detect_runner.dump().await; | ||
let maintenance_node = region_failover_manager.is_maintenance_node().await.unwrap(); | ||
assert_eq!(dump.iter().collect::<Vec<_>>().len(), 0); | ||
assert!(maintenance_node); | ||
|
||
let _ = in_memory | ||
.delete(kv_req.key.as_slice(), false) | ||
.await | ||
.unwrap(); | ||
let maintenance_node = region_failover_manager.is_maintenance_node().await.unwrap(); | ||
assert!(!maintenance_node); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -27,7 +27,7 @@ use std::time::Duration; | |||||
use async_trait::async_trait; | ||||||
use common_meta::ident::TableIdent; | ||||||
use common_meta::key::datanode_table::DatanodeTableKey; | ||||||
use common_meta::key::TableMetadataManagerRef; | ||||||
use common_meta::key::{TableMetadataManagerRef, MAINTENANCE_KEY}; | ||||||
use common_meta::{ClusterId, RegionIdent}; | ||||||
use common_procedure::error::{ | ||||||
Error as ProcedureError, FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu, | ||||||
|
@@ -158,6 +158,10 @@ impl RegionFailoverManager { | |||||
} | ||||||
|
||||||
pub(crate) async fn do_region_failover(&self, failed_region: &RegionIdent) -> Result<()> { | ||||||
if self.is_maintenance_node().await? { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should it be
Suggested change
|
||||||
return Ok(()); | ||||||
} | ||||||
|
||||||
let Some(guard) = self.insert_running_procedures(failed_region) else { | ||||||
warn!("Region failover procedure for region {failed_region} is already running!"); | ||||||
return Ok(()); | ||||||
|
@@ -236,6 +240,10 @@ impl RegionFailoverManager { | |||||
}) | ||||||
.unwrap_or_default()) | ||||||
} | ||||||
#[allow(dead_code)] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. question: Why allow dead code?
Suggested change
|
||||||
pub(crate) async fn is_maintenance_node(&self) -> Result<bool> { | ||||||
self.in_memory.exists(MAINTENANCE_KEY).await | ||||||
} | ||||||
} | ||||||
|
||||||
/// A "Node" in the state machine of region failover procedure. | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ mod health; | |
mod heartbeat; | ||
mod inactive_regions; | ||
mod leader; | ||
mod maintenance; | ||
mod meta; | ||
mod node_lease; | ||
mod route; | ||
|
@@ -105,6 +106,14 @@ pub fn make_admin_service(meta_srv: MetaSrv) -> Admin { | |
}, | ||
); | ||
|
||
let router = router.route( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we provide an API to query whether we are in maintenance mode? |
||
"/set-maintenance", | ||
maintenance::MaintenanceHandler { | ||
kv_store: meta_srv.kv_store().clone(), | ||
in_memory: meta_srv.in_memory().clone(), | ||
}, | ||
); | ||
|
||
let router = Router::nest("/admin", router); | ||
|
||
Admin::new(router) | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,64 @@ | ||||||||||||
// Copyright 2023 Greptime Team | ||||||||||||
// | ||||||||||||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||||||||||||
// you may not use this file except in compliance with the License. | ||||||||||||
// You may obtain a copy of the License at | ||||||||||||
// | ||||||||||||
// http://www.apache.org/licenses/LICENSE-2.0 | ||||||||||||
// | ||||||||||||
// Unless required by applicable law or agreed to in writing, software | ||||||||||||
// distributed under the License is distributed on an "AS IS" BASIS, | ||||||||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||||||||
// See the License for the specific language governing permissions and | ||||||||||||
// limitations under the License. | ||||||||||||
|
||||||||||||
use std::collections::HashMap; | ||||||||||||
|
||||||||||||
use common_meta::key::MAINTENANCE_KEY; | ||||||||||||
use common_meta::rpc::store::PutRequest; | ||||||||||||
use snafu::{OptionExt, ResultExt}; | ||||||||||||
use tonic::codegen::http; | ||||||||||||
|
||||||||||||
use crate::error::{self, Result}; | ||||||||||||
use crate::service::admin::HttpHandler; | ||||||||||||
use crate::service::store::kv::{KvStoreRef, ResettableKvStoreRef}; | ||||||||||||
pub struct MaintenanceHandler { | ||||||||||||
Comment on lines
+24
to
+25
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||
pub kv_store: KvStoreRef, | ||||||||||||
pub in_memory: ResettableKvStoreRef, | ||||||||||||
} | ||||||||||||
|
||||||||||||
#[async_trait::async_trait] | ||||||||||||
impl HttpHandler for MaintenanceHandler { | ||||||||||||
async fn handle( | ||||||||||||
&self, | ||||||||||||
_: &str, | ||||||||||||
params: &HashMap<String, String>, | ||||||||||||
) -> Result<http::Response<String>> { | ||||||||||||
let switch_on = params | ||||||||||||
.get("switch_on") | ||||||||||||
.map(|on| on.parse::<bool>()) | ||||||||||||
.context(error::MissingRequiredParameterSnafu { param: "switch-on" })? | ||||||||||||
.context(error::ParseBoolSnafu { | ||||||||||||
err_msg: "`switch_on` is not a valid bool", | ||||||||||||
})?; | ||||||||||||
|
||||||||||||
let req = PutRequest { | ||||||||||||
key: Vec::from(MAINTENANCE_KEY), | ||||||||||||
value: vec![], | ||||||||||||
prev_kv: false, | ||||||||||||
}; | ||||||||||||
|
||||||||||||
if switch_on { | ||||||||||||
self.kv_store.put(req.clone()).await?; | ||||||||||||
self.in_memory.put(req).await?; | ||||||||||||
} else { | ||||||||||||
self.kv_store.delete(MAINTENANCE_KEY, false).await?; | ||||||||||||
self.in_memory.delete(MAINTENANCE_KEY, false).await?; | ||||||||||||
} | ||||||||||||
|
||||||||||||
http::Response::builder() | ||||||||||||
.status(http::StatusCode::OK) | ||||||||||||
.body("metasvc is succeed to be set maintenance mode".to_string()) | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If switch_on = false, this body content is untrue. |
||||||||||||
.context(error::InvalidHttpBodySnafu) | ||||||||||||
} | ||||||||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
question: Are they related to the test?
Only the following assertion is related to the maintenance mode.