Skip to content

Commit

Permalink
Merge branch 'main' into john/blueprint-store-sled-state
Browse files Browse the repository at this point in the history
  • Loading branch information
jgallagher committed May 13, 2024
2 parents d3e2c1a + 3641a15 commit 615ef08
Show file tree
Hide file tree
Showing 39 changed files with 1,343 additions and 44 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/hakari.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ jobs:
env:
RUSTFLAGS: -D warnings
steps:
- uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b # v4
- uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
with:
ref: ${{ github.event.pull_request.head.sha }} # see omicron#4461
- uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1
with:
toolchain: stable
- name: Install cargo-hakari
uses: taiki-e/install-action@33429073072b7cebf2bbb9807209561b3d1decf7 # v2
uses: taiki-e/install-action@0eaa33a7adfb219fec683c2480a4eb8c79bfeff1 # v2
with:
tool: cargo-hakari
- name: Check workspace-hack Cargo.toml is up-to-date
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

93 changes: 88 additions & 5 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -889,8 +889,95 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
);
}
};
} else if name == "instance_watcher" {
#[derive(Deserialize)]
struct TaskSuccess {
    /// Total number of instances checked.
    total_instances: usize,

    /// Number of stale instance metrics that were deleted.
    pruned_instances: usize,

    /// Instance states from completed checks.
    ///
    /// This is a mapping of stringified instance states to the number
    /// of instances in that state. These stringified states correspond
    /// to the `state` field recorded by the instance watcher's
    /// `virtual_machine:check` timeseries with the `healthy` field set
    /// to `true`. Any changes to the instance state type which cause it
    /// to print differently will be counted as a distinct state.
    instance_states: BTreeMap<String, usize>,

    /// Instance check failures.
    ///
    /// This is a mapping of stringified instance check failure reasons
    /// to the number of instances with checks that failed for that
    /// reason. These stringified failure reasons correspond to the
    /// `state` field recorded by the instance watcher's
    /// `virtual_machine:check` timeseries with the `healthy` field set
    /// to `false`. Any changes to the instance state type which cause
    /// it to print differently will be counted as a distinct failure
    /// reason.
    failed_checks: BTreeMap<String, usize>,

    /// Instance checks that could not be completed successfully.
    ///
    /// This is a mapping of stringified instance check errors
    /// to the number of instance checks that were not completed due to
    /// that error. These stringified errors correspond to the `reason`
    /// field recorded by the instance watcher's
    /// `virtual_machine:incomplete_check` timeseries. Any changes to
    /// the check error type which cause it to print
    /// differently will be counted as a distinct check error.
    incomplete_checks: BTreeMap<String, usize>,
}

// Decode the task's JSON details into the `TaskSuccess` shape defined
// above. If decoding fails (e.g. the task reported an error, or its
// output schema changed), print a warning along with the raw JSON so
// the operator still sees something useful.
match serde_json::from_value::<TaskSuccess>(details.clone()) {
    Err(error) => eprintln!(
        "warning: failed to interpret task details: {:?}: {:?}",
        error, details
    ),
    Ok(TaskSuccess {
        total_instances,
        pruned_instances,
        instance_states,
        failed_checks,
        incomplete_checks,
    }) => {
        // Derive the summary counts from the per-state / per-reason
        // maps: "completed" checks are those that produced a result,
        // whether healthy or failed; incomplete checks are counted
        // separately below.
        let total_successes: usize = instance_states.values().sum();
        let total_failures: usize = failed_checks.values().sum();
        let total_incomplete: usize = incomplete_checks.values().sum();
        println!(" total instances checked: {total_instances}",);
        println!(
            " checks completed: {}",
            total_successes + total_failures
        );
        // Break successful checks down by observed instance state.
        println!(" successful checks: {total_successes}",);
        for (state, count) in &instance_states {
            println!(" -> {count} instances {state}")
        }

        // Break failed checks down by failure reason.
        println!(" failed checks: {total_failures}");
        for (failure, count) in &failed_checks {
            println!(" -> {count} {failure}")
        }
        // Break incomplete checks down by the error that prevented
        // completion.
        println!(
            " checks that could not be completed: {total_incomplete}",
        );
        for (error, count) in &incomplete_checks {
            println!(" -> {count} {error} errors")
        }
        println!(
            " stale instance metrics pruned: {pruned_instances}"
        );
    }
};
} else if name == "service_firewall_rule_propagation" {
match serde_json::from_value::<serde_json::Value>(details.clone()) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(serde_json::Value::Object(map)) => {
if !map.is_empty() {
eprintln!(
Expand All @@ -902,11 +989,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
Ok(val) => {
eprintln!(" unexpected return value from task: {:?}", val)
}
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
}
};
} else {
println!(
"warning: unknown background task: {:?} \
Expand Down
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/env.out
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ task: "external_endpoints"
on each one


task: "instance_watcher"
periodically checks instance states


task: "inventory_collection"
collects hardware and software inventory data from the whole system

Expand Down Expand Up @@ -179,6 +183,10 @@ task: "external_endpoints"
on each one


task: "instance_watcher"
periodically checks instance states


task: "inventory_collection"
collects hardware and software inventory data from the whole system

Expand Down Expand Up @@ -273,6 +281,10 @@ task: "external_endpoints"
on each one


task: "instance_watcher"
periodically checks instance states


task: "inventory_collection"
collects hardware and software inventory data from the whole system

Expand Down
16 changes: 16 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,10 @@ task: "external_endpoints"
on each one


task: "instance_watcher"
periodically checks instance states


task: "inventory_collection"
collects hardware and software inventory data from the whole system

Expand Down Expand Up @@ -396,6 +400,18 @@ task: "external_endpoints"

TLS certificates: 0

task: "instance_watcher"
configured period: every 30s
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by an explicit signal
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total instances checked: 0
checks completed: 0
successful checks: 0
failed checks: 0
checks that could not be completed: 0
stale instance metrics pruned: 0

task: "inventory_collection"
configured period: every 10m
currently executing: no
Expand Down
15 changes: 15 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,8 @@ pub struct BackgroundTaskConfig {
pub switch_port_settings_manager: SwitchPortSettingsManagerConfig,
/// configuration for region replacement task
pub region_replacement: RegionReplacementConfig,
/// configuration for instance watcher task
pub instance_watcher: InstanceWatcherConfig,
/// configuration for service VPC firewall propagation task
pub service_firewall_propagation: ServiceFirewallPropagationConfig,
}
Expand Down Expand Up @@ -521,6 +523,14 @@ pub struct RegionReplacementConfig {
pub period_secs: Duration,
}

/// Configuration for the `instance_watcher` background task, which
/// periodically checks instance states.
#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct InstanceWatcherConfig {
    /// period (in seconds) for periodic activations of this background task
    #[serde_as(as = "DurationSeconds<u64>")]
    pub period_secs: Duration,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ServiceFirewallPropagationConfig {
Expand Down Expand Up @@ -765,6 +775,7 @@ mod test {
sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300
[default_region_allocation_strategy]
type = "random"
Expand Down Expand Up @@ -894,6 +905,9 @@ mod test {
region_replacement: RegionReplacementConfig {
period_secs: Duration::from_secs(30),
},
instance_watcher: InstanceWatcherConfig {
period_secs: Duration::from_secs(30),
},
service_firewall_propagation:
ServiceFirewallPropagationConfig {
period_secs: Duration::from_secs(300),
Expand Down Expand Up @@ -964,6 +978,7 @@ mod test {
sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300
[default_region_allocation_strategy]
type = "random"
Expand Down
1 change: 1 addition & 0 deletions nexus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ itertools.workspace = true
macaddr.workspace = true
# Not under "dev-dependencies"; these also need to be implemented for
# integration tests.
nexus-client.workspace = true
nexus-config.workspace = true
nexus-networking.workspace = true
nexus-test-interface.workspace = true
Expand Down
1 change: 1 addition & 0 deletions nexus/db-model/src/external_ip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,7 @@ impl From<FloatingIp> for views::FloatingIp {

views::FloatingIp {
ip: ip.ip.ip(),
ip_pool_id: ip.ip_pool_id,
identity,
project_id: ip.project_id,
instance_id: ip.parent_id,
Expand Down
2 changes: 2 additions & 0 deletions nexus/db-model/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1779,3 +1779,5 @@ allow_tables_to_appear_in_same_query!(volume, virtual_provisioning_resource);
allow_tables_to_appear_in_same_query!(ssh_key, instance_ssh_key, instance);
joinable!(instance_ssh_key -> ssh_key (ssh_key_id));
joinable!(instance_ssh_key -> instance (instance_id));

allow_tables_to_appear_in_same_query!(sled, sled_instance);
5 changes: 3 additions & 2 deletions nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::collections::BTreeMap;
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(60, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(61, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -29,7 +29,8 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(60, "blueprint-add-sled-state"),
KnownVersion::new(61, "blueprint-add-sled-state"),
KnownVersion::new(60, "add-lookup-vmm-by-sled-id-index"),
KnownVersion::new(59, "enforce-first-as-default"),
KnownVersion::new(58, "insert-default-allowlist"),
KnownVersion::new(57, "add-allowed-source-ips"),
Expand Down
6 changes: 6 additions & 0 deletions nexus/db-model/src/sled_instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,9 @@ impl From<SledInstance> for views::SledInstance {
}
}
}

impl SledInstance {
    /// Returns the ID of the instance this record describes (the `id`
    /// of its identity metadata).
    pub fn instance_id(&self) -> Uuid {
        self.identity.id
    }
}
56 changes: 56 additions & 0 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,22 @@ use crate::db::model::Instance;
use crate::db::model::InstanceRuntimeState;
use crate::db::model::Name;
use crate::db::model::Project;
use crate::db::model::Sled;
use crate::db::model::Vmm;
use crate::db::pagination::paginated;
use crate::db::update_and_check::UpdateAndCheck;
use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::Utc;
use diesel::prelude::*;
use nexus_db_model::ApplySledFilterExt;
use nexus_db_model::Disk;
use nexus_db_model::VmmRuntimeState;
use nexus_types::deployment::SledFilter;
use omicron_common::api;
use omicron_common::api::external::http_pagination::PaginatedBy;
use omicron_common::api::external::CreateResult;
use omicron_common::api::external::DataPageParams;
use omicron_common::api::external::DeleteResult;
use omicron_common::api::external::Error;
use omicron_common::api::external::ListResultVec;
Expand Down Expand Up @@ -385,6 +389,58 @@ impl DataStore {
Ok((instance_updated, vmm_updated))
}

/// Lists all instances on in-service sleds with active Propolis VMM
/// processes, returning the instance along with the VMM on which it's
/// running, the sled on which the VMM is running, and the project that owns
/// the instance.
///
/// Soft-deleted rows (`time_deleted` set) are excluded at every level of
/// the join: sled, VMM, instance, and project.
///
/// The query performed by this function is paginated by the sled's UUID.
pub async fn instance_and_vmm_list_by_sled_agent(
    &self,
    opctx: &OpContext,
    pagparams: &DataPageParams<'_, Uuid>,
) -> ListResultVec<(Sled, Instance, Vmm, Project)> {
    use crate::db::schema::{
        instance::dsl as instance_dsl, project::dsl as project_dsl,
        sled::dsl as sled_dsl, vmm::dsl as vmm_dsl,
    };
    // Listing VMMs across all sleds requires fleet-wide read access.
    opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
    let conn = self.pool_connection_authorized(opctx).await?;

    let result = paginated(sled_dsl::sled, sled_dsl::id, pagparams)
        .filter(sled_dsl::time_deleted.is_null())
        // Only consider sleds that are currently in service. (The
        // original chain applied this filter a second time after the
        // joins; one application of the predicate suffices.)
        .sled_filter(SledFilter::InService)
        // Join sled -> vmm -> instance -> project, discarding
        // soft-deleted rows at each step.
        .inner_join(
            vmm_dsl::vmm
                .on(vmm_dsl::sled_id
                    .eq(sled_dsl::id)
                    .and(vmm_dsl::time_deleted.is_null()))
                .inner_join(
                    instance_dsl::instance
                        .on(instance_dsl::id
                            .eq(vmm_dsl::instance_id)
                            .and(instance_dsl::time_deleted.is_null()))
                        .inner_join(
                            project_dsl::project.on(project_dsl::id
                                .eq(instance_dsl::project_id)
                                .and(project_dsl::time_deleted.is_null())),
                        ),
                ),
        )
        .select((
            Sled::as_select(),
            Instance::as_select(),
            Vmm::as_select(),
            Project::as_select(),
        ))
        .load_async::<(Sled, Instance, Vmm, Project)>(&*conn)
        .await
        .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;

    Ok(result)
}

pub async fn project_delete_instance(
&self,
opctx: &OpContext,
Expand Down
2 changes: 2 additions & 0 deletions nexus/examples/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ blueprints.period_secs_execute = 60
sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
# How frequently to query the status of active instances.
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300

[default_region_allocation_strategy]
Expand Down
Loading

0 comments on commit 615ef08

Please sign in to comment.