Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RandomWithDistinctSleds region allocation strategy #3858

Merged
merged 13 commits into from
Oct 3, 2023
1 change: 1 addition & 0 deletions .github/buildomat/jobs/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ cd /opt/oxide/work

ptime -m tar xvzf /input/package/work/package.tar.gz
cp /input/package/work/zones/* out/
mv out/omicron-nexus-single-sled.tar.gz out/omicron-nexus.tar.gz
mkdir tests
for p in /input/ci-tools/work/end-to-end-tests/*.gz; do
ptime -m gunzip < "$p" > "tests/$(basename "${p%.gz}")"
Expand Down
9 changes: 7 additions & 2 deletions .github/buildomat/jobs/package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ ptime -m ./tools/ci_download_softnpu_machinery

# Build the test target
ptime -m cargo run --locked --release --bin omicron-package -- \
-t test target create -i standard -m non-gimlet -s softnpu
-t test target create -i standard -m non-gimlet -s softnpu -r single-sled
ptime -m cargo run --locked --release --bin omicron-package -- \
-t test package

Expand Down Expand Up @@ -81,9 +81,13 @@ stamp_packages() {
done
}

# Keep the single-sled Nexus zone around for the deploy job. (The global zone
# build below overwrites the file.)
mv out/omicron-nexus.tar.gz out/omicron-nexus-single-sled.tar.gz

# Build necessary for the global zone
ptime -m cargo run --locked --release --bin omicron-package -- \
-t host target create -i standard -m gimlet -s asic
-t host target create -i standard -m gimlet -s asic -r multi-sled
ptime -m cargo run --locked --release --bin omicron-package -- \
-t host package
stamp_packages omicron-sled-agent maghemite propolis-server overlay
Expand Down Expand Up @@ -111,6 +115,7 @@ zones=(
out/external-dns.tar.gz
out/internal-dns.tar.gz
out/omicron-nexus.tar.gz
out/omicron-nexus-single-sled.tar.gz
out/oximeter-collector.tar.gz
out/propolis-server.tar.gz
out/switch-*.tar.gz
Expand Down
3 changes: 2 additions & 1 deletion .github/buildomat/jobs/tuf-repo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,11 @@ done
mkdir /work/package
pushd /work/package
tar xf /input/package/work/package.tar.gz out package-manifest.toml target/release/omicron-package
target/release/omicron-package -t default target create -i standard -m gimlet -s asic
target/release/omicron-package -t default target create -i standard -m gimlet -s asic -r multi-sled
ln -s /input/package/work/zones/* out/
rm out/switch-softnpu.tar.gz # not used when target switch=asic
rm out/omicron-gateway-softnpu.tar.gz # not used when target switch=asic
rm out/omicron-nexus-single-sled.tar.gz # only used for deploy tests
for zone in out/*.tar.gz; do
target/release/omicron-package stamp "$(basename "${zone%.tar.gz}")" "$VERSION"
done
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
- name: Install Pre-Requisites
run: ./tools/install_builder_prerequisites.sh -y
- name: Set default target
run: cargo run --bin omicron-package -- -t default target create
run: cargo run --bin omicron-package -- -t default target create -r single-sled
- name: Check build of deployed Omicron packages
run: cargo run --bin omicron-package -- -t default check

Expand Down
74 changes: 59 additions & 15 deletions common/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,8 @@ pub struct PackageConfig {
pub dendrite: HashMap<SwitchLocation, DpdConfig>,
/// Background task configuration
pub background_tasks: BackgroundTaskConfig,
/// Default Crucible region allocation strategy
pub default_region_allocation_strategy: RegionAllocationStrategy,
}

#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
Expand Down Expand Up @@ -594,6 +596,9 @@ mod test {
dns_external.period_secs_propagation = 7
dns_external.max_concurrent_server_updates = 8
external_endpoints.period_secs = 9
[default_region_allocation_strategy]
type = "random"
seed = 0
"##,
)
.unwrap();
Expand Down Expand Up @@ -677,6 +682,10 @@ mod test {
period_secs: Duration::from_secs(9),
}
},
default_region_allocation_strategy:
crate::nexus_config::RegionAllocationStrategy::Random {
seed: Some(0)
}
},
}
);
Expand Down Expand Up @@ -724,6 +733,8 @@ mod test {
dns_external.period_secs_propagation = 7
dns_external.max_concurrent_server_updates = 8
external_endpoints.period_secs = 9
[default_region_allocation_strategy]
type = "random"
"##,
)
.unwrap();
Expand Down Expand Up @@ -864,25 +875,31 @@ mod test {
struct DummyConfig {
deployment: DeploymentConfig,
}
let config_path = "../smf/nexus/config-partial.toml";
println!(
"checking {:?} with example deployment section added",
config_path
);
let mut contents = std::fs::read_to_string(config_path)
.expect("failed to read Nexus SMF config file");
contents.push_str(
"\n\n\n \
# !! content below added by test_repo_configs_are_valid()\n\
\n\n\n",
);
let example_deployment = toml::to_string_pretty(&DummyConfig {
deployment: example_config.deployment,
})
.unwrap();
contents.push_str(&example_deployment);
let _: Config = toml::from_str(&contents)
.expect("Nexus SMF config file is not valid");

let nexus_config_paths = [
"../smf/nexus/single-sled/config-partial.toml",
"../smf/nexus/multi-sled/config-partial.toml",
];
for config_path in nexus_config_paths {
println!(
"checking {:?} with example deployment section added",
config_path
);
let mut contents = std::fs::read_to_string(config_path)
.expect("failed to read Nexus SMF config file");
contents.push_str(
"\n\n\n \
# !! content below added by test_repo_configs_are_valid()\n\
\n\n\n",
);
contents.push_str(&example_deployment);
let _: Config = toml::from_str(&contents)
.expect("Nexus SMF config file is not valid");
}
}

#[test]
Expand All @@ -894,3 +911,30 @@ mod test {
);
}
}

/// Defines a strategy for choosing what physical disks to use when allocating
/// new crucible regions.
///
/// When serialized, the variant is identified by a `type` field holding the
/// snake_case variant name, and the variant's own fields (such as the
/// optional `seed`) sit alongside it. For example, in TOML:
///
/// ```toml
/// [default_region_allocation_strategy]
/// type = "random"
/// seed = 0
/// ```
///
/// NOTE: More strategies can - and should! - be added.
///
/// See <https://rfd.shared.oxide.computer/rfd/0205> for a more
/// complete discussion.
///
/// Longer-term, we should consider:
/// - Storage size + remaining free space
/// - Sled placement of datasets
/// - What sort of loads we'd like to create (even split across all disks
/// may not be preferable, especially if maintenance is expected)
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum RegionAllocationStrategy {
/// Choose disks pseudo-randomly. An optional seed may be provided to make
/// the ordering deterministic, otherwise the current time in nanoseconds
/// will be used. Ordering is based on sorting the output of `md5(UUID of
/// candidate dataset + seed)`. The seed does not need to come from a
/// cryptographically secure source.
Random { seed: Option<u64> },

/// Like Random, but ensures that each region is allocated on its own sled.
/// Carries the same optional `seed` field as `Random`.
RandomWithDistinctSleds { seed: Option<u64> },
}
34 changes: 28 additions & 6 deletions docs/how-to-run.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -321,20 +321,42 @@ Error: Creates a new build target, and sets it as "active"
Usage: omicron-package target create [OPTIONS]

Options:
-i, --image <IMAGE> [default: standard] [possible values: standard, trampoline]
-m, --machine <MACHINE> [possible values: gimlet, gimlet-standalone, non-gimlet]
-s, --switch <SWITCH> [possible values: asic, stub, softnpu]
-h, --help Print help (see more with '--help')
-i, --image <IMAGE>
[default: standard]

Possible values:
- standard: A typical host OS image
- trampoline: A recovery host OS image, intended to bootstrap a Standard image

-m, --machine <MACHINE>
Possible values:
- gimlet: Use sled agent configuration for a Gimlet
- gimlet-standalone: Use sled agent configuration for a Gimlet running in isolation
- non-gimlet: Use sled agent configuration for a device emulating a Gimlet

-s, --switch <SWITCH>
Possible values:
- asic: Use the "real" Dendrite, that attempts to interact with the Tofino
- stub: Use a "stub" Dendrite that does not require any real hardware
- softnpu: Use a "softnpu" Dendrite that uses the SoftNPU asic emulator

-r, --rack-topology <RACK_TOPOLOGY>
Possible values:
- multi-sled: Use configurations suitable for a multi-sled deployment, such as dogfood and production racks
- single-sled: Use configurations suitable for a single-sled deployment, such as CI and dev machines

-h, --help
Print help (see a summary with '-h')

----

To set up a build target for a non-Gimlet machine with simulated (but fully functional) external networking, you would run:

[source,console]
----
$ cargo run --release --bin omicron-package -- -t default target create -i standard -m non-gimlet -s softnpu
$ cargo run --release --bin omicron-package -- -t default target create -i standard -m non-gimlet -s softnpu -r single-sled
Finished release [optimized] target(s) in 0.66s
Running `target/release/omicron-package -t default target create -i standard -m non-gimlet -s softnpu`
Running `target/release/omicron-package -t default target create -i standard -m non-gimlet -s softnpu -r single-sled`
Created new build target 'default' and set it as active
----

Expand Down
1 change: 1 addition & 0 deletions installinator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,4 @@ tokio-stream.workspace = true
[features]
image-standard = []
image-trampoline = []
rack-topology-single-sled = []
22 changes: 22 additions & 0 deletions nexus/db-model/src/queries/region_allocation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ table! {
}
}

// Diesel schema for `shuffled_candidate_datasets`: two UUID columns, `id` and
// `pool_id`. No primary key is named, so the `table!` default (`id`) applies.
// NOTE(review): presumably describes an intermediate result of the region
// allocation query rather than a stored table -- confirm against the query
// that references it.
table! {
shuffled_candidate_datasets {
id -> Uuid,
pool_id -> Uuid,
}
}

table! {
candidate_regions {
id -> Uuid,
Expand Down Expand Up @@ -89,6 +96,19 @@ table! {
}
}

// Diesel schema for `one_zpool_per_sled`: a single UUID column, `pool_id`.
// `pool_id` is named explicitly as the primary key because there is no `id`
// column for the `table!` default to fall back on.
// NOTE(review): the name suggests one chosen zpool per sled for the
// distinct-sleds allocation strategy -- confirm against the query that
// references it.
table! {
one_zpool_per_sled (pool_id) {
pool_id -> Uuid
}
}

// Diesel schema for `one_dataset_per_zpool`: two UUID columns, `id` and
// `pool_id`. No primary key is named, so the `table!` default (`id`) applies.
// NOTE(review): the name suggests one chosen dataset per zpool -- confirm
// against the query that references it.
table! {
one_dataset_per_zpool {
id -> Uuid,
pool_id -> Uuid
}
}

table! {
inserted_regions {
id -> Uuid,
Expand Down Expand Up @@ -141,6 +161,7 @@ diesel::allow_tables_to_appear_in_same_query!(
);

diesel::allow_tables_to_appear_in_same_query!(old_regions, dataset,);
diesel::allow_tables_to_appear_in_same_query!(old_regions, zpool,);

diesel::allow_tables_to_appear_in_same_query!(
inserted_regions,
Expand All @@ -149,6 +170,7 @@ diesel::allow_tables_to_appear_in_same_query!(

diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, dataset,);
diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, zpool,);
diesel::allow_tables_to_appear_in_same_query!(candidate_datasets, dataset);

// == Needed for random region allocation ==

Expand Down
Loading
Loading