From bafa2247ae6f38721cb99fa2100b37e2c2ae6371 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 20 Dec 2024 11:26:26 -0800 Subject: [PATCH 1/3] add integration test --- nexus/tests/integration_tests/certificates.rs | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/nexus/tests/integration_tests/certificates.rs b/nexus/tests/integration_tests/certificates.rs index ea3979c7b9..7a164f76da 100644 --- a/nexus/tests/integration_tests/certificates.rs +++ b/nexus/tests/integration_tests/certificates.rs @@ -627,6 +627,116 @@ async fn test_silo_certificates() { ); } + // Verify that all this works with a larger number of TLS certificates. + // First, we'll create the silos. + let before = std::time::Instant::now(); + let silos: Vec<_> = (0..NSILOS) + .into_iter() + .map(|i| SiloCert::new(format!("extra-silo-{}", i).parse().unwrap())) + .collect(); + + const NSILOS: usize = 250; + for silo_i in &silos { + let silo_i_cert = oxide_client::types::CertificateCreate::builder() + .name(silo_i.cert_name.clone()) + .description("") + .cert(silo_i.cert.cert.clone()) + .key(silo_i.cert.key.clone()) + .service(oxide_client::types::ServiceUsingCertificate::ExternalApi); + silo1_client + .silo_create() + .body( + oxide_client::types::SiloCreate::builder() + .name(silo_i.silo_name.clone()) + .description("") + .discoverable(false) + .quotas(oxide_client::types::SiloQuotasCreate { + cpus: 0, + memory: oxide_client::types::ByteCount(0), + storage: oxide_client::types::ByteCount(0), + }) + .identity_mode( + oxide_client::types::SiloIdentityMode::LocalOnly, + ) + .tls_certificates(vec![silo_i_cert.try_into().unwrap()]), + ) + .send() + .await + .expect("failed to create Silo"); + } + + // Now create a user in each silo. + let silo_users: Vec<_> = futures::future::try_join_all( + silos + .iter() + .map(|silo_i| { + silo1_client + .local_idp_user_create() + .silo(silo_i.silo_name.clone()) + .body( + oxide_client::types::UserCreate::builder() + .external_id("testuser") + .password( + oxide_client::types::UserPassword::LoginDisallowed, + ), + ) + .send() + }) + .collect::>(), + ) + .await + .expect("failed to create user") + .into_iter() + .map(|u| u.into_inner().id) + .collect(); + + // Now try to access each silo using its own TLS certificate. + // + // This looks potentially _super_ time-consuming because we're waiting for + // 200 conditions to become true. In practice, however, each activation of + // the background task will pick up all newly-created silos. So in total + // across all loop iterations, we're just waiting for one round of the + // external_endpoints background task to complete that started after we + // created all the silos above. So most of the wait_for_condition()s ought + // to complete immediately. + for (silo_i, silo_i_user) in silos.iter().zip(silo_users.iter()) { + let silo_i_client = silo_i.oxide_client( + silo_i.reqwest_client(), + resolver.clone(), + AuthnMode::SiloUser(*silo_i_user), + nexus_port, + ); + + wait_for_condition( + || async { + match silo_i_client.current_user_view().send().await { + Ok(result) => { + assert_eq!(result.into_inner().id, *silo_i_user); + Ok(()) + } + Err(oxide_client::Error::CommunicationError(error)) + if error.is_connect() => + { + Err(CondCheckError::NotYet) + } + Err(e) => Err(CondCheckError::Failed(e)), + } + }, + &Duration::from_millis(50), + &Duration::from_secs(30), + ) + .await + .unwrap_or_else(|error| { + panic!( + "failed to connect to silo {}'s endpoint within timeout: {:#}", + &silo_i.silo_name.as_str(), + error + ) + }); + } + + println!("scale part took {:?}", before.elapsed()); + cptestctx.teardown().await; } From f138943f5856973724bd436c8cee76e155cfc177 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 20 Dec 2024 10:48:18 -0800 Subject: [PATCH 2/3] external_endpoints should support more than 200 silos and TLS certificates --- nexus/src/app/external_endpoints.rs | 114 +++++++++++++--------------- 1 file changed, 54 insertions(+), 60 deletions(-) diff --git a/nexus/src/app/external_endpoints.rs b/nexus/src/app/external_endpoints.rs index f837edc4fb..b93b692465 100644 --- a/nexus/src/app/external_endpoints.rs +++ b/nexus/src/app/external_endpoints.rs @@ -33,15 +33,17 @@ use anyhow::Context; use nexus_db_model::AuthenticationMode; use nexus_db_model::Certificate; use nexus_db_model::DnsGroup; +use nexus_db_model::DnsZone; +use nexus_db_model::Silo; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::datastore::Discoverability; use nexus_db_queries::db::model::ServiceKind; +use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; use nexus_types::identity::Resource; use nexus_types::silo::silo_dns_name; use nexus_types::silo::DEFAULT_SILO_ID; use omicron_common::api::external::http_pagination::PaginatedBy; -use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::bail_unless; use openssl::pkey::PKey; @@ -488,69 +490,61 @@ pub(crate) async fn read_all_endpoints( datastore: &DataStore, opctx: &OpContext, ) -> Result { - // We will not look for more than this number of external DNS zones, Silos, - // or certificates. We do not expect very many of any of these objects. - const MAX: u32 = 200; - let pagparams_id = DataPageParams { - marker: None, - limit: NonZeroU32::new(MAX).unwrap(), - direction: dropshot::PaginationOrder::Ascending, - }; - let pagbyid = PaginatedBy::Id(pagparams_id); - let pagparams_name = DataPageParams { - marker: None, - limit: NonZeroU32::new(MAX).unwrap(), - direction: dropshot::PaginationOrder::Ascending, - }; - - let silos = - datastore.silos_list(opctx, &pagbyid, Discoverability::All).await?; - let external_dns_zones = datastore - .dns_zones_list(opctx, DnsGroup::External, &pagparams_name) - .await?; + // The batch size here is pretty arbitrary. On the vast majority of + // systems, there will only ever be a handful of any of these objects. Some + // systems are known to have a few dozen silos and a few hundred TLS + // certificates. This code path is not particularly latency-sensitive. Our + // purpose in limiting the batch size is just to avoid unbounded-size + // database transactions. + // + // unwrap(): safe because 200 is non-zero. + let batch_size = NonZeroU32::new(200).unwrap(); + + // Fetch all silos. + let mut silos = Vec::new(); + let mut paginator = Paginator::new(batch_size); + while let Some(p) = paginator.next() { + let batch = datastore + .silos_list( + opctx, + &PaginatedBy::Id(p.current_pagparams()), + Discoverability::All, + ) + .await?; + paginator = p.found_batch(&batch, &|s: &Silo| s.id()); + silos.extend(batch.into_iter()); + } + + // Fetch all external DNS zones. We should really only ever have one, but + // we may as well paginate this. + let mut external_dns_zones = Vec::new(); + let mut paginator = Paginator::new(batch_size); + while let Some(p) = paginator.next() { + let batch = datastore + .dns_zones_list(opctx, DnsGroup::External, &p.current_pagparams()) + .await?; + paginator = p.found_batch(&batch, &|z: &DnsZone| z.zone_name.clone()); + external_dns_zones.extend(batch.into_iter()); + } bail_unless!( !external_dns_zones.is_empty(), "expected at least one external DNS zone" ); - let certs = datastore - .certificate_list_for(opctx, Some(ServiceKind::Nexus), &pagbyid, false) - .await?; - - // If we found too many of any of these things, complain as loudly as we - // can. Our results will be wrong. But we still don't want to fail if we - // can avoid it because we want to be able to serve as many endpoints as we - // can. - // TODO-reliability we should prevent people from creating more than this - // maximum number of Silos and certificates. - let max = usize::try_from(MAX).unwrap(); - if silos.len() >= max { - error!( - &opctx.log, - "reading endpoints: expected at most {} silos, but found at \ - least {}. TLS may not work on some Silos' external endpoints.", - MAX, - silos.len(), - ); - } - if external_dns_zones.len() >= max { - error!( - &opctx.log, - "reading endpoints: expected at most {} external DNS zones, but \ - found at least {}. TLS may not work on some Silos' external \ - endpoints.", - MAX, - external_dns_zones.len(), - ); - } - if certs.len() >= max { - error!( - &opctx.log, - "reading endpoints: expected at most {} certificates, but \ - found at least {}. TLS may not work on some Silos' external \ - endpoints.", - MAX, - certs.len(), - ); + + // Fetch all TLS certificates. + let mut certs = Vec::new(); + let mut paginator = Paginator::new(batch_size); + while let Some(p) = paginator.next() { + let batch = datastore + .certificate_list_for( + opctx, + Some(ServiceKind::Nexus), + &PaginatedBy::Id(p.current_pagparams()), + false, + ) + .await?; + paginator = p.found_batch(&batch, &|s: &Certificate| s.id()); + certs.extend(batch); } Ok(ExternalEndpoints::new(silos, certs, external_dns_zones)) From 5e4153b5697a135cd51a49210fa9c926f8eae90a Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 20 Dec 2024 11:36:10 -0800 Subject: [PATCH 3/3] Revert "add integration test" This reverts commit bafa2247ae6f38721cb99fa2100b37e2c2ae6371. --- nexus/tests/integration_tests/certificates.rs | 110 ------------------ 1 file changed, 110 deletions(-) diff --git a/nexus/tests/integration_tests/certificates.rs b/nexus/tests/integration_tests/certificates.rs index 7a164f76da..ea3979c7b9 100644 --- a/nexus/tests/integration_tests/certificates.rs +++ b/nexus/tests/integration_tests/certificates.rs @@ -627,116 +627,6 @@ async fn test_silo_certificates() { ); } - // Verify that all this works with a larger number of TLS certificates. - // First, we'll create the silos. - let before = std::time::Instant::now(); - let silos: Vec<_> = (0..NSILOS) - .into_iter() - .map(|i| SiloCert::new(format!("extra-silo-{}", i).parse().unwrap())) - .collect(); - - const NSILOS: usize = 250; - for silo_i in &silos { - let silo_i_cert = oxide_client::types::CertificateCreate::builder() - .name(silo_i.cert_name.clone()) - .description("") - .cert(silo_i.cert.cert.clone()) - .key(silo_i.cert.key.clone()) - .service(oxide_client::types::ServiceUsingCertificate::ExternalApi); - silo1_client - .silo_create() - .body( - oxide_client::types::SiloCreate::builder() - .name(silo_i.silo_name.clone()) - .description("") - .discoverable(false) - .quotas(oxide_client::types::SiloQuotasCreate { - cpus: 0, - memory: oxide_client::types::ByteCount(0), - storage: oxide_client::types::ByteCount(0), - }) - .identity_mode( - oxide_client::types::SiloIdentityMode::LocalOnly, - ) - .tls_certificates(vec![silo_i_cert.try_into().unwrap()]), - ) - .send() - .await - .expect("failed to create Silo"); - } - - // Now create a user in each silo. - let silo_users: Vec<_> = futures::future::try_join_all( - silos - .iter() - .map(|silo_i| { - silo1_client - .local_idp_user_create() - .silo(silo_i.silo_name.clone()) - .body( - oxide_client::types::UserCreate::builder() - .external_id("testuser") - .password( - oxide_client::types::UserPassword::LoginDisallowed, - ), - ) - .send() - }) - .collect::>(), - ) - .await - .expect("failed to create user") - .into_iter() - .map(|u| u.into_inner().id) - .collect(); - - // Now try to access each silo using its own TLS certificate. - // - // This looks potentially _super_ time-consuming because we're waiting for - // 200 conditions to become true. In practice, however, each activation of - // the background task will pick up all newly-created silos. So in total - // across all loop iterations, we're just waiting for one round of the - // external_endpoints background task to complete that started after we - // created all the silos above. So most of the wait_for_condition()s ought - // to complete immediately. - for (silo_i, silo_i_user) in silos.iter().zip(silo_users.iter()) { - let silo_i_client = silo_i.oxide_client( - silo_i.reqwest_client(), - resolver.clone(), - AuthnMode::SiloUser(*silo_i_user), - nexus_port, - ); - - wait_for_condition( - || async { - match silo_i_client.current_user_view().send().await { - Ok(result) => { - assert_eq!(result.into_inner().id, *silo_i_user); - Ok(()) - } - Err(oxide_client::Error::CommunicationError(error)) - if error.is_connect() => - { - Err(CondCheckError::NotYet) - } - Err(e) => Err(CondCheckError::Failed(e)), - } - }, - &Duration::from_millis(50), - &Duration::from_secs(30), - ) - .await - .unwrap_or_else(|error| { - panic!( - "failed to connect to silo {}'s endpoint within timeout: {:#}", - &silo_i.silo_name.as_str(), - error - ) - }); - } - - println!("scale part took {:?}", before.elapsed()); - cptestctx.teardown().await; }