From c9016e1de5e5383632052d7ef83856c77c66c9fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?=
Date: Wed, 15 Jan 2025 10:07:45 -0500
Subject: [PATCH 1/2] Generalize definition of instance's specs

---
 aws/infrastructure.tf       | 6 ++----
 azure/infrastructure.tf     | 6 ++----
 common/design/main.tf       | 7 +++++--
 gcp/infrastructure.tf       | 6 ++----
 openstack/infrastructure.tf | 6 ++----
 5 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/aws/infrastructure.tf b/aws/infrastructure.tf
index 60443dd3..0a567fb7 100644
--- a/aws/infrastructure.tf
+++ b/aws/infrastructure.tf
@@ -163,13 +163,11 @@ locals {
       local_ip = aws_network_interface.nic[x].private_ip
       prefix = values.prefix
       tags = values.tags
-      specs = {
+      specs = merge({
        cpus = data.aws_ec2_instance_type.instance_type[values.prefix].default_vcpus
        ram = data.aws_ec2_instance_type.instance_type[values.prefix].memory_size
        gpus = try(one(data.aws_ec2_instance_type.instance_type[values.prefix].gpus).count, 0)
-        mig = lookup(values, "mig", null)
-        shard = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {
diff --git a/azure/infrastructure.tf b/azure/infrastructure.tf
index 0d1f9ff1..c7a3f832 100644
--- a/azure/infrastructure.tf
+++ b/azure/infrastructure.tf
@@ -155,13 +155,11 @@ locals {
       local_ip = azurerm_network_interface.nic[x].private_ip_address
       prefix = values.prefix
       tags = values.tags
-      specs = {
+      specs = merge({
        cpus = local.vmsizes[values.type].vcpus
        ram = local.vmsizes[values.type].ram
        gpus = local.vmsizes[values.type].gpus
-        mig = lookup(values, "mig", null)
-        shard = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {
diff --git a/common/design/main.tf b/common/design/main.tf
index 3834706f..acb71ade 100644
--- a/common/design/main.tf
+++ b/common/design/main.tf
@@ -10,8 +10,11 @@ locals {
     for prefix, attrs in var.instances : [
       for i in range(lookup(attrs, "count", 1)) : {
         (format("%s%d", prefix, i + 1)) = merge(
-          { for attr, value in attrs : attr => value if attr != "count" },
-          { prefix = prefix }
+          { for attr, value in attrs : attr => value if ! contains(["count"], attr) },
+          {
+            prefix = prefix,
+            specs = { for attr, value in attrs : attr => value if ! contains(["count", "tags", "image"], attr) }
+          },
         )
       }
     ]
diff --git a/gcp/infrastructure.tf b/gcp/infrastructure.tf
index 8dc7f9d5..d2d04560 100644
--- a/gcp/infrastructure.tf
+++ b/gcp/infrastructure.tf
@@ -165,13 +165,11 @@ locals {
       local_ip = google_compute_address.nic[x].address
       prefix = values.prefix
       tags = values.tags
-      specs = {
+      specs = merge({
        cpus = data.external.machine_type[values["prefix"]].result["vcpus"]
        ram = data.external.machine_type[values["prefix"]].result["ram"]
        gpus = try(data.external.machine_type[values["prefix"]].result["gpus"], lookup(values, "gpu_count", 0))
-        mig = lookup(values, "mig", null)
-        shard = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {
diff --git a/openstack/infrastructure.tf b/openstack/infrastructure.tf
index ecd79399..7644ff98 100644
--- a/openstack/infrastructure.tf
+++ b/openstack/infrastructure.tf
@@ -120,16 +120,14 @@ locals {
       local_ip = openstack_networking_port_v2.nic[x].all_fixed_ips[0]
       prefix = values.prefix
       tags = values.tags
-      specs = {
+      specs = merge({
        cpus = data.openstack_compute_flavor_v2.flavors[values.prefix].vcpus
        ram = data.openstack_compute_flavor_v2.flavors[values.prefix].ram
        gpus = sum([
          parseint(lookup(data.openstack_compute_flavor_v2.flavors[values.prefix].extra_specs, "resources:VGPU", "0"), 10),
          parseint(split(":", lookup(data.openstack_compute_flavor_v2.flavors[values.prefix].extra_specs, "pci_passthrough:alias", "gpu:0"))[1], 10)
        ])
-        mig = lookup(values, "mig", null)
-        shard = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {

From ee8df9a95b855717971488e62c7c3442a46c0ccb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?=
Date: Wed, 15 Jan 2025 11:37:32 -0500
Subject: [PATCH 2/2] Update docs

---
 docs/README.md        | 9 ++++++++-
 examples/gcp/main.tf  | 2 +-
 gcp/infrastructure.tf | 2 +-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index b9adaa38..4c9608ab 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -548,6 +548,13 @@ the operating system and service software and with x86-64 processors
 (see [NVIDIA/mig-parted issue #30](https://github.com/NVIDIA/mig-parted/issues/30)).
 6. `shard`: total number of [Sharding](https://slurm.schedmd.com/gres.html#Sharding) on the node. Sharding allows sharing the same GPU on multiple jobs.
 The total number of shards is evenly distributed across all GPUs on the node.
+The instance specifications are retrieved from the cloud provider's data source by default, but they can also be specified explicitly:
+
+7. `cpus`: number of logical processors on the node - [`CPUs` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_CPUs)
+8. `ram`: size of real memory on the node in megabytes - [`RealMemory` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_RealMemory)
+9. `gpus`: number of GPUs on the node - [`Gres=gpu:<count>` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1)
+10. `gpu_type`: model of the GPUs on the node - [`Gres=gpu:<type>:<count>` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1)
+
 For some cloud providers, it possible to define additional attributes.
 The following sections present the available attributes per provider.
 
@@ -585,7 +592,7 @@ For more information on these attributes, refer to
 - `gpu_type`: name of the GPU model to attach to the instance.
 Refer to [Google Cloud documentation](https://cloud.google.com/compute/docs/gpus)
 for the list of available models per region
-- `gpu_count`: number of GPUs of the `gpu_type` model to attach to the instance
+- `gpus`: number of GPUs of the `gpu_type` model to attach to the instance
 
 #### 4.7.3 Post build modification effect
 
diff --git a/examples/gcp/main.tf b/examples/gcp/main.tf
index df15007a..2b2ca4ac 100644
--- a/examples/gcp/main.tf
+++ b/examples/gcp/main.tf
@@ -24,8 +24,8 @@ module "gcp" {
       type = "n1-standard-2",
       tags = ["node"],
       count = 1,
+      gpus = 1
       gpu_type = "nvidia-tesla-t4",
-      gpu_count = 1
     }
   }
 
diff --git a/gcp/infrastructure.tf b/gcp/infrastructure.tf
index d2d04560..575871be 100644
--- a/gcp/infrastructure.tf
+++ b/gcp/infrastructure.tf
@@ -168,7 +168,7 @@ locals {
       specs = merge({
        cpus = data.external.machine_type[values["prefix"]].result["vcpus"]
        ram = data.external.machine_type[values["prefix"]].result["ram"]
-       gpus = try(data.external.machine_type[values["prefix"]].result["gpus"], lookup(values, "gpu_count", 0))
+       gpus = try(data.external.machine_type[values["prefix"]].result["gpus"], 0)
       }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
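
Taken together, the two patches mean that any attribute of an `instances` entry other than `count`, `tags`, and `image` is now copied into that instance's `specs` map and, through `merge()`, overrides the `cpus`, `ram`, and `gpus` values derived from the cloud provider's data source. Below is a minimal sketch of what a module call could look like once the patches are applied; the module label, flavor names, and GPU counts are illustrative placeholders, not values taken from the patches above.

```hcl
module "openstack" {
  source = "./openstack"
  # ...other required variables omitted for brevity; only the instances map
  # is relevant to the behaviour introduced by these patches.

  instances = {
    mgmt = { type = "p4-6gb", tags = ["puppet", "mgmt", "nfs"], count = 1 }
    node = {
      type  = "gpu2.large"   # illustrative flavor name
      tags  = ["node"]
      count = 1
      # The attributes below are forwarded verbatim into "specs" by
      # common/design/main.tf and, via merge(), take precedence over the
      # cpus/ram/gpus values read from the flavor's data source.
      gpus  = 2
      shard = 8
    }
  }
}
```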