Merge pull request #341 from ComputeCanada/general_specs

Generalize definition of instance's specs
ComputeCanada · Jan 15, 2025 · bfd92ea · bfd92ea
2 parents e8a6582 + ee8df9a
commit bfd92ea
Show file tree

Hide file tree

Showing 7 changed files with 23 additions and 21 deletions.
diff --git a/aws/infrastructure.tf b/aws/infrastructure.tf
@@ -163,13 +163,11 @@ locals {
       local_ip    = aws_network_interface.nic[x].private_ip
       prefix      = values.prefix
       tags        = values.tags
-      specs = {
+      specs = merge({
         cpus   = data.aws_ec2_instance_type.instance_type[values.prefix].default_vcpus
         ram    = data.aws_ec2_instance_type.instance_type[values.prefix].memory_size
         gpus   = try(one(data.aws_ec2_instance_type.instance_type[values.prefix].gpus).count, 0)
-        mig    = lookup(values, "mig", null)
-        shard  = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {

diff --git a/azure/infrastructure.tf b/azure/infrastructure.tf
@@ -155,13 +155,11 @@ locals {
       local_ip  = azurerm_network_interface.nic[x].private_ip_address
       prefix    = values.prefix
       tags      = values.tags
-      specs = {
+      specs = merge({
         cpus   = local.vmsizes[values.type].vcpus
         ram    = local.vmsizes[values.type].ram
         gpus   = local.vmsizes[values.type].gpus
-        mig    = lookup(values, "mig", null)
-        shard  = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {

diff --git a/common/design/main.tf b/common/design/main.tf
@@ -10,8 +10,11 @@ locals {
       for prefix, attrs in var.instances : [
         for i in range(lookup(attrs, "count", 1)) : {
           (format("%s%d", prefix, i + 1)) = merge(
-            { for attr, value in attrs : attr => value if attr != "count" },
-            { prefix = prefix }
+            { for attr, value in attrs : attr => value if ! contains(["count"], attr) },
+            {
+              prefix = prefix,
+              specs = { for attr, value in attrs : attr => value if ! contains(["count", "tags", "image"], attr) }
+            },
           )
         }
       ]

diff --git a/docs/README.md b/docs/README.md
@@ -548,6 +548,13 @@ the operating system and service software
     and with x86-64 processors (see [NVIDIA/mig-parted issue #30](https://github.com/NVIDIA/mig-parted/issues/30)).
 6. `shard`: total number of [Sharding](https://slurm.schedmd.com/gres.html#Sharding) on the node. Sharding allows sharing the same GPU on multiple jobs. The total number of shards is evenly distributed across all GPUs on the node.
 
+By default, the instance specifications are retrieved from the cloud provider data source, but any of them can be overridden by explicitly specifying the corresponding attribute:
+
+7. `cpus`: number of logical processors on the node - [`CPUs` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_CPUs)
+8. `ram`: size of real memory on the node in megabytes - [`RealMemory` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_RealMemory)
+9. `gpus`: number of graphics processors (GPUs) on the node - [`Gres=gpu:<gpus>` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1)
+10. `gpu_type`: type of graphics processor (GPU) on the node - [`Gres=gpu:<gpu_type>:<gpus>` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1)
+
 For some cloud providers, it is possible to define additional attributes.
 The following sections present the available attributes per provider.
 
@@ -585,7 +592,7 @@ For more information on these attributes, refer to
 - `gpu_type`: name of the GPU model to attach to the instance. Refer to
 [Google Cloud documentation](https://cloud.google.com/compute/docs/gpus) for the list of
 available models per region
-- `gpu_count`: number of GPUs of the `gpu_type` model to attach to the instance
+- `gpus`: number of GPUs of the `gpu_type` model to attach to the instance
 
 #### 4.7.3 Post build modification effect
 

diff --git a/examples/gcp/main.tf b/examples/gcp/main.tf
@@ -24,8 +24,8 @@ module "gcp" {
       type = "n1-standard-2",
       tags = ["node"],
       count = 1,
+      gpus = 1
       gpu_type = "nvidia-tesla-t4",
-      gpu_count = 1
     }
   }
 

diff --git a/gcp/infrastructure.tf b/gcp/infrastructure.tf
@@ -165,13 +165,11 @@ locals {
       local_ip  = google_compute_address.nic[x].address
       prefix    = values.prefix
       tags      = values.tags
-      specs = {
+      specs = merge({
         cpus   = data.external.machine_type[values["prefix"]].result["vcpus"]
         ram    = data.external.machine_type[values["prefix"]].result["ram"]
-        gpus   = try(data.external.machine_type[values["prefix"]].result["gpus"], lookup(values, "gpu_count", 0))
-        mig    = lookup(values, "mig", null)
-        shard  = lookup(values, "shard", null)
-      }
+        gpus   = try(data.external.machine_type[values["prefix"]].result["gpus"], 0)
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {

diff --git a/openstack/infrastructure.tf b/openstack/infrastructure.tf
@@ -120,16 +120,14 @@ locals {
       local_ip  = openstack_networking_port_v2.nic[x].all_fixed_ips[0]
       prefix    = values.prefix
       tags      = values.tags
-      specs = {
+      specs = merge({
         cpus   = data.openstack_compute_flavor_v2.flavors[values.prefix].vcpus
         ram    = data.openstack_compute_flavor_v2.flavors[values.prefix].ram
         gpus   = sum([
           parseint(lookup(data.openstack_compute_flavor_v2.flavors[values.prefix].extra_specs, "resources:VGPU", "0"), 10),
           parseint(split(":", lookup(data.openstack_compute_flavor_v2.flavors[values.prefix].extra_specs, "pci_passthrough:alias", "gpu:0"))[1], 10)
         ])
-        mig    = lookup(values, "mig", null)
-        shard  = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {