From c9016e1de5e5383632052d7ef83856c77c66c9fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?=
Date: Wed, 15 Jan 2025 10:07:45 -0500
Subject: [PATCH 1/2] Generalize definition of instance's specs

---
 aws/infrastructure.tf       | 6 ++----
 azure/infrastructure.tf     | 6 ++----
 common/design/main.tf       | 7 +++++--
 gcp/infrastructure.tf       | 6 ++----
 openstack/infrastructure.tf | 6 ++----
 5 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/aws/infrastructure.tf b/aws/infrastructure.tf
index 60443dd3..0a567fb7 100644
--- a/aws/infrastructure.tf
+++ b/aws/infrastructure.tf
@@ -163,13 +163,11 @@ locals {
       local_ip = aws_network_interface.nic[x].private_ip
       prefix = values.prefix
       tags = values.tags
-      specs = {
+      specs = merge({
        cpus = data.aws_ec2_instance_type.instance_type[values.prefix].default_vcpus
        ram = data.aws_ec2_instance_type.instance_type[values.prefix].memory_size
        gpus = try(one(data.aws_ec2_instance_type.instance_type[values.prefix].gpus).count, 0)
-        mig = lookup(values, "mig", null)
-        shard = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {
diff --git a/azure/infrastructure.tf b/azure/infrastructure.tf
index 0d1f9ff1..c7a3f832 100644
--- a/azure/infrastructure.tf
+++ b/azure/infrastructure.tf
@@ -155,13 +155,11 @@ locals {
       local_ip = azurerm_network_interface.nic[x].private_ip_address
       prefix = values.prefix
       tags = values.tags
-      specs = {
+      specs = merge({
        cpus = local.vmsizes[values.type].vcpus
        ram = local.vmsizes[values.type].ram
        gpus = local.vmsizes[values.type].gpus
-        mig = lookup(values, "mig", null)
-        shard = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {
diff --git a/common/design/main.tf b/common/design/main.tf
index 3834706f..acb71ade 100644
--- a/common/design/main.tf
+++ b/common/design/main.tf
@@ -10,8 +10,11 @@ locals {
     for prefix, attrs in var.instances : [
       for i in range(lookup(attrs, "count", 1)) : {
         (format("%s%d", prefix, i + 1)) = merge(
-          { for attr, value in attrs : attr => value if attr != "count" },
-          { prefix = prefix }
+          { for attr, value in attrs : attr => value if ! contains(["count"], attr) },
+          {
+            prefix = prefix,
+            specs = { for attr, value in attrs : attr => value if ! contains(["count", "tags", "image"], attr) }
+          },
         )
       }
     ]
diff --git a/gcp/infrastructure.tf b/gcp/infrastructure.tf
index 8dc7f9d5..d2d04560 100644
--- a/gcp/infrastructure.tf
+++ b/gcp/infrastructure.tf
@@ -165,13 +165,11 @@ locals {
       local_ip = google_compute_address.nic[x].address
       prefix = values.prefix
       tags = values.tags
-      specs = {
+      specs = merge({
        cpus = data.external.machine_type[values["prefix"]].result["vcpus"]
        ram = data.external.machine_type[values["prefix"]].result["ram"]
        gpus = try(data.external.machine_type[values["prefix"]].result["gpus"], lookup(values, "gpu_count", 0))
-        mig = lookup(values, "mig", null)
-        shard = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {
diff --git a/openstack/infrastructure.tf b/openstack/infrastructure.tf
index ecd79399..7644ff98 100644
--- a/openstack/infrastructure.tf
+++ b/openstack/infrastructure.tf
@@ -120,16 +120,14 @@ locals {
       local_ip = openstack_networking_port_v2.nic[x].all_fixed_ips[0]
       prefix = values.prefix
       tags = values.tags
-      specs = {
+      specs = merge({
        cpus = data.openstack_compute_flavor_v2.flavors[values.prefix].vcpus
        ram = data.openstack_compute_flavor_v2.flavors[values.prefix].ram
        gpus = sum([
          parseint(lookup(data.openstack_compute_flavor_v2.flavors[values.prefix].extra_specs, "resources:VGPU", "0"), 10),
          parseint(split(":", lookup(data.openstack_compute_flavor_v2.flavors[values.prefix].extra_specs, "pci_passthrough:alias", "gpu:0"))[1], 10)
        ])
-        mig = lookup(values, "mig", null)
-        shard = lookup(values, "shard", null)
-      }
+      }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
           pv_key => {

From ee8df9a95b855717971488e62c7c3442a46c0ccb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?=
Date: Wed, 15 Jan 2025 11:37:32 -0500
Subject: [PATCH 2/2] Update docs

---
 docs/README.md        | 9 ++++++++-
 examples/gcp/main.tf  | 2 +-
 gcp/infrastructure.tf | 2 +-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index b9adaa38..4c9608ab 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -548,6 +548,13 @@ the operating system and service software and with x86-64 processors
 (see [NVIDIA/mig-parted issue #30](https://github.com/NVIDIA/mig-parted/issues/30)).
 6. `shard`: total number of [Sharding](https://slurm.schedmd.com/gres.html#Sharding) on the node. Sharding allows sharing the same GPU on multiple jobs.
 The total number of shards is evenly distributed across all GPUs on the node.
+The instance specifications are retrieved from the cloud provider's data source by default, but they can also be specified explicitly:
+
+7. `cpus`: number of logical processors on the node - [`CPUs` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_CPUs)
+8. `ram`: size of real memory on the node in megabytes - [`RealMemory` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_RealMemory)
+9. `gpus`: number of GPUs on the node - [`Gres=gpu:<count>` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1)
+10. `gpu_type`: model of the GPUs on the node - [`Gres=gpu:<type>:<count>` in slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1)
+
 For some cloud providers, it possible to define additional attributes.
 The following sections present the available attributes per provider.
 
@@ -585,7 +592,7 @@ For more information on these attributes, refer to
 - `gpu_type`: name of the GPU model to attach to the instance.
 Refer to [Google Cloud documentation](https://cloud.google.com/compute/docs/gpus)
 for the list of available models per region
-- `gpu_count`: number of GPUs of the `gpu_type` model to attach to the instance
+- `gpus`: number of GPUs of the `gpu_type` model to attach to the instance
 
 #### 4.7.3 Post build modification effect
 
diff --git a/examples/gcp/main.tf b/examples/gcp/main.tf
index df15007a..2b2ca4ac 100644
--- a/examples/gcp/main.tf
+++ b/examples/gcp/main.tf
@@ -24,8 +24,8 @@ module "gcp" {
       type = "n1-standard-2",
       tags = ["node"],
       count = 1,
+      gpus = 1
       gpu_type = "nvidia-tesla-t4",
-      gpu_count = 1
     }
   }
 
diff --git a/gcp/infrastructure.tf b/gcp/infrastructure.tf
index d2d04560..575871be 100644
--- a/gcp/infrastructure.tf
+++ b/gcp/infrastructure.tf
@@ -168,7 +168,7 @@ locals {
       specs = merge({
        cpus = data.external.machine_type[values["prefix"]].result["vcpus"]
        ram = data.external.machine_type[values["prefix"]].result["ram"]
-       gpus = try(data.external.machine_type[values["prefix"]].result["gpus"], lookup(values, "gpu_count", 0))
+       gpus = try(data.external.machine_type[values["prefix"]].result["gpus"], 0)
       }, values.specs)
       volumes = contains(keys(module.design.volume_per_instance), x) ? {
         for pv_key, pv_values in var.volumes:
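
Taken together, the two patches mean that any attribute of an `instances` entry other than `count`, `tags`, and `image` is now copied into that instance's `specs` map and, through `merge()`, overrides the `cpus`, `ram`, and `gpus` values derived from the cloud provider's data source. Below is a minimal sketch of what a module call could look like once the patches are applied; the module label, flavor names, and GPU counts are illustrative placeholders, not values taken from the patches above.

```hcl
module "openstack" {
  source = "./openstack"
  # ...other required variables omitted for brevity; only the instances map
  # is relevant to the behaviour introduced by these patches.

  instances = {
    mgmt = { type = "p4-6gb", tags = ["puppet", "mgmt", "nfs"], count = 1 }
    node = {
      type  = "gpu2.large"   # illustrative flavor name
      tags  = ["node"]
      count = 1
      # The attributes below are forwarded verbatim into "specs" by
      # common/design/main.tf and, via merge(), take precedence over the
      # cpus/ram/gpus values read from the flavor's data source.
      gpus  = 2
      shard = 8
    }
  }
}
```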