Skip to content

Commit

Permalink
Nutanix GPU support implementation (#8745)
Browse files Browse the repository at this point in the history
* Initial GPU support implementation

* Fix test

* Fix comments
  • Loading branch information
adiantum authored Oct 10, 2024
1 parent d20a271 commit 4d1408c
Show file tree
Hide file tree
Showing 15 changed files with 1,960 additions and 15 deletions.
28 changes: 28 additions & 0 deletions pkg/api/v1alpha1/nutanixmachineconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,17 @@ import (
// NutanixIdentifierType is an enumeration of different resource identifier types.
type NutanixIdentifierType string

// String returns the resource identifier type as a plain string.
func (c NutanixIdentifierType) String() string {
	return string(c)
}

// NutanixGPUIdentifierType is an enumeration of different GPU identifier types.
type NutanixGPUIdentifierType string

// String returns the GPU identifier type as a plain string.
func (c NutanixGPUIdentifierType) String() string {
	return string(c)
}

const (
// NutanixMachineConfigKind is the kind for a NutanixMachineConfig.
NutanixMachineConfigKind = "NutanixMachineConfig"
Expand All @@ -23,6 +30,11 @@ const (
// NutanixIdentifierName is a resource identifier identifying the object by Name.
NutanixIdentifierName NutanixIdentifierType = "name"

// NutanixGPUIdentifierDeviceID is a GPU identifier identifying the object by DeviceID.
NutanixGPUIdentifierDeviceID NutanixGPUIdentifierType = "deviceID"
// NutanixGPUIdentifierName is a GPU identifier identifying the object by Name.
NutanixGPUIdentifierName NutanixGPUIdentifierType = "name"

defaultNutanixOSFamily = Ubuntu
defaultNutanixSystemDiskSizeGi = "40Gi"
defaultNutanixMemorySizeGi = "4Gi"
Expand Down Expand Up @@ -62,6 +74,22 @@ type NutanixCategoryIdentifier struct {
Value string `json:"value,omitempty"`
}

// NutanixGPUIdentifier identifies a GPU device to add to a VM, either by
// device ID or by name, as selected by Type.
type NutanixGPUIdentifier struct {
// deviceID is the device ID of the GPU device.
// It is the identifying field when type is "deviceID".
// +optional
DeviceID *int64 `json:"deviceID,omitempty"`

// name is the name of the GPU device.
// It is the identifying field when type is "name".
// +optional
Name string `json:"name,omitempty"`

// type selects how the GPU device is identified: "deviceID" or "name".
// +kubebuilder:validation:Required
// +kubebuilder:validation:Enum:=deviceID;name
Type NutanixGPUIdentifierType `json:"type"`
}

// NutanixMachineConfigGenerateOpt is a functional option that can be passed to NewNutanixMachineConfigGenerate to
// customize the generated machine config
//
Expand Down
4 changes: 4 additions & 0 deletions pkg/api/v1alpha1/nutanixmachineconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ type NutanixMachineConfigSpec struct {
// Categories must be created in Prism Central before they can be used.
// +kubebuilder:validation:Optional
AdditionalCategories []NutanixCategoryIdentifier `json:"additionalCategories,omitempty"`

// List of GPU devices that should be added to the VMs.
// +kubebuilder:validation:Optional
GPUs []NutanixGPUIdentifier `json:"gpus,omitempty"`
}

// SetDefaults sets defaults to NutanixMachineConfig if user has not provided.
Expand Down
27 changes: 27 additions & 0 deletions pkg/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pkg/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ const (
ConfigMapKind = "ConfigMap"
ClusterResourceSetKind = "ClusterResourceSet"

NutanixMachineConfigKind = "NutanixMachineConfig"

BottlerocketDefaultUser = "ec2-user"
UbuntuDefaultUser = "capv"

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/nutanix/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (

type Client interface {
GetSubnet(ctx context.Context, uuid string) (*v3.SubnetIntentResponse, error)
ListAllHost(ctx context.Context) (*v3.HostListResponse, error)
ListSubnet(ctx context.Context, getEntitiesRequest *v3.DSMetadata) (*v3.SubnetListIntentResponse, error)
GetImage(ctx context.Context, uuid string) (*v3.ImageIntentResponse, error)
ListImage(ctx context.Context, getEntitiesRequest *v3.DSMetadata) (*v3.ImageListIntentResponse, error)
Expand Down
12 changes: 12 additions & 0 deletions pkg/providers/nutanix/config/md-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,18 @@ spec:
value: "{{ .Value }}"
{{- end }}
{{- end }}
{{- if .GPUs }}
gpus:
{{- range .GPUs }}
{{- if (eq .Type "deviceID") }}
- type: deviceID
deviceID: {{ .DeviceID }}
{{- else if (eq .Type "name") }}
- type: name
name: "{{ .Name }}"
{{- end }}
{{- end }}
{{- end }}
---
apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
kind: KubeadmConfigTemplate
Expand Down
15 changes: 15 additions & 0 deletions pkg/providers/nutanix/mocks/client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkg/providers/nutanix/provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,7 @@ func TestNutanixProviderSetupAndValidateCreate(t *testing.T) {
},
}
mockClient.EXPECT().ListImage(gomock.Any(), gomock.Any()).Return(images, nil).AnyTimes()
mockClient.EXPECT().ListAllHost(gomock.Any()).Return(fakeHostList(), nil).AnyTimes()
mockCertValidator := mockCrypto.NewMockTlsValidator(ctrl)
mockCertValidator.EXPECT().ValidateCert(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil)
mockCertValidator.EXPECT().ValidateCert(gomock.Any(), gomock.Any(), gomock.Any()).Return(errors.New("invalid cert"))
Expand Down
4 changes: 4 additions & 0 deletions pkg/providers/nutanix/template.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,10 @@ func buildTemplateMapMD(clusterSpec *cluster.Spec, workerNodeGroupMachineSpec v1
values["additionalCategories"] = workerNodeGroupMachineSpec.AdditionalCategories
}

if len(workerNodeGroupMachineSpec.GPUs) > 0 {
values["GPUs"] = workerNodeGroupMachineSpec.GPUs
}

if workerNodeGroupConfiguration.KubeletConfiguration != nil {
wnKubeletConfig := workerNodeGroupConfiguration.KubeletConfiguration.Object
if _, ok := wnKubeletConfig["tlsCipherSuites"]; !ok {
Expand Down
46 changes: 46 additions & 0 deletions pkg/providers/nutanix/template_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -726,6 +726,52 @@ func TestTemplateBuilderFailureDomains(t *testing.T) {
}
}

// TestTemplateBuilderGPUs renders the control plane and worker CAPI specs for a
// cluster spec whose machine config declares GPUs, and compares each rendered
// spec with its expected-output file.
func TestTemplateBuilderGPUs(t *testing.T) {
	// Key and value used for the worker config map and both template-name maps.
	const nodeGroupName = "eksa-unit-test"

	type gpuCase struct {
		Input    string
		Output   string
		OutputMD string
	}

	cases := []gpuCase{
		{
			Input:    "testdata/eksa-cluster-gpus.yaml",
			Output:   "testdata/expected_results_gpus.yaml",
			OutputMD: "testdata/expected_results_gpus_md.yaml",
		},
	}

	for _, tc := range cases {
		spec := test.NewFullClusterSpec(t, tc.Input)

		// The control plane machine config doubles as the worker machine config.
		cpMachineGroup := spec.Cluster.Spec.ControlPlaneConfiguration.MachineGroupRef.Name
		machineConfig := spec.NutanixMachineConfig(cpMachineGroup)
		workerSpecs := map[string]anywherev1.NutanixMachineConfigSpec{
			nodeGroupName: machineConfig.Spec,
		}

		t.Setenv(constants.EksaNutanixUsernameKey, "admin")
		t.Setenv(constants.EksaNutanixPasswordKey, "password")
		credentials := GetCredsFromEnv()

		builder := NewNutanixTemplateBuilder(
			&spec.NutanixDatacenter.Spec,
			&machineConfig.Spec,
			&machineConfig.Spec,
			workerSpecs,
			credentials,
			time.Now,
		)

		controlPlane, err := builder.GenerateCAPISpecControlPlane(spec)
		assert.NoError(t, err)
		assert.NotNil(t, controlPlane)
		test.AssertContentToFile(t, string(controlPlane), tc.Output)

		workers, err := builder.GenerateCAPISpecWorkers(
			spec,
			map[string]string{nodeGroupName: nodeGroupName},
			map[string]string{nodeGroupName: nodeGroupName},
		)
		assert.NoError(t, err)
		test.AssertContentToFile(t, string(workers), tc.OutputMD)
	}
}

func minimalNutanixConfigSpec(t *testing.T) (*anywherev1.NutanixDatacenterConfig, *anywherev1.NutanixMachineConfig, map[string]anywherev1.NutanixMachineConfigSpec) {
dcConf := &anywherev1.NutanixDatacenterConfig{}
err := yaml.Unmarshal([]byte(nutanixDatacenterConfigSpec), dcConf)
Expand Down
75 changes: 75 additions & 0 deletions pkg/providers/nutanix/testdata/eksa-cluster-gpus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Input fixture for the GPU template-builder test (testdata/eksa-cluster-gpus.yaml).
# Cluster document: the control plane and the single worker node group both
# reference the NutanixMachineConfig named eksa-unit-test.
apiVersion: anywhere.eks.amazonaws.com/v1alpha1
kind: Cluster
metadata:
name: eksa-unit-test
namespace: default
spec:
kubernetesVersion: "1.19"
controlPlaneConfiguration:
name: eksa-unit-test
count: 3
endpoint:
host: test-ip
machineGroupRef:
name: eksa-unit-test
kind: NutanixMachineConfig
workerNodeGroupConfigurations:
- count: 4
name: eksa-unit-test
machineGroupRef:
name: eksa-unit-test
kind: NutanixMachineConfig
datacenterRef:
kind: NutanixDatacenterConfig
name: eksa-unit-test
clusterNetwork:
cni: "cilium"
pods:
cidrBlocks:
- 192.168.0.0/16
services:
cidrBlocks:
- 10.96.0.0/12
---
# Datacenter document: the endpoint, port and credentials secret name are
# placeholder values for the unit test.
apiVersion: anywhere.eks.amazonaws.com/v1alpha1
kind: NutanixDatacenterConfig
metadata:
name: eksa-unit-test
namespace: default
spec:
endpoint: "prism.nutanix.com"
port: 9440
credentialRef:
kind: Secret
name: "nutanix-credentials"
---
# Machine config document carrying the GPU settings under test.
apiVersion: anywhere.eks.amazonaws.com/v1alpha1
kind: NutanixMachineConfig
metadata:
name: eksa-unit-test
namespace: default
spec:
vcpusPerSocket: 1
vcpuSockets: 4
memorySize: 8Gi
image:
type: "name"
name: "prism-image"
cluster:
type: "name"
name: "prism-cluster"
subnet:
type: "name"
name: "prism-subnet"
# One GPU entry per identifier type: the first by deviceID, the second by name.
gpus:
- type: deviceID
deviceID: 8757
- type: name
name: "Ampere 40"
systemDiskSize: 40Gi
osFamily: "ubuntu"
users:
- name: "mySshUsername"
sshAuthorizedKeys:
- "mySshAuthorizedKey"
---
Loading

0 comments on commit 4d1408c

Please sign in to comment.