Add Failure Domains for Worker nodes feature for Nutanix provider #8837

Merged · 4 commits · Oct 14, 2024
Changes from all commits
@@ -115,6 +115,12 @@ spec:
            - type
            type: object
          type: array
        workerMachineGroups:
          description: Worker Machine Groups holds the list of worker
            machine group names that will use this failure domain.
          items:
            type: string
          type: array
      required:
      - name
      type: object
@@ -74,6 +74,28 @@ spec:
            required:
            - type
            type: object
          gpus:
            description: List of GPU devices that should be added to the VMs.
            items:
              description: NutanixGPUIdentifier holds VM GPU device configuration.
              properties:
                deviceID:
                  description: deviceID is the device ID of the GPU device.
                  format: int64
                  type: integer
                name:
                  description: name is the name of the GPU device.
                  type: string
                type:
                  description: type is the type of the GPU device.
                  enum:
                  - deviceID
                  - name
                  type: string
              required:
              - type
              type: object
            type: array
          image:
            description: image is to identify the OS image uploaded to the Prism
              Central (PC). The image identifier (uuid or name) can be obtained
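As a rough illustration of the two identification modes this schema allows, here is a small self-contained Go sketch; the struct is a hand-written stand-in mirroring the generated NutanixGPUIdentifier schema, and the device ID and profile name are hypothetical:

    package main

    import "fmt"

    // gpuIdentifier mirrors the NutanixGPUIdentifier schema above.
    type gpuIdentifier struct {
        Type     string // required: "deviceID" or "name"
        DeviceID *int64 // set when Type == "deviceID"
        Name     string // set when Type == "name"
    }

    func main() {
        deviceID := int64(8757) // hypothetical PCI device ID
        gpus := []gpuIdentifier{
            {Type: "deviceID", DeviceID: &deviceID}, // passthrough GPU by device ID
            {Type: "name", Name: "NVIDIA A40-24Q"},  // vGPU profile by name (hypothetical)
        }
        for _, gpu := range gpus {
            if gpu.Type == "deviceID" {
                fmt.Println("GPU by deviceID:", *gpu.DeviceID)
            } else {
                fmt.Println("GPU by name:", gpu.Name)
            }
        }
    }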
2 changes: 1 addition & 1 deletion go.mod
@@ -181,7 +181,7 @@ require (
    go.opentelemetry.io/otel/trace v1.20.0 // indirect
    go.uber.org/multierr v1.11.0 // indirect
    golang.org/x/mod v0.14.0 // indirect
-   golang.org/x/sync v0.6.0 // indirect
+   golang.org/x/sync v0.6.0
    golang.org/x/sys v0.18.0 // indirect
    golang.org/x/term v0.18.0 // indirect
    golang.org/x/time v0.5.0 // indirect
6 changes: 5 additions & 1 deletion pkg/api/v1alpha1/nutanixdatacenterconfig_types.go
@@ -68,6 +68,10 @@ type NutanixDatacenterFailureDomain struct {
    // Subnets holds the list of subnet identifiers for the cluster's network subnets.
    // +kubebuilder:validation:Required
    Subnets []NutanixResourceIdentifier `json:"subnets,omitempty"`

    // Worker Machine Groups holds the list of worker machine group names that will use this failure domain.
    // +optional
    WorkerMachineGroups []string `json:"workerMachineGroups,omitempty"`
}

// NutanixDatacenterConfigStatus defines the observed state of NutanixDatacenterConfig.
@@ -165,7 +169,7 @@ func (in *NutanixDatacenterConfig) Validate() error {
        }
    }

-   if in.Spec.FailureDomains != nil && len(in.Spec.FailureDomains) != 0 {
+   if len(in.Spec.FailureDomains) != 0 {
        dccName := in.Namespace + "/" + in.Name
        validateClusterResourceIdentifier := createValidateNutanixResourceFunc("NutanixDatacenterConfig.Spec.FailureDomains.Cluster", "cluster", dccName)
        validateSubnetResourceIdentifier := createValidateNutanixResourceFunc("NutanixDatacenterConfig.Spec.FailureDomains.Subnets", "subnet", dccName)
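The tightened guard relies on len of a nil slice being 0 in Go, which made the explicit nil check redundant. As a rough sketch of how the new field is meant to be populated, assuming the v1alpha1 identifier types shown in the CRD above (all names and identifiers here are hypothetical):

    peName := "prism-element-1"
    subnetName := "vm-network"
    fd := v1alpha1.NutanixDatacenterFailureDomain{
        Name:    "pe-1",
        Cluster: v1alpha1.NutanixResourceIdentifier{Type: v1alpha1.NutanixIdentifierName, Name: &peName},
        Subnets: []v1alpha1.NutanixResourceIdentifier{
            {Type: v1alpha1.NutanixIdentifierName, Name: &subnetName},
        },
        // Only worker node groups named here will be placed in this failure domain.
        WorkerMachineGroups: []string{"md-0", "md-1"},
    }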
5 changes: 5 additions & 0 deletions pkg/api/v1alpha1/zz_generated.deepcopy.go

(Generated file; diff not rendered.)

104 changes: 104 additions & 0 deletions pkg/providers/nutanix/config/md-template.yaml
@@ -1,3 +1,106 @@
{{- if $.failureDomains -}}{{ range $fd := $.failureDomains -}}
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineDeployment
metadata:
  labels:
    cluster.x-k8s.io/cluster-name: "{{$.clusterName}}"
  name: "{{$.workerNodeGroupName}}-{{$fd.Name}}"
  namespace: "{{$.eksaSystemNamespace}}"
{{- if $.autoscalingConfig }}
  annotations:
    cluster.x-k8s.io/cluster-api-autoscaler-node-group-min-size: "{{ $.autoscalingConfig.MinCount }}"
    cluster.x-k8s.io/cluster-api-autoscaler-node-group-max-size: "{{ $.autoscalingConfig.MaxCount }}"
{{- end }}
spec:
  clusterName: "{{$.clusterName}}"
{{- if not $.autoscalingConfig }}
  replicas: {{ index $.failureDomainsReplicas $fd.Name }}
{{- end }}
  selector:
    matchLabels: {}
  template:
    metadata:
      labels:
        cluster.x-k8s.io/cluster-name: "{{$.clusterName}}"
    spec:
      failureDomain: "{{$fd.Name}}"
      bootstrap:
        configRef:
          apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
          kind: KubeadmConfigTemplate
          name: "{{$.workloadkubeadmconfigTemplateName}}"
      clusterName: "{{$.clusterName}}"
      infrastructureRef:
        apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
        kind: NutanixMachineTemplate
        name: "{{$.workloadTemplateName}}-{{$fd.Name}}"
      version: "{{$.kubernetesVersion}}"
{{- if $.upgradeRolloutStrategy }}
  strategy:
    rollingUpdate:
      maxSurge: {{$.maxSurge}}
      maxUnavailable: {{$.maxUnavailable}}
{{- end }}
---
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: NutanixMachineTemplate
metadata:
  name: "{{$.workloadTemplateName}}-{{$fd.Name}}"
  namespace: "{{$.eksaSystemNamespace}}"
spec:
  template:
    spec:
      providerID: "nutanix://{{$.clusterName}}-m1"
      vcpusPerSocket: {{$.vcpusPerSocket}}
      vcpuSockets: {{$.vcpuSockets}}
      memorySize: {{$.memorySize}}
      systemDiskSize: {{$.systemDiskSize}}
      image:
{{- if (eq $.imageIDType "name") }}
        type: name
        name: "{{$.imageName}}"
{{ else if (eq $.imageIDType "uuid") }}
        type: uuid
        uuid: "{{$.imageUUID}}"
{{ end }}
      cluster:
{{- if (eq $fd.Cluster.Type "name") }}
        type: name
        name: "{{ $fd.Cluster.Name }}"
{{- else if (eq $fd.Cluster.Type "uuid") }}
        type: uuid
        uuid: "{{ $fd.Cluster.UUID }}"
{{ end }}
      subnet:
{{- range $subnet := $fd.Subnets }}
{{- if (eq $subnet.Type "name") }}
      - type: name
        name: "{{ $subnet.Name }}"
{{- else if (eq $subnet.Type "uuid") }}
      - type: uuid
        uuid: "{{ $subnet.UUID }}"
{{- end }}
{{- end }}
{{- if $.projectIDType}}
      project:
{{- if (eq $.projectIDType "name") }}
        type: name
        name: "{{$.projectName}}"
{{- else if (eq $.projectIDType "uuid") }}
        type: uuid
        uuid: "{{$.projectUUID}}"
{{ end }}
{{ end }}
{{- if $.additionalCategories}}
      additionalCategories:
{{- range $.additionalCategories}}
      - key:   "{{ .Key }}"
        value: "{{ .Value }}"
{{- end }}
{{- end }}
---
{{ end -}}
{{- else -}}
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineDeployment
metadata:
@@ -107,6 +210,7 @@ spec:
{{- end }}
{{- end }}
---
{{ end -}}
apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
kind: KubeadmConfigTemplate
metadata:
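To see how Go's text/template evaluates the constructs used above, in particular index $.failureDomainsReplicas $fd.Name, here is a minimal runnable sketch over a stripped-down fragment of the MachineDeployment template (this is not the provider's actual rendering code; names and values are hypothetical):

    package main

    import (
        "os"
        "text/template"
    )

    // Stand-in for the failure-domain objects the provider passes in.
    type failureDomain struct{ Name string }

    func main() {
        // A stripped-down fragment of the MachineDeployment template above.
        const md = `{{- range $fd := $.failureDomains }}
    name: "{{ $.workerNodeGroupName }}-{{ $fd.Name }}"
    replicas: {{ index $.failureDomainsReplicas $fd.Name }}
    ---
    {{ end -}}`

        tmpl := template.Must(template.New("md").Parse(md))
        data := map[string]interface{}{
            "workerNodeGroupName":    "mycluster-md-0",
            "failureDomains":         []failureDomain{{Name: "pe-1"}, {Name: "pe-2"}},
            "failureDomainsReplicas": map[string]int{"pe-1": 2, "pe-2": 1},
        }
        // Emits one name/replicas block per failure domain.
        if err := tmpl.Execute(os.Stdout, data); err != nil {
            panic(err)
        }
    }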
12 changes: 6 additions & 6 deletions pkg/providers/nutanix/provider_test.go
@@ -312,13 +312,13 @@ func TestNutanixProviderSetupAndValidateCreate(t *testing.T) {
            name:            "cluster config with unsupported upgrade strategy configuration for cp",
            clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_cp.yaml",
            expectErr:       true,
-           expectErrStr:    "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
+           expectErrStr:    "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
        },
        {
            name:            "cluster config with unsupported upgrade strategy configuration for md",
            clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_md.yaml",
            expectErr:       true,
-           expectErrStr:    "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
+           expectErrStr:    "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
        },
    }

@@ -508,13 +508,13 @@ func TestNutanixProviderSetupAndValidateDeleteCluster(t *testing.T) {
            name:            "cluster config with unsupported upgrade strategy configuration for cp",
            clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_cp.yaml",
            expectErr:       true,
-           expectErrStr:    "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
+           expectErrStr:    "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
        },
        {
            name:            "cluster config with unsupported upgrade strategy configuration for md",
            clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_md.yaml",
            expectErr:       true,
-           expectErrStr:    "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
+           expectErrStr:    "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
        },
    }

@@ -560,13 +560,13 @@ func TestNutanixProviderSetupAndValidateUpgradeCluster(t *testing.T) {
            name:            "cluster config with unsupported upgrade strategy configuration for cp",
            clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_cp.yaml",
            expectErr:       true,
-           expectErrStr:    "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
+           expectErrStr:    "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
        },
        {
            name:            "cluster config with unsupported upgrade strategy configuration for md",
            clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_md.yaml",
            expectErr:       true,
-           expectErrStr:    "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
+           expectErrStr:    "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
        },
    }

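The only change in these tests is the expected error string, which tracks the provider's message being lowercased, presumably to follow the Go convention that error strings are not capitalized.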
45 changes: 45 additions & 0 deletions pkg/providers/nutanix/template.go
@@ -346,10 +346,53 @@ func buildTemplateMapCP(
    return values, nil
}

func calcFailureDomainReplicas(workerNodeGroupConfiguration v1alpha1.WorkerNodeGroupConfiguration, failureDomains []v1alpha1.NutanixDatacenterFailureDomain) map[string]int {
    replicasPerFailureDomain := make(map[string]int)
    failureDomainCount := len(failureDomains)

    if workerNodeGroupConfiguration.AutoScalingConfiguration != nil {
        return replicasPerFailureDomain
    }

    if failureDomainCount == 0 {
        return replicasPerFailureDomain
    }

    workerNodeGroupCount := failureDomainCount
    if workerNodeGroupConfiguration.Count != nil {
        workerNodeGroupCount = int(*workerNodeGroupConfiguration.Count)
    }

    minCount := workerNodeGroupCount / failureDomainCount

    for i := 0; i < len(failureDomains); i++ {
        replicasPerFailureDomain[failureDomains[i].Name] = minCount
    }
    replicasPerFailureDomain[failureDomains[0].Name] = workerNodeGroupCount - (failureDomainCount-1)*minCount

    return replicasPerFailureDomain
}
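A quick worked example of the split, assuming WorkerNodeGroupConfiguration.Count is a plain *int as in the v1alpha1 API (values hypothetical): 7 desired workers across 3 failure domains give each domain floor(7/3) = 2 replicas, and the first domain absorbs the remainder.

    count := 7
    wng := v1alpha1.WorkerNodeGroupConfiguration{Count: &count}
    fds := []v1alpha1.NutanixDatacenterFailureDomain{
        {Name: "fd-1"}, {Name: "fd-2"}, {Name: "fd-3"},
    }

    replicas := calcFailureDomainReplicas(wng, fds)
    // replicas: map[fd-1:3 fd-2:2 fd-3:2], since 7 - (3-1)*2 = 3 and 3+2+2 = 7.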

func getFailureDomainsForWorkerNodeGroup(allFailureDomains []v1alpha1.NutanixDatacenterFailureDomain, workerNodeGroupConfigurationName string) []v1alpha1.NutanixDatacenterFailureDomain {
    result := make([]v1alpha1.NutanixDatacenterFailureDomain, 0)
    for _, fd := range allFailureDomains {
        for _, workerMachineGroup := range fd.WorkerMachineGroups {
            if workerMachineGroup == workerNodeGroupConfigurationName {
                result = append(result, fd)
            }
        }
    }

    return result
}

func buildTemplateMapMD(clusterSpec *cluster.Spec, workerNodeGroupMachineSpec v1alpha1.NutanixMachineConfigSpec, workerNodeGroupConfiguration v1alpha1.WorkerNodeGroupConfiguration) (map[string]interface{}, error) {
    versionsBundle := clusterSpec.WorkerNodeGroupVersionsBundle(workerNodeGroupConfiguration)
    format := "cloud-config"

    failureDomainsForWorkerNodeGroup := getFailureDomainsForWorkerNodeGroup(clusterSpec.NutanixDatacenter.Spec.FailureDomains, workerNodeGroupConfiguration.Name)
    replicasPerFailureDomain := calcFailureDomainReplicas(workerNodeGroupConfiguration, failureDomainsForWorkerNodeGroup)

    values := map[string]interface{}{
        "clusterName":         clusterSpec.Cluster.Name,
        "eksaSystemNamespace": constants.EksaSystemNamespace,
@@ -374,6 +417,8 @@ func buildTemplateMapMD(clusterSpec *cluster.Spec, workerNodeGroupMachineSpec v1
"subnetUUID": workerNodeGroupMachineSpec.Subnet.UUID,
"workerNodeGroupName": fmt.Sprintf("%s-%s", clusterSpec.Cluster.Name, workerNodeGroupConfiguration.Name),
"workerNodeGroupTaints": workerNodeGroupConfiguration.Taints,
"failureDomains": failureDomainsForWorkerNodeGroup,
"failureDomainsReplicas": replicasPerFailureDomain,
}

    if clusterSpec.Cluster.Spec.RegistryMirrorConfiguration != nil {