From 3c6545fcc3880d719915444bfe361cb196b31298 Mon Sep 17 00:00:00 2001
From: Jeev B
Date: Thu, 17 Aug 2023 15:48:57 -0700
Subject: [PATCH] Fix affinity injection logic if partition size is not provided

Signed-off-by: Jeev B
---
 .../pluginmachinery/flytek8s/pod_helper.go    |  14 +++
 .../flytek8s/pod_helper_test.go               | 116 ++++++++++++------
 2 files changed, 92 insertions(+), 38 deletions(-)

diff --git a/go/tasks/pluginmachinery/flytek8s/pod_helper.go b/go/tasks/pluginmachinery/flytek8s/pod_helper.go
index 8bd6b7b7a..81eadd2c1 100755
--- a/go/tasks/pluginmachinery/flytek8s/pod_helper.go
+++ b/go/tasks/pluginmachinery/flytek8s/pod_helper.go
@@ -110,6 +110,7 @@ func ApplyNodeSelectors(podSpec *v1.PodSpec, selectors ...*core.Selector) {
 		podSpec.Affinity = &v1.Affinity{}
 	}
 
+	gpuPartitionSizeSpecified := false
 	for _, selector := range selectors {
 		var ns v1.NodeSelectorRequirement
 		switch selector.GetSelection().(type) {
@@ -120,6 +121,7 @@ func ApplyNodeSelectors(podSpec *v1.PodSpec, selectors ...*core.Selector) {
 				Values:   []string{selector.GetGpuDevice()},
 			}
 		case *core.Selector_GpuPartitionSize:
+			gpuPartitionSizeSpecified = true
 			ns = v1.NodeSelectorRequirement{
 				Key:      config.GetK8sPluginConfig().GpuPartitionSizeNodeLabel,
 				Operator: v1.NodeSelectorOpIn,
@@ -133,6 +135,18 @@ func ApplyNodeSelectors(podSpec *v1.PodSpec, selectors ...*core.Selector) {
 			AddRequiredNodeSelectorRequirements(podSpec.Affinity, ns)
 		}
 	}
+
+	// If a gpu partition size selector was not specified, we assume that the user
+	// wants full, unpartitioned GPUs.
+	if !gpuPartitionSizeSpecified {
+		AddRequiredNodeSelectorRequirements(
+			podSpec.Affinity,
+			v1.NodeSelectorRequirement{
+				Key:      config.GetK8sPluginConfig().GpuPartitionSizeNodeLabel,
+				Operator: v1.NodeSelectorOpDoesNotExist,
+			},
+		)
+	}
 }
 
 // UpdatePod updates the base pod spec used to execute tasks. This is configured with plugins and task metadata-specific options
diff --git a/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go b/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go
index 3895dc2ee..0b649751f 100755
--- a/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go
+++ b/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go
@@ -417,56 +417,96 @@ func TestApplyInterruptibleNodeAffinity(t *testing.T) {
 }
 
 func TestApplyNodeSelectors(t *testing.T) {
-	podSpec := &v1.PodSpec{
-		Affinity: &v1.Affinity{},
-	}
-	ApplyNodeSelectors(
-		podSpec,
-		&core.Selector{
-			Selection: &core.Selector_GpuDevice{
-				GpuDevice: "nvidia-tesla-a100",
+	t.Run("with gpu device and partition size", func(t *testing.T) {
+		podSpec := &v1.PodSpec{
+			Affinity: &v1.Affinity{},
+		}
+		ApplyNodeSelectors(
+			podSpec,
+			&core.Selector{
+				Selection: &core.Selector_GpuDevice{
+					GpuDevice: "nvidia-tesla-a100",
+				},
 			},
-		},
-		&core.Selector{
-			Selection: &core.Selector_GpuPartitionSize{
-				GpuPartitionSize: "1g.5gb",
+			&core.Selector{
+				Selection: &core.Selector_GpuPartitionSize{
+					GpuPartitionSize: "1g.5gb",
+				},
+				OnlyPreferred: true,
 			},
-			OnlyPreferred: true,
-		},
-	)
-	assert.EqualValues(
-		t,
-		[]v1.NodeSelectorTerm{
-			v1.NodeSelectorTerm{
-				MatchExpressions: []v1.NodeSelectorRequirement{
-					v1.NodeSelectorRequirement{
-						Key:      config.GetK8sPluginConfig().GpuDeviceNodeLabel,
-						Operator: v1.NodeSelectorOpIn,
-						Values:   []string{"nvidia-tesla-a100"},
+		)
+		assert.EqualValues(
+			t,
+			[]v1.NodeSelectorTerm{
+				v1.NodeSelectorTerm{
+					MatchExpressions: []v1.NodeSelectorRequirement{
+						v1.NodeSelectorRequirement{
+							Key:      config.GetK8sPluginConfig().GpuDeviceNodeLabel,
+							Operator: v1.NodeSelectorOpIn,
+							Values:   []string{"nvidia-tesla-a100"},
+						},
 					},
 				},
 			},
-		},
-		podSpec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms,
-	)
-	assert.EqualValues(
-		t,
-		[]v1.PreferredSchedulingTerm{
-			v1.PreferredSchedulingTerm{
-				Weight: 10,
-				Preference: v1.NodeSelectorTerm{
+			podSpec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms,
+		)
+		assert.EqualValues(
+			t,
+			[]v1.PreferredSchedulingTerm{
+				v1.PreferredSchedulingTerm{
+					Weight: 10,
+					Preference: v1.NodeSelectorTerm{
+						MatchExpressions: []v1.NodeSelectorRequirement{
+							v1.NodeSelectorRequirement{
+								Key:      config.GetK8sPluginConfig().GpuPartitionSizeNodeLabel,
+								Operator: v1.NodeSelectorOpIn,
+								Values:   []string{"1g.5gb"},
+							},
+						},
+					},
+				},
+			},
+			podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution,
+		)
+	})
+
+	t.Run("with gpu device but without partition size", func(t *testing.T) {
+		podSpec := &v1.PodSpec{
+			Affinity: &v1.Affinity{},
+		}
+		ApplyNodeSelectors(
+			podSpec,
+			&core.Selector{
+				Selection: &core.Selector_GpuDevice{
+					GpuDevice: "nvidia-tesla-a100",
+				},
+			},
+		)
+		assert.EqualValues(
+			t,
+			[]v1.NodeSelectorTerm{
+				v1.NodeSelectorTerm{
 					MatchExpressions: []v1.NodeSelectorRequirement{
 						v1.NodeSelectorRequirement{
-							Key:      config.GetK8sPluginConfig().GpuPartitionSizeNodeLabel,
+							Key:      config.GetK8sPluginConfig().GpuDeviceNodeLabel,
 							Operator: v1.NodeSelectorOpIn,
-							Values:   []string{"1g.5gb"},
+							Values:   []string{"nvidia-tesla-a100"},
+						},
+						v1.NodeSelectorRequirement{
+							Key:      config.GetK8sPluginConfig().GpuPartitionSizeNodeLabel,
+							Operator: v1.NodeSelectorOpDoesNotExist,
 						},
 					},
 				},
 			},
-		},
-		podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution,
-	)
+			podSpec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms,
+		)
+		assert.EqualValues(
+			t,
+			0,
+			len(podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution),
+		)
+	})
 }
 
 func updatePod(t *testing.T) {