From 0de0032727a286b53691594928f2a1e3864b64a1 Mon Sep 17 00:00:00 2001 From: Honigeintopf Date: Mon, 4 Nov 2024 14:40:51 +0100 Subject: [PATCH] Added integration tests and deletion of fw after unhealthytimeout --- controllers/set/delete.go | 51 ++++++------ controllers/set/reconcile.go | 2 +- integration/integration_test.go | 116 +++++++++++++++++----------- integration/metal_resources_test.go | 86 +++++++++++++++++++++ integration/suite_test.go | 2 +- 5 files changed, 186 insertions(+), 71 deletions(-) diff --git a/controllers/set/delete.go b/controllers/set/delete.go index fa8f985..4988449 100644 --- a/controllers/set/delete.go +++ b/controllers/set/delete.go @@ -46,32 +46,28 @@ func (c *controller) deleteFirewalls(r *controllers.Ctx[*v2.FirewallSet], fws .. return nil } -func (c *controller) deleteAfterTimeout(r *controllers.Ctx[*v2.FirewallSet], fws ...*v2.Firewall) ([]*v2.Firewall, error) { +func (c *controller) deleteIfUnhealthyOrTimeout(r *controllers.Ctx[*v2.FirewallSet], fws ...*v2.Firewall) ([]*v2.Firewall, error) { var result []*v2.Firewall for _, fw := range fws { fw := fw - if fw.Status.Phase != v2.FirewallPhaseCreating { - continue - } - connected := pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerConnected)).Status == v2.ConditionTrue if c.isFirewallUnhealthy(fw) { - allocationTimestamp := pointer.SafeDeref(fw.Status.MachineStatus).AllocationTimestamp - if time.Since(allocationTimestamp.Time) > c.c.GetFirewallHealthTimeout() { - r.Log.Info("unhealthy firewall not recovering, deleting from set", "firewall-name", fw.Name) - - err := c.deleteFirewalls(r, fw) - if err != nil { - return nil, err - } - - result = append(result, fw) - continue + r.Log.Info("unhealthy firewall not recovering, deleting from set", "firewall-name", fw.Name) + err := c.deleteFirewalls(r, fw) + if err != nil { + return nil, err } + result = append(result, fw) + continue } + + if fw.Status.Phase != v2.FirewallPhaseCreating { + continue + } + if !connected && time.Since(fw.CreationTimestamp.Time) > c.c.GetCreateTimeout() { r.Log.Info("firewall not getting ready, deleting from set", "firewall-name", fw.Name) @@ -89,11 +85,22 @@ func (c *controller) deleteAfterTimeout(r *controllers.Ctx[*v2.FirewallSet], fws } func (c *controller) isFirewallUnhealthy(fw *v2.Firewall) bool { - created := pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallCreated)).Status == v2.ConditionTrue - ready := pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallReady)).Status == v2.ConditionTrue - connected := pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerConnected)).Status == v2.ConditionTrue - seedConnected := pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerSeedConnected)).Status == v2.ConditionTrue - distance := pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallDistanceConfigured)).Status == v2.ConditionTrue - return !(created && ready && connected && seedConnected && distance) + var ( + created = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallCreated)).Status == v2.ConditionTrue + ready = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallReady)).Status == v2.ConditionTrue + connected = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerConnected)).Status == v2.ConditionTrue + seedConnected = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerSeedConnected)).Status == v2.ConditionTrue + distance = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallDistanceConfigured)).Status == v2.ConditionTrue + ) + + if created && ready && connected && seedConnected && distance { + return false + } + + if created && time.Since(pointer.SafeDeref(fw.Status.MachineStatus).AllocationTimestamp.Time) > c.c.GetFirewallHealthTimeout() { + return true + } + + return false } diff --git a/controllers/set/reconcile.go b/controllers/set/reconcile.go index 6591ea3..53ca2dd 100644 --- a/controllers/set/reconcile.go +++ b/controllers/set/reconcile.go @@ -109,7 +109,7 @@ func (c *controller) Reconcile(r *controllers.Ctx[*v2.FirewallSet]) error { } } - deletedFws, err := c.deleteAfterTimeout(r, ownedFirewalls...) + deletedFws, err := c.deleteIfUnhealthyOrTimeout(r, ownedFirewalls...) if err != nil { return err } diff --git a/integration/integration_test.go b/integration/integration_test.go index b4f6421..7d48b99 100644 --- a/integration/integration_test.go +++ b/integration/integration_test.go @@ -162,53 +162,6 @@ var _ = Context("integration test", Ordered, func() { Expect(client.IgnoreAlreadyExists(k8sClient.Create(ctx, shootTokenSecret.DeepCopy()))).To(Succeed()) }) - When("creating a firewall deployment that simulates unhealthiness", Ordered, func() { - var fwSet *v2.FirewallSet - - BeforeAll(func() { - // Create the Firewall Deployment - fwDeployment := deployment() - Expect(k8sClient.Create(ctx, fwDeployment)).To(Succeed()) - - // Wait for the FirewallSet to be created - Eventually(func() error { - fwSetList := &v2.FirewallSetList{} - err := k8sClient.List(ctx, fwSetList, client.InNamespace(namespaceName)) - if err != nil { - return err - } - if len(fwSetList.Items) == 0 { - return fmt.Errorf("no firewall sets found") - } - fwSet = &fwSetList.Items[0] - return nil - }, 15*time.Second, interval).Should(Succeed(), "FirewallSet should be created") - }) - - It("should update the deployment status to reflect the unhealthy replica", func() { - // Simulate unhealthiness by updating the FirewallSet status - fwSet.Status.UnhealthyReplicas = 1 - Expect(k8sClient.Status().Update(ctx, fwSet)).To(Succeed()) - - // Wait for the deployment status to reflect the unhealthy replica - Eventually(func() int { - fetchedDeployment := &v2.FirewallDeployment{} - Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(deployment()), fetchedDeployment)).To(Succeed()) - return fetchedDeployment.Status.UnhealthyReplicas - }, 15*time.Second, interval).Should(Equal(1), "unhealthy replicas should be reported") - }) - - It("should eventually replace the unhealthy firewall", func() { - // Wait for the controller to replace the unhealthy firewall - Eventually(func() bool { - fwSetList := &v2.FirewallSetList{} - Expect(k8sClient.List(ctx, fwSetList, client.InNamespace(namespaceName))).To(Succeed()) - // Check if a new FirewallSet has been created - return len(fwSetList.Items) > 1 - }, 60*time.Second, interval).Should(BeTrue(), "A new FirewallSet should be created to replace the unhealthy one") - }) - }) - Describe("the rolling update", Ordered, func() { When("creating a firewall deployment", Ordered, func() { It("the creation works", func() { @@ -1960,4 +1913,73 @@ var _ = Context("integration test", Ordered, func() { }) + When("creating a firewall set that simulates unhealthiness", Ordered, func() { + var firewallSet *v2.FirewallSet + + BeforeAll(func() { + swapMetalClient(&metalclient.MetalMockFns{ + Firewall: func(m *mock.Mock) { + m.On("AllocateFirewall", mock.Anything, nil).Return(&metalfirewall.AllocateFirewallOK{Payload: firewall3}, nil).Maybe() + m.On("FindFirewall", mock.Anything, nil).Return(&metalfirewall.FindFirewallOK{Payload: firewall3}, nil).Maybe() + m.On("FindFirewalls", mock.Anything, nil).Return(&metalfirewall.FindFirewallsOK{Payload: []*models.V1FirewallResponse{firewall3}}, nil).Maybe() + }, + Network: func(m *mock.Mock) { + m.On("FindNetwork", mock.Anything, nil).Return(&network.FindNetworkOK{Payload: network1}, nil).Maybe() + }, + Machine: func(m *mock.Mock) { + m.On("UpdateMachine", mock.Anything, nil).Return(&machine.UpdateMachineOK{Payload: &models.V1MachineResponse{}}, nil).Maybe() + m.On("FreeMachine", mock.Anything, nil).Return(&machine.FreeMachineOK{Payload: &models.V1MachineResponse{ID: firewall3.ID}}, nil).Maybe() + }, + Image: func(m *mock.Mock) { + m.On("FindLatestImage", mock.Anything, nil).Return(&image.FindLatestImageOK{Payload: image1}, nil).Maybe() + }, + }) + + Expect(k8sClient.Create(ctx, deployment())).To(Succeed()) + Eventually(func() error { + firewallSetList := &v2.FirewallSetList{} + err := k8sClient.List(ctx, firewallSetList, client.InNamespace(namespaceName)) + if err != nil { + return err + } + if len(firewallSetList.Items) == 0 { + return fmt.Errorf("no firewall sets found") + } + firewallSet = &firewallSetList.Items[0] + return nil + }, 15*time.Second, interval).Should(Succeed(), "FirewallSet should be created") + }) + + It("should simulate unhealthiness and trigger deletion", func() { + firewallList := &v2.FirewallList{} + Eventually(func() int { + + err := k8sClient.List(ctx, firewallList, client.InNamespace(firewallSet.Namespace)) + if err != nil { + return 0 + } + return len(firewallList.Items) + }, 15*time.Second, interval).Should(BeNumerically(">", 0), "Should have at least one firewall") + + By("waiting for the firewall to be deleted") + Eventually(func() bool { + for _, fw := range firewallList.Items { + err := k8sClient.Get(ctx, client.ObjectKeyFromObject(&fw), &v2.Firewall{}) + if !apierrors.IsNotFound(err) { + return false + } + } + return true + }, 10*time.Second, interval).Should(BeTrue(), "All Firewalls should be deleted") + + By("verifying that a new firewall has been created") + Eventually(func() int { + newFirewallList := &v2.FirewallList{} + Expect(k8sClient.List(ctx, newFirewallList, client.InNamespace(firewallSet.Namespace))).To(Succeed()) + return len(newFirewallList.Items) + }, 10*time.Second, interval).Should(Equal(1), "A new firewall should be created") + }) + + }) + }) diff --git a/integration/metal_resources_test.go b/integration/metal_resources_test.go index ef87655..00b2c86 100644 --- a/integration/metal_resources_test.go +++ b/integration/metal_resources_test.go @@ -311,6 +311,92 @@ var ( Vrf: 50, Vrfshared: true, } + firewall3 = &models.V1FirewallResponse{ + Allocation: &models.V1MachineAllocation{ + BootInfo: &models.V1BootInfo{ + Bootloaderid: pointer.Pointer("bootloaderid"), + Cmdline: pointer.Pointer("cmdline"), + ImageID: pointer.Pointer("imageid"), + Initrd: pointer.Pointer("initrd"), + Kernel: pointer.Pointer("kernel"), + OsPartition: pointer.Pointer("ospartition"), + PrimaryDisk: pointer.Pointer("primarydisk"), + }, + Created: pointer.Pointer(strfmt.DateTime(testTime.Add(-20 * 24 * time.Hour))), + Creator: pointer.Pointer("creator"), + Description: "firewall allocation 3", + Filesystemlayout: fsl1, + Hostname: pointer.Pointer("firewall-hostname-3"), + Image: image1, + Name: pointer.Pointer("firewall-3"), + Networks: []*models.V1MachineNetwork{ + { + Asn: pointer.Pointer(int64(200)), + Destinationprefixes: []string{"2.2.2.2"}, + Ips: []string{"1.1.1.1"}, + Nat: pointer.Pointer(false), + Networkid: pointer.Pointer("private"), + Networktype: pointer.Pointer(net.PrivatePrimaryUnshared), + Prefixes: []string{"prefixes"}, + Private: pointer.Pointer(true), + Underlay: pointer.Pointer(false), + Vrf: pointer.Pointer(int64(100)), + }, + }, + Project: pointer.Pointer("project-1"), + Reinstall: pointer.Pointer(false), + Role: pointer.Pointer(models.V1MachineAllocationRoleFirewall), + SSHPubKeys: []string{"sshpubkey"}, + Succeeded: pointer.Pointer(true), + UserData: "---userdata---", + }, + Bios: &models.V1MachineBIOS{ + Date: pointer.Pointer("biosdata"), + Vendor: pointer.Pointer("biosvendor"), + Version: pointer.Pointer("biosversion"), + }, + Description: "firewall 1", + Events: &models.V1MachineRecentProvisioningEvents{ + CrashLoop: pointer.Pointer(true), + FailedMachineReclaim: pointer.Pointer(true), + LastErrorEvent: &models.V1MachineProvisioningEvent{ + Event: pointer.Pointer("Crashed"), + Message: "crash", + Time: strfmt.DateTime(testTime.Add(-10 * 24 * time.Hour)), + }, + LastEventTime: strfmt.DateTime(testTime.Add(-7 * 24 * time.Hour)), + Log: []*models.V1MachineProvisioningEvent{ + { + Event: pointer.Pointer("Phoned Home"), + Message: "phoning home", + Time: strfmt.DateTime(testTime.Add(-7 * 24 * time.Hour)), + }, + }, + }, + Hardware: &models.V1MachineHardware{ + CPUCores: pointer.Pointer(int32(16)), + Disks: []*models.V1MachineBlockDevice{}, + Memory: pointer.Pointer(int64(32)), + Nics: []*models.V1MachineNic{}, + }, + ID: pointer.Pointer("3"), + Ledstate: &models.V1ChassisIdentifyLEDState{ + Description: pointer.Pointer(""), + Value: pointer.Pointer(""), + }, + Liveliness: pointer.Pointer("Unhealthy"), + Name: "firewall-3", + Partition: partition1, + Rackid: "rack-1", + Size: size1, + State: &models.V1MachineState{ + Description: pointer.Pointer("state"), + Issuer: "issuer", + MetalHammerVersion: pointer.Pointer("version"), + Value: pointer.Pointer(""), + }, + Tags: []string{"a"}, + } ) // we are sharing a client for the tests, so we need to make sure we do not run contradicting tests in parallel diff --git a/integration/suite_test.go b/integration/suite_test.go index 608e49f..e3834a9 100644 --- a/integration/suite_test.go +++ b/integration/suite_test.go @@ -130,7 +130,7 @@ var _ = BeforeSuite(func() { ClusterTag: fmt.Sprintf("%s=%s", tag.ClusterID, "cluster-a"), SafetyBackoff: 10 * time.Second, ProgressDeadline: 10 * time.Minute, - FirewallHealthTimeout: 20 * time.Minute, + FirewallHealthTimeout: 19 * 24 * time.Hour, CreateTimeout: 10 * time.Minute, }) Expect(err).ToNot(HaveOccurred())