diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index b4ec01ae..f3fe88e5 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -65,6 +65,7 @@ jobs: - "Workload cluster creation" - "Workload cluster scaling" - "Workload cluster upgrade" + - "Certificate Refresh" - "Orchestrated In place upgrades" # TODO(ben): Remove once all tests are running stable. fail-fast: false diff --git a/bootstrap/api/v1beta2/certificates_refresh_consts.go b/bootstrap/api/v1beta2/certificates_refresh_consts.go index ec53cd06..f4cbe39c 100644 --- a/bootstrap/api/v1beta2/certificates_refresh_consts.go +++ b/bootstrap/api/v1beta2/certificates_refresh_consts.go @@ -1,7 +1,14 @@ package v1beta2 const ( - CertificatesRefreshAnnotation = "v1beta2.k8sd.io/refresh-certificates" + CertificatesRefreshAnnotation = "v1beta2.k8sd.io/refresh-certificates" + CertificatesRefreshStatusAnnotation = "v1beta2.k8sd.io/refresh-certificates-status" +) + +const ( + CertificatesRefreshInProgressStatus = "in-progress" + CertificatesRefreshDoneStatus = "done" + CertificatesRefreshFailedStatus = "failed" ) const ( diff --git a/bootstrap/controllers/certificates_controller.go b/bootstrap/controllers/certificates_controller.go index e6d89ade..0142d3f0 100644 --- a/bootstrap/controllers/certificates_controller.go +++ b/bootstrap/controllers/certificates_controller.go @@ -27,18 +27,25 @@ import ( // CertificatesReconciler reconciles a Machine's certificates. type CertificatesReconciler struct { client.Client - Log logr.Logger - Scheme *runtime.Scheme - recorder record.EventRecorder - - K8sdDialTimeout time.Duration - + Log logr.Logger + Scheme *runtime.Scheme + recorder record.EventRecorder + K8sdDialTimeout time.Duration managementCluster ck8s.ManagementCluster } +type CertificatesScope struct { + Cluster *clusterv1.Cluster + Config *bootstrapv1.CK8sConfig + Log logr.Logger + Machine *clusterv1.Machine + Patcher *patch.Helper + Workload *ck8s.Workload +} + // SetupWithManager sets up the controller with the Manager. func (r *CertificatesReconciler) SetupWithManager(mgr ctrl.Manager) error { - if _, err := ctrl.NewControllerManagedBy(mgr).For(&clusterv1.Machine{}).Build(r); err != nil { + if err := ctrl.NewControllerManagedBy(mgr).For(&clusterv1.Machine{}).Complete(r); err != nil { return err } @@ -54,15 +61,6 @@ func (r *CertificatesReconciler) SetupWithManager(mgr ctrl.Manager) error { return nil } -type CertificatesScope struct { - Cluster *clusterv1.Cluster - Config *bootstrapv1.CK8sConfig - Log logr.Logger - Machine *clusterv1.Machine - Patcher *patch.Helper - Workload *ck8s.Workload -} - // +kubebuilder:rbac:groups=bootstrap.cluster.x-k8s.io,resources=ck8sconfigs,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=bootstrap.cluster.x-k8s.io,resources=ck8sconfigs/status,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters;clusters/status;machines;machines/status,verbs=get;list;watch @@ -77,97 +75,133 @@ func (r *CertificatesReconciler) Reconcile(ctx context.Context, req ctrl.Request if apierrors.IsNotFound(err) { return ctrl.Result{}, nil } - // Error reading the object - requeue the request. return ctrl.Result{}, err } + if m.Status.NodeRef == nil { + // If the machine does not have a node ref, we requeue the request to retry. + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + if !m.ObjectMeta.DeletionTimestamp.IsZero() { // Machine is being deleted, return early. 
return ctrl.Result{}, nil } mAnnotations := m.GetAnnotations() + if mAnnotations == nil { + mAnnotations = map[string]string{} + } var refreshCertificates, hasExpiryDateAnnotation bool _, refreshCertificates = mAnnotations[bootstrapv1.CertificatesRefreshAnnotation] _, hasExpiryDateAnnotation = mAnnotations[bootstrapv1.MachineCertificatesExpiryDateAnnotation] + + if mAnnotations[bootstrapv1.CertificatesRefreshStatusAnnotation] == bootstrapv1.CertificatesRefreshInProgressStatus { + if !refreshCertificates { + // If a refresh is in progress but the refresh annotation is missing + // clear the status. + delete(mAnnotations, bootstrapv1.CertificatesRefreshStatusAnnotation) + m.SetAnnotations(mAnnotations) + if err := r.Client.Update(ctx, m); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to clear status annotation: %w", err) + } + return ctrl.Result{}, nil + } + log.Info("Certificates refresh already in progress", + "refreshStatus", bootstrapv1.CertificatesRefreshInProgressStatus, + "refreshAnnotation", mAnnotations[bootstrapv1.CertificatesRefreshAnnotation], + ) + return ctrl.Result{}, nil + } + if !refreshCertificates && hasExpiryDateAnnotation { // No need to refresh certificates or update expiry date, return early. return ctrl.Result{}, nil } - // Look up for the CK8sConfig. + scope, err := r.createScope(ctx, m, log) + if err != nil { + return ctrl.Result{}, err + } + + if !hasExpiryDateAnnotation { + if err := r.updateExpiryDateAnnotation(ctx, scope); err != nil { + return ctrl.Result{}, err + } + } + + if refreshCertificates { + if err := r.refreshCertificates(ctx, scope); err != nil { + // On error, we requeue the request to retry. + mAnnotations[bootstrapv1.CertificatesRefreshStatusAnnotation] = bootstrapv1.CertificatesRefreshFailedStatus + m.SetAnnotations(mAnnotations) + if err := r.Client.Update(ctx, m); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to clear status annotation after error: %w", err) + } + return ctrl.Result{}, err + } + } + + return ctrl.Result{}, nil +} + +func (r *CertificatesReconciler) createScope(ctx context.Context, m *clusterv1.Machine, log logr.Logger) (*CertificatesScope, error) { config := &bootstrapv1.CK8sConfig{} if err := r.Client.Get(ctx, types.NamespacedName{Namespace: m.Namespace, Name: m.Spec.Bootstrap.ConfigRef.Name}, config); err != nil { - return ctrl.Result{}, err + return nil, fmt.Errorf("failed to get CK8sConfig: %w", err) } - // Get the owner of the CK8sConfig to determine if it's a control plane or worker node. 
configOwner, err := bsutil.GetConfigOwner(ctx, r.Client, config) - if err != nil { - log.Error(err, "Failed to get config owner") - return ctrl.Result{}, err - } - if configOwner == nil { - return ctrl.Result{}, nil + if err != nil || configOwner == nil { + return nil, fmt.Errorf("failed to get config owner: %w", err) } cluster, err := util.GetClusterByName(ctx, r.Client, m.GetNamespace(), m.Spec.ClusterName) if err != nil { - return ctrl.Result{}, err + return nil, fmt.Errorf("failed to get cluster: %w", err) } - microclusterPort := config.Spec.ControlPlaneConfig.GetMicroclusterPort() - workload, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster), microclusterPort) + workload, err := r.managementCluster.GetWorkloadCluster( + ctx, + util.ObjectKey(cluster), + config.Spec.ControlPlaneConfig.GetMicroclusterPort(), + ) if err != nil { - return ctrl.Result{}, err + return nil, fmt.Errorf("failed to get workload cluster: %w", err) } patchHelper, err := patch.NewHelper(m, r.Client) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to create patch helper for machine: %w", err) + return nil, fmt.Errorf("failed to create patch helper: %w", err) } - scope := &CertificatesScope{ + return &CertificatesScope{ Log: log, Machine: m, Config: config, Cluster: cluster, Patcher: patchHelper, Workload: workload, - } - - if !hasExpiryDateAnnotation { - if err := r.updateExpiryDateAnnotation(ctx, scope); err != nil { - return ctrl.Result{}, err - } - } - - if refreshCertificates { - if configOwner.IsControlPlaneMachine() { - if err := r.refreshControlPlaneCertificates(ctx, scope); err != nil { - return ctrl.Result{}, err - } - } else { - log.Info("worker nodes are not supported yet") - return ctrl.Result{}, nil - } - } - - return ctrl.Result{}, nil + }, nil } -func (r *CertificatesReconciler) refreshControlPlaneCertificates(ctx context.Context, scope *CertificatesScope) error { +func (r *CertificatesReconciler) refreshCertificates(ctx context.Context, scope *CertificatesScope) error { nodeToken, err := token.LookupNodeToken(ctx, r.Client, util.ObjectKey(scope.Cluster), scope.Machine.Name) if err != nil { return fmt.Errorf("failed to lookup node token: %w", err) } mAnnotations := scope.Machine.GetAnnotations() - refreshAnnotation, ok := mAnnotations[bootstrapv1.CertificatesRefreshAnnotation] if !ok { - return nil + return fmt.Errorf("refresh annotation not found") + } + + mAnnotations[bootstrapv1.CertificatesRefreshStatusAnnotation] = bootstrapv1.CertificatesRefreshInProgressStatus + scope.Machine.SetAnnotations(mAnnotations) + if err := scope.Patcher.Patch(ctx, scope.Machine); err != nil { + return fmt.Errorf("failed to set in-progress status: %w", err) } r.recorder.Eventf( @@ -179,16 +213,31 @@ func (r *CertificatesReconciler) refreshControlPlaneCertificates(ctx context.Con seconds, err := utiltime.TTLToSeconds(refreshAnnotation) if err != nil { - return fmt.Errorf("failed to parse expires-in annotation value: %w", err) + return fmt.Errorf("failed to parse TTL: %w", err) } - controlPlaneConfig := scope.Config.Spec.ControlPlaneConfig - controlPlaneEndpoint := scope.Cluster.Spec.ControlPlaneEndpoint.Host - - extraSANs := controlPlaneConfig.ExtraSANs - extraSANs = append(extraSANs, controlPlaneEndpoint) + var expirySecondsUnix int + configOwner, _ := bsutil.GetConfigOwner(ctx, r.Client, scope.Config) + if configOwner.IsControlPlaneMachine() { + var extraSANs []string + extraSANs = append(extraSANs, scope.Config.Spec.ControlPlaneConfig.ExtraSANs...) 
+ extraSANs = append(extraSANs, scope.Cluster.Spec.ControlPlaneEndpoint.Host) + expirySecondsUnix, err = scope.Workload.RefreshControlPlaneCertificates( + ctx, + scope.Machine, + *nodeToken, + seconds, + extraSANs, + ) + } else { + expirySecondsUnix, err = scope.Workload.RefreshWorkerCertificates( + ctx, + scope.Machine, + *nodeToken, + seconds, + ) + } - expirySecondsUnix, err := scope.Workload.RefreshCertificates(ctx, scope.Machine, *nodeToken, seconds, extraSANs) if err != nil { r.recorder.Eventf( scope.Machine, @@ -200,10 +249,11 @@ func (r *CertificatesReconciler) refreshControlPlaneCertificates(ctx context.Con } expiryTime := time.Unix(int64(expirySecondsUnix), 0) - delete(mAnnotations, bootstrapv1.CertificatesRefreshAnnotation) + mAnnotations[bootstrapv1.CertificatesRefreshStatusAnnotation] = bootstrapv1.CertificatesRefreshDoneStatus mAnnotations[bootstrapv1.MachineCertificatesExpiryDateAnnotation] = expiryTime.Format(time.RFC3339) scope.Machine.SetAnnotations(mAnnotations) + if err := scope.Patcher.Patch(ctx, scope.Machine); err != nil { return fmt.Errorf("failed to patch machine annotations: %w", err) } @@ -230,21 +280,17 @@ func (r *CertificatesReconciler) updateExpiryDateAnnotation(ctx context.Context, return fmt.Errorf("failed to lookup node token: %w", err) } - mAnnotations := scope.Machine.GetAnnotations() - if mAnnotations == nil { - mAnnotations = map[string]string{} - } - expiryDateString, err := scope.Workload.GetCertificatesExpiryDate(ctx, scope.Machine, *nodeToken) if err != nil { return fmt.Errorf("failed to get certificates expiry date: %w", err) } - mAnnotations[bootstrapv1.MachineCertificatesExpiryDateAnnotation] = expiryDateString - scope.Machine.SetAnnotations(mAnnotations) - if err := scope.Patcher.Patch(ctx, scope.Machine); err != nil { - return fmt.Errorf("failed to patch machine annotations: %w", err) + mAnnotations := scope.Machine.GetAnnotations() + if mAnnotations == nil { + mAnnotations = map[string]string{} } - return nil + mAnnotations[bootstrapv1.MachineCertificatesExpiryDateAnnotation] = expiryDateString + scope.Machine.SetAnnotations(mAnnotations) + return scope.Patcher.Patch(ctx, scope.Machine) } diff --git a/go.mod b/go.mod index 4e11c408..8940ca5e 100644 --- a/go.mod +++ b/go.mod @@ -119,7 +119,7 @@ require ( golang.org/x/mod v0.19.0 golang.org/x/net v0.23.0 // indirect golang.org/x/oauth2 v0.18.0 // indirect - golang.org/x/sync v0.6.0 // indirect + golang.org/x/sync v0.8.0 golang.org/x/sys v0.18.0 // indirect golang.org/x/term v0.18.0 // indirect golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index 33ea1288..a69bd9a4 100644 --- a/go.sum +++ b/go.sum @@ -359,8 +359,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= -golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys 
v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/pkg/ck8s/workload_cluster.go b/pkg/ck8s/workload_cluster.go index a52ae00f..63c5f353 100644 --- a/pkg/ck8s/workload_cluster.go +++ b/pkg/ck8s/workload_cluster.go @@ -9,6 +9,7 @@ import ( "strings" apiv1 "github.com/canonical/k8s-snap-api/api/v1" + "golang.org/x/sync/errgroup" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -194,9 +195,7 @@ func (w *Workload) GetCertificatesExpiryDate(ctx context.Context, machine *clust request := apiv1.CertificatesExpiryRequest{} response := &apiv1.CertificatesExpiryResponse{} - header := map[string][]string{ - "node-token": {nodeToken}, - } + header := w.newHeaderWithNodeToken(nodeToken) k8sdProxy, err := w.GetK8sdProxyForMachine(ctx, machine) if err != nil { return "", fmt.Errorf("failed to create k8sd proxy: %w", err) @@ -209,13 +208,30 @@ func (w *Workload) GetCertificatesExpiryDate(ctx context.Context, machine *clust return response.ExpiryDate, nil } -func (w *Workload) RefreshCertificates(ctx context.Context, machine *clusterv1.Machine, nodeToken string, expirationSeconds int, extraSANs []string) (int, error) { +func (w *Workload) ApproveCertificates(ctx context.Context, machine *clusterv1.Machine, seed int) error { + request := apiv1.ClusterAPIApproveWorkerCSRRequest{ + Seed: seed, + } + response := &apiv1.ClusterAPIApproveWorkerCSRResponse{} + k8sdProxy, err := w.GetK8sdProxyForControlPlane(ctx, k8sdProxyOptions{}) + if err != nil { + return fmt.Errorf("failed to create k8sd proxy: %w", err) + } + + header := w.newHeaderWithCAPIAuthToken() + + if err := w.doK8sdRequest(ctx, k8sdProxy, http.MethodPost, "1.0/x/capi/refresh-certs/approve", header, request, response); err != nil { + return fmt.Errorf("failed to approve certificates: %w", err) + } + + return nil +} + +func (w *Workload) refreshCertificatesPlan(ctx context.Context, machine *clusterv1.Machine, nodeToken string) (int, error) { planRequest := apiv1.ClusterAPICertificatesPlanRequest{} planResponse := &apiv1.ClusterAPICertificatesPlanResponse{} - header := map[string][]string{ - "node-token": {nodeToken}, - } + header := w.newHeaderWithNodeToken(nodeToken) k8sdProxy, err := w.GetK8sdProxyForMachine(ctx, machine) if err != nil { @@ -226,17 +242,91 @@ func (w *Workload) RefreshCertificates(ctx context.Context, machine *clusterv1.M return 0, fmt.Errorf("failed to refresh certificates: %w", err) } + return planResponse.Seed, nil +} + +func (w *Workload) refreshCertificatesRun(ctx context.Context, machine *clusterv1.Machine, nodeToken string, request *apiv1.ClusterAPICertificatesRunRequest) (int, error) { + runResponse := &apiv1.ClusterAPICertificatesRunResponse{} + header := w.newHeaderWithNodeToken(nodeToken) + + k8sdProxy, err := w.GetK8sdProxyForMachine(ctx, machine) + if err != nil { + return 0, fmt.Errorf("failed to create k8sd proxy: %w", err) + } + + if err := w.doK8sdRequest(ctx, k8sdProxy, http.MethodPost, "1.0/x/capi/refresh-certs/run", header, request, runResponse); err != nil { + return 0, fmt.Errorf("failed to run refresh certificates: %w", err) + } + + return runResponse.ExpirationSeconds, nil +} + +// RefreshWorkerCertificates approves the worker node CSR and refreshes the certificates. +// The certificate approval process follows these steps: +// 1. 
The CAPI provider calls the /x/capi/refresh-certs/plan endpoint from the +// worker node, which generates the CSRs and creates the CertificateSigningRequest +// objects in the cluster. +// 2. The CAPI provider then calls the /x/capi/refresh-certs/run endpoint with +// the seed. This endpoint waits until the CSR is approved and the certificate +// is signed. Note that this is a blocking call. +// 3. The CAPI provider calls the /x/capi/refresh-certs/approve endpoint from +// any control plane node to approve the CSRs. +// 4. The /x/capi/refresh-certs/run endpoint completes and returns once the +// certificate is approved and signed. +func (w *Workload) RefreshWorkerCertificates(ctx context.Context, machine *clusterv1.Machine, nodeToken string, expirationSeconds int) (int, error) { + seed, err := w.refreshCertificatesPlan(ctx, machine, nodeToken) + if err != nil { + return 0, fmt.Errorf("failed to get refresh certificates plan: %w", err) + } + + request := apiv1.ClusterAPICertificatesRunRequest{ + Seed: seed, + ExpirationSeconds: expirationSeconds, + } + + var seconds int + + eg, ctx := errgroup.WithContext(ctx) + eg.Go(func() error { + seconds, err = w.refreshCertificatesRun(ctx, machine, nodeToken, &request) + if err != nil { + return fmt.Errorf("failed to run refresh certificates: %w", err) + } + return nil + }) + + eg.Go(func() error { + if err := w.ApproveCertificates(ctx, machine, seed); err != nil { + return fmt.Errorf("failed to approve certificates: %w", err) + } + return nil + }) + + if err := eg.Wait(); err != nil { + return 0, fmt.Errorf("failed to refresh worker certificates: %w", err) + } + + return seconds, nil +} + +func (w *Workload) RefreshControlPlaneCertificates(ctx context.Context, machine *clusterv1.Machine, nodeToken string, expirationSeconds int, extraSANs []string) (int, error) { + seed, err := w.refreshCertificatesPlan(ctx, machine, nodeToken) + if err != nil { + return 0, fmt.Errorf("failed to get refresh certificates plan: %w", err) + } + runRequest := apiv1.ClusterAPICertificatesRunRequest{ ExpirationSeconds: expirationSeconds, - Seed: planResponse.Seed, + Seed: seed, ExtraSANs: extraSANs, } - runResponse := &apiv1.ClusterAPICertificatesRunResponse{} - if err := w.doK8sdRequest(ctx, k8sdProxy, http.MethodPost, "1.0/x/capi/refresh-certs/run", header, runRequest, runResponse); err != nil { + + seconds, err := w.refreshCertificatesRun(ctx, machine, nodeToken, &runRequest) + if err != nil { return 0, fmt.Errorf("failed to run refresh certificates: %w", err) } - return runResponse.ExpirationSeconds, nil + return seconds, nil } func (w *Workload) RefreshMachine(ctx context.Context, machine *clusterv1.Machine, nodeToken string, upgradeOption string) (string, error) { @@ -264,9 +354,7 @@ func (w *Workload) RefreshMachine(ctx context.Context, machine *clusterv1.Machin return "", fmt.Errorf("failed to create k8sd proxy: %w", err) } - header := map[string][]string{ - "node-token": {nodeToken}, - } + header := w.newHeaderWithNodeToken(nodeToken) if err := w.doK8sdRequest(ctx, k8sdProxy, http.MethodPost, "1.0/snap/refresh", header, request, response); err != nil { return "", fmt.Errorf("failed to refresh machine %s: %w", machine.Name, err) @@ -285,9 +373,7 @@ func (w *Workload) GetRefreshStatusForMachine(ctx context.Context, machine *clus return nil, fmt.Errorf("failed to create k8sd proxy: %w", err) } - header := map[string][]string{ - "node-token": {nodeToken}, - } + header := w.newHeaderWithNodeToken(nodeToken) if err := w.doK8sdRequest(ctx, k8sdProxy, http.MethodPost, 
"1.0/snap/refresh-status", header, request, response); err != nil { return nil, fmt.Errorf("failed to refresh machine %s: %w", machine.Name, err) @@ -320,9 +406,7 @@ func (w *Workload) requestJoinToken(ctx context.Context, name string, worker boo return "", fmt.Errorf("failed to create k8sd proxy: %w", err) } - header := map[string][]string{ - "capi-auth-token": {w.authToken}, - } + header := w.newHeaderWithCAPIAuthToken() if err := w.doK8sdRequest(ctx, k8sdProxy, http.MethodPost, "1.0/x/capi/generate-join-token", header, request, response); err != nil { return "", fmt.Errorf("failed to get join token: %w", err) @@ -349,9 +433,7 @@ func (w *Workload) RemoveMachineFromCluster(ctx context.Context, machine *cluste return fmt.Errorf("failed to create k8sd proxy: %w", err) } - header := map[string][]string{ - "capi-auth-token": {w.authToken}, - } + header := w.newHeaderWithCAPIAuthToken() if err := w.doK8sdRequest(ctx, k8sdProxy, http.MethodPost, "1.0/x/capi/remove-node", header, request, nil); err != nil { return fmt.Errorf("failed to remove %s from cluster: %w", machine.Name, err) @@ -407,6 +489,20 @@ func (w *Workload) doK8sdRequest(ctx context.Context, k8sdProxy *K8sdClient, met return nil } +// newHeaderWithCAPIAuthToken returns a map with the CAPI auth token as a header. +func (w *Workload) newHeaderWithCAPIAuthToken() map[string][]string { + return map[string][]string{ + "capi-auth-token": {w.authToken}, + } +} + +// newHeaderWithNodeToken returns a map with the node token as a header. +func (w *Workload) newHeaderWithNodeToken(nodeToken string) map[string][]string { + return map[string][]string{ + "node-token": {nodeToken}, + } +} + // UpdateAgentConditions is responsible for updating machine conditions reflecting the status of all the control plane // components. This operation is best effort, in the sense that in case // of problems in retrieving the pod status, it sets the condition to Unknown state without returning any error. 
diff --git a/test/e2e/config/ck8s-docker.yaml b/test/e2e/config/ck8s-docker.yaml index a5175491..8bff3334 100644 --- a/test/e2e/config/ck8s-docker.yaml +++ b/test/e2e/config/ck8s-docker.yaml @@ -105,6 +105,7 @@ intervals: default/wait-nodes-ready: ["10m", "10s"] default/wait-machine-remediation: ["5m", "10s"] default/wait-autoscaler: ["5m", "10s"] + default/wait-machine-refresh: ["5m", "10s"] node-drain/wait-deployment-available: ["3m", "10s"] node-drain/wait-control-plane: ["15m", "10s"] node-drain/wait-machine-deleted: ["2m", "10s"] diff --git a/test/e2e/helpers.go b/test/e2e/helpers.go index 66738ff2..e6d948a3 100644 --- a/test/e2e/helpers.go +++ b/test/e2e/helpers.go @@ -554,6 +554,148 @@ func WaitForControlPlaneAndMachinesReady(ctx context.Context, input WaitForContr }) } +type ApplyCertificateRefreshAndWaitInput struct { + Getter framework.Getter + Machine *clusterv1.Machine + ClusterProxy framework.ClusterProxy + TTL string + WaitForRefreshIntervals []interface{} +} + +func ApplyCertificateRefreshAndWait(ctx context.Context, input ApplyCertificateRefreshAndWaitInput) { + Expect(ctx).NotTo(BeNil()) + Expect(input.Machine).ToNot(BeNil()) + Expect(input.ClusterProxy).ToNot(BeNil()) + Expect(input.TTL).ToNot(BeEmpty()) + + mgmtClient := input.ClusterProxy.GetClient() + + patchHelper, err := patch.NewHelper(input.Machine, mgmtClient) + Expect(err).ToNot(HaveOccurred()) + + mAnnotations := input.Machine.GetAnnotations() + if mAnnotations == nil { + mAnnotations = map[string]string{} + } + + mAnnotations[bootstrapv1.CertificatesRefreshAnnotation] = input.TTL + input.Machine.SetAnnotations(mAnnotations) + err = patchHelper.Patch(ctx, input.Machine) + Expect(err).ToNot(HaveOccurred()) + + By("Waiting for certificates to be refreshed") + Eventually(func() (bool, error) { + machine := &clusterv1.Machine{} + if err := input.Getter.Get(ctx, client.ObjectKey{ + Namespace: input.Machine.Namespace, + Name: input.Machine.Name, + }, machine); err != nil { + return false, err + } + + mAnnotations := machine.GetAnnotations() + if mAnnotations == nil { + return false, nil + } + + status, ok := mAnnotations[bootstrapv1.CertificatesRefreshStatusAnnotation] + if !ok { + return false, nil + } + + if status == bootstrapv1.CertificatesRefreshFailedStatus { + return false, fmt.Errorf("certificates refresh failed for machine %s", machine.Name) + } + + return status == bootstrapv1.CertificatesRefreshDoneStatus, nil + }, input.WaitForRefreshIntervals...).Should(BeTrue(), "Certificates refresh failed for %s", input.Machine.Name) +} + +type ApplyCertificateRefreshForControlPlaneInput struct { + Lister framework.Lister + Getter framework.Getter + ClusterProxy framework.ClusterProxy + Cluster *clusterv1.Cluster + TTL string + WaitForRefreshIntervals []interface{} +} + +func ApplyCertificateRefreshForControlPlane(ctx context.Context, input ApplyCertificateRefreshForControlPlaneInput) { + Expect(ctx).NotTo(BeNil()) + Expect(input.ClusterProxy).ToNot(BeNil()) + Expect(input.Cluster).ToNot(BeNil()) + Expect(input.TTL).ToNot(BeEmpty()) + + By("Looking up control plane machines") + machineList := &clusterv1.MachineList{} + Eventually(func() error { + return input.Lister.List(ctx, machineList, + client.InNamespace(input.Cluster.Namespace), + client.MatchingLabels{ + clusterv1.ClusterNameLabel: input.Cluster.Name, + clusterv1.MachineControlPlaneLabel: "", + }) + }, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), + "Failed to list control plane machines for cluster %q", input.Cluster.Name) + + for i 
:= range machineList.Items { + machine := &machineList.Items[i] + By(fmt.Sprintf("Refreshing certificates for control plane machine: %s", machine.Name)) + ApplyCertificateRefreshAndWait(ctx, ApplyCertificateRefreshAndWaitInput{ + Getter: input.Getter, + Machine: machine, + ClusterProxy: input.ClusterProxy, + TTL: input.TTL, + WaitForRefreshIntervals: input.WaitForRefreshIntervals, + }) + } +} + +type ApplyCertificateRefreshForWorkerInput struct { + Lister framework.Lister + Getter framework.Getter + ClusterProxy framework.ClusterProxy + Cluster *clusterv1.Cluster + MachineDeployments []*clusterv1.MachineDeployment + TTL string + WaitForRefreshIntervals []interface{} +} + +func ApplyCertificateRefreshForWorker(ctx context.Context, input ApplyCertificateRefreshForWorkerInput) { + Expect(ctx).NotTo(BeNil()) + Expect(input.ClusterProxy).ToNot(BeNil()) + Expect(input.Cluster).ToNot(BeNil()) + Expect(input.MachineDeployments).ToNot(BeNil()) + Expect(input.TTL).ToNot(BeEmpty()) + + for _, md := range input.MachineDeployments { + By(fmt.Sprintf("Refreshing certificates for machines in deployment %s", md.Name)) + + inClustersNamespaceListOption := client.InNamespace(input.Cluster.Namespace) + matchClusterListOption := client.MatchingLabels{ + clusterv1.ClusterNameLabel: input.Cluster.Name, + clusterv1.MachineDeploymentNameLabel: md.Name, + } + + machineList := &clusterv1.MachineList{} + Eventually(func() error { + return input.Lister.List(ctx, machineList, inClustersNamespaceListOption, matchClusterListOption) + }, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Couldn't list machines for deployment %q in the cluster %q", md.Name, input.Cluster.Name) + + for i := range machineList.Items { + machine := &machineList.Items[i] + By(fmt.Sprintf("Refreshing certificates for worker machine: %s", machine.Name)) + ApplyCertificateRefreshAndWait(ctx, ApplyCertificateRefreshAndWaitInput{ + Getter: input.Getter, + Machine: machine, + ClusterProxy: input.ClusterProxy, + TTL: input.TTL, + WaitForRefreshIntervals: input.WaitForRefreshIntervals, + }) + } + } +} + type ApplyInPlaceUpgradeAndWaitInput struct { Getter framework.Getter Obj client.Object diff --git a/test/e2e/refresh_certs_test.go b/test/e2e/refresh_certs_test.go new file mode 100644 index 00000000..d28160e3 --- /dev/null +++ b/test/e2e/refresh_certs_test.go @@ -0,0 +1,139 @@ +//go:build e2e +// +build e2e + +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "context" + "fmt" + "path/filepath" + "time" + + bootstrapv1 "github.com/canonical/cluster-api-k8s/bootstrap/api/v1beta2" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + "k8s.io/utils/ptr" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/test/framework/clusterctl" + "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var _ = Describe("Certificate Refresh", func() { + var ( + ctx = context.TODO() + specName = "workload-cluster-certificate-refresh" + namespace *corev1.Namespace + cancelWatches context.CancelFunc + result *ApplyClusterTemplateAndWaitResult + clusterName string + clusterctlLogFolder string + infrastructureProvider string + ) + + BeforeEach(func() { + Expect(e2eConfig.Variables).To(HaveKey(KubernetesVersion)) + + clusterName = fmt.Sprintf("capick8s-certificate-refresh-%s", util.RandomString(6)) + infrastructureProvider = clusterctl.DefaultInfrastructureProvider + + // Setup a Namespace where to host objects for this spec and create a watcher for the namespace events. + namespace, cancelWatches = setupSpecNamespace(ctx, specName, bootstrapClusterProxy, artifactFolder) + result = new(ApplyClusterTemplateAndWaitResult) + clusterctlLogFolder = filepath.Join(artifactFolder, "clusters", bootstrapClusterProxy.GetName()) + }) + + AfterEach(func() { + cleanInput := cleanupInput{ + SpecName: specName, + Cluster: result.Cluster, + ClusterProxy: bootstrapClusterProxy, + Namespace: namespace, + CancelWatches: cancelWatches, + IntervalsGetter: e2eConfig.GetIntervals, + SkipCleanup: skipCleanup, + ArtifactFolder: artifactFolder, + } + + dumpSpecResourcesAndCleanup(ctx, cleanInput) + }) + + Context("Performing certificate refresh", func() { + It("Should successfully refresh certificates for a cluster [PR-Blocking]", func() { + By("Creating a workload cluster with a single control plane and a single worker node") + ApplyClusterTemplateAndWait(ctx, ApplyClusterTemplateAndWaitInput{ + ClusterProxy: bootstrapClusterProxy, + ConfigCluster: clusterctl.ConfigClusterInput{ + LogFolder: clusterctlLogFolder, + ClusterctlConfigPath: clusterctlConfigPath, + KubeconfigPath: bootstrapClusterProxy.GetKubeconfigPath(), + InfrastructureProvider: infrastructureProvider, + Namespace: namespace.Name, + ClusterName: clusterName, + KubernetesVersion: e2eConfig.GetVariable(KubernetesVersion), + ControlPlaneMachineCount: ptr.To(int64(1)), + WorkerMachineCount: ptr.To(int64(1)), + }, + WaitForClusterIntervals: e2eConfig.GetIntervals(specName, "wait-cluster"), + WaitForControlPlaneIntervals: e2eConfig.GetIntervals(specName, "wait-control-plane"), + WaitForMachineDeployments: e2eConfig.GetIntervals(specName, "wait-worker-nodes"), + }, result) + + bootstrapProxyClient := bootstrapClusterProxy.GetClient() + + By("Refreshing certificates for the control plane nodes") + ApplyCertificateRefreshForControlPlane(ctx, ApplyCertificateRefreshForControlPlaneInput{ + Lister: bootstrapProxyClient, + Getter: bootstrapProxyClient, + ClusterProxy: bootstrapClusterProxy, + Cluster: result.Cluster, + TTL: "1y", + WaitForRefreshIntervals: e2eConfig.GetIntervals(specName, "wait-machine-refresh"), + }) + + By("Refreshing certificates for the worker nodes") + ApplyCertificateRefreshForWorker(ctx, ApplyCertificateRefreshForWorkerInput{ + Lister: bootstrapProxyClient, + Getter: bootstrapProxyClient, + ClusterProxy: bootstrapClusterProxy, + Cluster: result.Cluster, + MachineDeployments: result.MachineDeployments, + TTL: "1y", + WaitForRefreshIntervals: e2eConfig.GetIntervals(specName, "wait-machine-refresh"), + }) + + By("Verifying certificates expiry dates are updated") + machineList := 
&clusterv1.MachineList{} + Expect(bootstrapProxyClient.List(ctx, machineList, + client.InNamespace(result.Cluster.Namespace), + client.MatchingLabels{clusterv1.ClusterNameLabel: result.Cluster.Name}, + )).To(Succeed()) + + for _, machine := range machineList.Items { + mAnnotations := machine.GetAnnotations() + Expect(mAnnotations).To(HaveKey(bootstrapv1.MachineCertificatesExpiryDateAnnotation)) + Expect(mAnnotations[bootstrapv1.CertificatesRefreshStatusAnnotation]).To(Equal(bootstrapv1.CertificatesRefreshDoneStatus)) + + _, err := time.Parse(time.RFC3339, mAnnotations[bootstrapv1.MachineCertificatesExpiryDateAnnotation]) + Expect(err).NotTo(HaveOccurred()) + } + }) + }) +})
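
Usage note (not part of the change above): the refresh is driven entirely through Machine annotations. A caller sets `v1beta2.k8sd.io/refresh-certificates` to a TTL such as `"1y"`, the controller reports progress through `v1beta2.k8sd.io/refresh-certificates-status` (`in-progress`, `done`, `failed`), and records the new expiry under `MachineCertificatesExpiryDateAnnotation` in RFC3339. Below is a minimal sketch of driving this from a controller-runtime client, along the lines of the `ApplyCertificateRefreshAndWait` e2e helper; the function name, the merge-patch approach, and the 10-second poll interval are illustrative, not part of the PR.

```go
package example

import (
	"context"
	"fmt"
	"time"

	bootstrapv1 "github.com/canonical/cluster-api-k8s/bootstrap/api/v1beta2"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// RequestCertificateRefresh annotates a Machine with a TTL (e.g. "1y") and
// polls the status annotation until the controller reports "done" or "failed".
func RequestCertificateRefresh(ctx context.Context, c client.Client, key client.ObjectKey, ttl string) error {
	machine := &clusterv1.Machine{}
	if err := c.Get(ctx, key, machine); err != nil {
		return err
	}

	// Record the base object before mutating it, then apply a merge patch.
	base := client.MergeFrom(machine.DeepCopy())
	annotations := machine.GetAnnotations()
	if annotations == nil {
		annotations = map[string]string{}
	}
	annotations[bootstrapv1.CertificatesRefreshAnnotation] = ttl
	machine.SetAnnotations(annotations)
	if err := c.Patch(ctx, machine, base); err != nil {
		return err
	}

	// Poll the status annotation; the 10s period is illustrative.
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			if err := c.Get(ctx, key, machine); err != nil {
				return err
			}
			switch machine.GetAnnotations()[bootstrapv1.CertificatesRefreshStatusAnnotation] {
			case bootstrapv1.CertificatesRefreshDoneStatus:
				return nil
			case bootstrapv1.CertificatesRefreshFailedStatus:
				return fmt.Errorf("certificate refresh failed for machine %s", key.Name)
			}
		}
	}
}
```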
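
The doc comment on `RefreshWorkerCertificates` explains why the worker flow needs two concurrent calls: `1.0/x/capi/refresh-certs/run` blocks on the worker node until its CSRs are approved, and the approval itself is a separate `1.0/x/capi/refresh-certs/approve` call against a control plane node. Here is that errgroup pattern distilled into a standalone sketch; `runRefresh` and `approveCSRs` are illustrative stand-ins for `refreshCertificatesRun` and `ApproveCertificates`.

```go
package example

import (
	"context"
	"fmt"

	"golang.org/x/sync/errgroup"
)

// refreshWorker distils the ordering constraint described in the
// RefreshWorkerCertificates doc comment: the "run" call blocks on the worker
// until its CSRs are approved, so the "approve" call against a control plane
// node must be in flight at the same time.
func refreshWorker(
	ctx context.Context,
	seed int,
	runRefresh func(ctx context.Context, seed int) (int, error), // blocks until approval; returns expiry in unix seconds
	approveCSRs func(ctx context.Context, seed int) error, // approves the CSRs from a control plane node
) (int, error) {
	var expiry int

	eg, ctx := errgroup.WithContext(ctx)
	eg.Go(func() error {
		var err error
		expiry, err = runRefresh(ctx, seed)
		if err != nil {
			return fmt.Errorf("failed to run refresh certificates: %w", err)
		}
		return nil
	})
	eg.Go(func() error {
		if err := approveCSRs(ctx, seed); err != nil {
			return fmt.Errorf("failed to approve certificates: %w", err)
		}
		return nil
	})

	// Wait returns the first non-nil error; the derived ctx is cancelled at
	// that point, giving the other call a chance to abort instead of hanging.
	if err := eg.Wait(); err != nil {
		return 0, err
	}
	return expiry, nil
}
```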
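
Because the controller now keeps the expiry-date annotation current (RFC3339, as asserted by the new e2e test), a caller can decide when to request the next refresh from that value alone. A tiny sketch of such a check; the 30-day window shown in the usage comment is an arbitrary example, not something the PR prescribes.

```go
package example

import "time"

// needsRefresh reports whether the certificates recorded for a machine expire
// within the given window. expiryRFC3339 is the value the controller stores
// under bootstrapv1.MachineCertificatesExpiryDateAnnotation, e.g.:
//
//	needsRefresh(val, 30*24*time.Hour, time.Now())
func needsRefresh(expiryRFC3339 string, window time.Duration, now time.Time) (bool, error) {
	expiry, err := time.Parse(time.RFC3339, expiryRFC3339)
	if err != nil {
		return false, err
	}
	return expiry.Sub(now) < window, nil
}
```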