
Merge pull request #124 from rishabh-11/refactor
Prober should not consider `Failed` or `Terminating` Machines, `Unhealthy` Nodes.
unmarshall authored Nov 26, 2024
2 parents 2f9de7f + b5fca63 commit e54d69f
Showing 58 changed files with 2,205 additions and 3,431 deletions.
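Most of the filtering logic named in the commit title lives in the prober internals, which are outside the files excerpted below; this excerpt covers the wiring (pprof flag, scheme registration, cluster controller refactoring). Purely as an illustration of the stated intent, a filter in this spirit could look like the following sketch. The helper names, the use of only the Ready condition, and the exact MCM phase constants are assumptions, not the repository's actual implementation.

```go
package main

import (
	"fmt"

	machinev1alpha1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
	corev1 "k8s.io/api/core/v1"
)

// isMachineRelevant is a hypothetical helper (not the repository's code):
// machines in the Failed or Terminating phase are skipped when the prober
// counts workers, per the intent stated in the commit title.
func isMachineRelevant(m *machinev1alpha1.Machine) bool {
	phase := m.Status.CurrentStatus.Phase
	return phase != machinev1alpha1.MachineFailed && phase != machinev1alpha1.MachineTerminating
}

// isNodeHealthy is a hypothetical helper (not the repository's code): a node
// whose Ready condition is not True is treated as unhealthy and ignored.
func isNodeHealthy(n *corev1.Node) bool {
	for _, cond := range n.Status.Conditions {
		if cond.Type == corev1.NodeReady {
			return cond.Status == corev1.ConditionTrue
		}
	}
	return false
}

func main() {
	failed := &machinev1alpha1.Machine{
		Status: machinev1alpha1.MachineStatus{
			CurrentStatus: machinev1alpha1.CurrentStatus{Phase: machinev1alpha1.MachineFailed},
		},
	}
	notReady := &corev1.Node{
		Status: corev1.NodeStatus{
			Conditions: []corev1.NodeCondition{{Type: corev1.NodeReady, Status: corev1.ConditionFalse}},
		},
	}
	fmt.Println(isMachineRelevant(failed)) // false: Failed machines are not considered
	fmt.Println(isNodeHealthy(notReady))   // false: not-Ready nodes are not considered
}
```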
4 changes: 4 additions & 0 deletions cmd/command.go
@@ -18,6 +18,7 @@ const (
defaultConcurrentReconciles = 1
defaultMetricsBindAddress = ":9643"
defaultHealthBindAddress = ":9644"
defaultPprofBindAddress = ":8081"
defaultLeaseDuration = 15 * time.Second
defaultRenewDeadline = 10 * time.Second
defaultRetryPeriod = 2 * time.Second
@@ -57,6 +58,8 @@ type SharedOpts struct {
MetricsBindAddress string
// HealthBindAddress is the TCP address that the controller should bind to for serving health probes
HealthBindAddress string
// PprofBindAddress is the TCP address that the controller should bind to for serving profiling endpoint.
PprofBindAddress string
}

// LeaderElectionOpts defines the configuration of leader election
@@ -94,6 +97,7 @@ func SetSharedOpts(fs *flag.FlagSet, opts *SharedOpts) {
fs.Float64Var(&opts.KubeApiQps, "kube-api-qps", float64(rest.DefaultQPS), "Maximum QPS (queries per second) allowed from the client to the API server")
fs.StringVar(&opts.MetricsBindAddress, "metrics-bind-addr", defaultMetricsBindAddress, "The TCP address that the controller should bind to for serving prometheus metrics")
fs.StringVar(&opts.HealthBindAddress, "health-bind-addr", defaultHealthBindAddress, "The TCP address that the controller should bind to for serving health probes")
fs.StringVar(&opts.PprofBindAddress, "pprof-bind-addr", defaultPprofBindAddress, "The TCP address that the controller should bind to for serving profiling endpoint")
bindLeaderElectionFlags(fs, opts)
}

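For context on the new `--pprof-bind-addr` flag: controller-runtime serves the standard net/http/pprof handlers when `Options.PprofBindAddress` is set, which is how the value wired up above takes effect in the prober and weeder managers further down. A minimal standalone sketch, not taken from this repository:

```go
package main

import (
	"fmt"
	"os"

	ctrl "sigs.k8s.io/controller-runtime"
)

func main() {
	// With a non-empty PprofBindAddress, the manager serves the standard
	// net/http/pprof endpoints (e.g. /debug/pprof/heap, /debug/pprof/profile)
	// on that address alongside its other listeners.
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
		PprofBindAddress: ":8081", // matches defaultPprofBindAddress above
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "unable to create manager: %v\n", err)
		os.Exit(1)
	}
	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		fmt.Fprintf(os.Stderr, "manager exited: %v\n", err)
		os.Exit(1)
	}
}
```

Once the manager is running, profiles can be pulled with the usual tooling, for example `go tool pprof http://localhost:8081/debug/pprof/heap`.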
3 changes: 3 additions & 0 deletions cmd/probercmd.go
@@ -7,6 +7,7 @@ package cmd
import (
"flag"
"fmt"
machinev1alpha1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"

"github.com/gardener/dependency-watchdog/controllers/cluster"
@@ -76,6 +77,7 @@ func init() {
localSchemeBuilder := runtime.NewSchemeBuilder(
clientgoscheme.AddToScheme,
extensionsv1alpha1.AddToScheme,
machinev1alpha1.AddToScheme,
)
utilruntime.Must(localSchemeBuilder.AddToScheme(scheme))
}
@@ -107,6 +109,7 @@ func startClusterControllerMgr(logger logr.Logger) (manager.Manager, error) {
LeaderElectionResourceLock: resourcelock.LeasesResourceLock,
LeaderElectionID: proberLeaderElectionID,
Logger: proberLogger,
PprofBindAddress: proberOpts.SharedOpts.PprofBindAddress,
})
if err != nil {
return nil, fmt.Errorf("failed to start the prober controller manager %w", err)
5 changes: 3 additions & 2 deletions cmd/weedercmd.go
@@ -76,8 +76,8 @@ func startEndpointsControllerMgr(logger logr.Logger) (manager.Manager, error) {
}

restConf := ctrl.GetConfigOrDie()
restConf.QPS = float32(proberOpts.KubeApiQps)
restConf.Burst = proberOpts.KubeApiBurst
restConf.QPS = float32(weederOpts.KubeApiQps)
restConf.Burst = weederOpts.KubeApiBurst
mgr, err := ctrl.NewManager(restConf, ctrl.Options{
Scheme: scheme,
Metrics: server.Options{BindAddress: weederOpts.SharedOpts.MetricsBindAddress},
@@ -90,6 +90,7 @@ func startEndpointsControllerMgr(logger logr.Logger) (manager.Manager, error) {
LeaderElectionResourceLock: resourcelock.LeasesResourceLock,
LeaderElectionID: weederLeaderElectionID,
Logger: weederLogger,
PprofBindAddress: weederOpts.SharedOpts.PprofBindAddress,
})
if err != nil {
return nil, fmt.Errorf("failed to start the weeder controller manager %w", err)
147 changes: 84 additions & 63 deletions controllers/cluster/cluster_controller.go
@@ -8,8 +8,11 @@ import (
"context"
"fmt"

"github.com/gardener/dependency-watchdog/internal/util"

papi "github.com/gardener/dependency-watchdog/api/prober"
"github.com/gardener/dependency-watchdog/internal/prober/scaler"
shootclient "github.com/gardener/dependency-watchdog/internal/prober/shoot"
"github.com/go-logr/logr"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/handler"
@@ -34,7 +37,7 @@ const controllerName = "cluster"

// Reconciler reconciles a Cluster object
type Reconciler struct {
client.Client
Client client.Client
// Scheme is the controller-runtime scheme used to initialize the controller manager and to validate the probe config
Scheme *runtime.Scheme
// ProberMgr is interface to manage lifecycle of probers.
@@ -73,51 +76,25 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
return ctrl.Result{}, fmt.Errorf("error extracting shoot from cluster: %w", err)
}

// If shoot is marked for deletion then any existing probes will be unregistered
if shoot.DeletionTimestamp != nil {
if r.ProberMgr.Unregister(req.Name) {
log.Info("Cluster has been marked for deletion, existing prober has been removed")
}
return ctrl.Result{}, nil
}

// if hibernation is enabled then we will remove any existing prober. Any resource scaling that is required in case of hibernation will now be handled as part of worker reconciliation in extension controllers.
if v1beta1helper.HibernationIsEnabled(shoot) {
if r.ProberMgr.Unregister(req.Name) {
log.Info("Cluster hibernation is enabled, existing prober has been removed")
}
return ctrl.Result{}, nil
}

// if control plane migration has started for a shoot, then any existing probe should be removed as it is no longer needed.
if shoot.Status.LastOperation != nil && shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeMigrate {
if r.ProberMgr.Unregister(req.Name) {
log.Info("Cluster migration is enabled, existing prober has been removed")
}
return ctrl.Result{}, nil
}
shootControlNamespace := cluster.Name

// if a shoot is created without any workers (this can only happen for control-plane-as-a-service use case), then if there is a probe registered then
// unregister the probe and return early. If there is no existing probe registered then return early.
if len(shoot.Spec.Provider.Workers) == 0 {
if r.ProberMgr.Unregister(req.Name) {
log.Info("Cluster does not have any workers. An existing probe has been removed")
} else {
log.Info("Cluster does not have any workers. No probe will be created")
if shouldStopProber(shoot, log) {
if r.ProberMgr.Unregister(shootControlNamespace) {
log.Info("Existing prober has been removed")
}
return ctrl.Result{}, nil
}

if canStartProber(shoot) {
r.startProber(ctx, shoot, log, req.Name)
if canStartProber(shoot, log) {
r.startProber(ctx, shootControlNamespace, shoot, log)
}
return ctrl.Result{}, nil
}

// getCluster will retrieve the cluster object given the namespace and name Not found is not treated as an error and is handled differently in the caller
// getCluster will retrieve the cluster object given the namespace and name. Cluster not found is not treated as an error and is handled differently in the caller
func (r *Reconciler) getCluster(ctx context.Context, namespace string, name string) (cluster *extensionsv1alpha1.Cluster, notFound bool, err error) {
cluster = &extensionsv1alpha1.Cluster{}
if err := r.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, cluster); err != nil {
if err := r.Client.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, cluster); err != nil {
if errors.IsNotFound(err) {
return nil, true, nil
}
@@ -126,40 +103,32 @@ func (r *Reconciler) getCluster(ctx context.Context, namespace string, name stri
return cluster, false, nil
}

// canStartProber checks if a probe can be registered and started.
// shoot.Status.LastOperation.Type provides an insight into the current state of the cluster. It is important to identify the following cases:
// 1. Cluster has been created successfully => This will ensure that the current state of shoot Kube API Server can be acted upon to decide on scaling operations. If the cluster
// is in the process of creation, then it is possible that the control plane components have not completely come up. If the probe starts prematurely then it could start to scale down resources.
// 2. During control plane migration, the value of shoot.Status.LastOperation.Type will be "Restore" => During this time it is imperative that probe is started early to ensure
// that MCM is scaled down in case connectivity to the Kube API server of the shoot on the destination seed is broken, else it will try and recreate machines.
// If the shoot.Status.LastOperation.Type == "Reconcile" then it is assumed that the cluster has been successfully created at-least once, and it is safe to start the probe.
func canStartProber(shoot *v1beta1.Shoot) bool {
if shoot.Status.IsHibernated || shoot.Status.LastOperation == nil {
return false
}
if shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeReconcile ||
(shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeRestore && shoot.Status.LastOperation.State == v1beta1.LastOperationStateSucceeded) ||
(shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeCreate && shoot.Status.LastOperation.State == v1beta1.LastOperationStateSucceeded) {
return true
}
return false
}

// startProber sets up a new probe against a given key which uniquely identifies the probe.
// Typically, the key in case of a shoot cluster is the shoot namespace
func (r *Reconciler) startProber(ctx context.Context, shoot *v1beta1.Shoot, logger logr.Logger, key string) {
_, ok := r.ProberMgr.GetProber(key)
func (r *Reconciler) startProber(ctx context.Context, shootControlNs string, shoot *v1beta1.Shoot, logger logr.Logger) {
workerNodeConditions := util.GetEffectiveNodeConditionsForWorkers(shoot)
existingProber, ok := r.ProberMgr.GetProber(shootControlNs)
if !ok {
probeConfig := r.getEffectiveProbeConfig(shoot, logger)
deploymentScaler := scaler.NewScaler(key, probeConfig.DependentResourceInfos, r.Client, r.ScaleGetter, logger)
shootClientCreator := prober.NewShootClientCreator(r.Client)
p := prober.NewProber(ctx, key, probeConfig, deploymentScaler, shootClientCreator, logger)
r.ProberMgr.Register(*p)
logger.Info("Starting a new prober")
go p.Run()
r.createAndRunProber(ctx, shootControlNs, shoot, workerNodeConditions, logger)
} else {
if existingProber.AreWorkerNodeConditionsStale(workerNodeConditions) {
logger.Info("Restarting prober due to change in node conditions for workers")
_ = r.ProberMgr.Unregister(shootControlNs)
r.createAndRunProber(ctx, shootControlNs, shoot, workerNodeConditions, logger)
}
}
}

func (r *Reconciler) createAndRunProber(ctx context.Context, shootNamespace string, shoot *v1beta1.Shoot, workerNodeConditions map[string][]string, logger logr.Logger) {
probeConfig := r.getEffectiveProbeConfig(shoot, logger)
deploymentScaler := scaler.NewScaler(shootNamespace, probeConfig.DependentResourceInfos, r.Client, r.ScaleGetter, logger)
shootClientCreator := shootclient.NewClientCreator(shootNamespace, probeConfig.KubeConfigSecretName, r.Client)
p := prober.NewProber(ctx, r.Client, shootNamespace, probeConfig, workerNodeConditions, deploymentScaler, shootClientCreator, logger)
r.ProberMgr.Register(*p)
logger.Info("Starting a new prober")
go p.Run()
}

// SetupWithManager sets up the controller with the Manager.
func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
c, err := controller.New(
@@ -186,3 +155,55 @@ func (r *Reconciler) getEffectiveProbeConfig(shoot *v1beta1.Shoot, logger logr.L
}
return &probeConfig
}

func shouldStopProber(shoot *v1beta1.Shoot, logger logr.Logger) bool {
// If shoot is marked for deletion then any existing probes will be unregistered
if shoot.DeletionTimestamp != nil {
logger.Info("Cluster has been marked for deletion, existing prober if any will be removed")
return true
}

// if hibernation is enabled then we will remove any existing prober. Any resource scaling that is required in case of hibernation will now be handled as part of worker reconciliation in extension controllers.
if v1beta1helper.HibernationIsEnabled(shoot) {
logger.Info("Cluster hibernation is enabled, existing prober if any will be removed")
return true
}

// if control plane migration has started for a shoot, then any existing probe should be removed as it is no longer needed.
if shoot.Status.LastOperation != nil && shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeMigrate {
logger.Info("Cluster migration is enabled, existing prober if any will be removed")
return true
}

// if a shoot is created without any workers (this can only happen for control-plane-as-a-service use case), then any existing probe should be removed as it is no longer needed.
if len(shoot.Spec.Provider.Workers) == 0 {
logger.Info("Cluster does not have any workers, existing prober if any will be removed")
return true
}
return false
}

// canStartProber checks if a probe can be registered and started.
// shoot.Status.LastOperation.Type provides an insight into the current state of the cluster. It is important to identify the following cases:
// 1. Cluster has been created successfully => This will ensure that the current state of shoot Kube API Server can be acted upon to decide on scaling operations. If the cluster
// is in the process of creation, then it is possible that the control plane components have not completely come up. If the probe starts prematurely then it could start to scale down resources.
// 2. During control plane migration, the value of shoot.Status.LastOperation.Type will be "Restore" => During this time it is imperative that probe is started early to ensure
// that MCM is scaled down in case connectivity to the Kube API server of the shoot on the destination seed is broken, else it will try and recreate machines.
// If the shoot.Status.LastOperation.Type == "Reconcile" then it is assumed that the cluster has been successfully created at-least once, and it is safe to start the probe.
func canStartProber(shoot *v1beta1.Shoot, logger logr.Logger) bool {
if !v1beta1helper.HibernationIsEnabled(shoot) && shoot.Status.IsHibernated {
logger.Info("Cannot start probe. Cluster is waking up from hibernation")
return false
}
if shoot.Status.LastOperation == nil {
logger.Info("Cannot start probe. Cluster is creation phase")
return false
}
if shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeReconcile ||
(shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeRestore && shoot.Status.LastOperation.State == v1beta1.LastOperationStateSucceeded) ||
(shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeCreate && shoot.Status.LastOperation.State == v1beta1.LastOperationStateSucceeded) {
return true
}
logger.Info("Cannot start probe. Cluster is either in migration/restore or in creation phase")
return false
}
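
The restart path added in startProber above hinges on comparing the worker-to-node-conditions mapping that the running prober was created with against the one freshly derived from the shoot spec via util.GetEffectiveNodeConditionsForWorkers. The comparison itself lives in the prober's AreWorkerNodeConditionsStale method, which is not part of this file; the sketch below only illustrates the idea, and the helper name and example condition values are hypothetical.

```go
package main

import (
	"fmt"
	"reflect"
)

// workerNodeConditionsStale is a hypothetical stand-in for the prober's
// AreWorkerNodeConditionsStale check: it reports whether the per-worker-pool
// node conditions derived from the current shoot spec differ from the ones the
// running prober was started with. When they differ, the controller
// unregisters the existing prober and starts a new one with the fresh mapping.
func workerNodeConditionsStale(existing, current map[string][]string) bool {
	return !reflect.DeepEqual(existing, current)
}

func main() {
	existing := map[string][]string{"worker-a": {"KernelDeadlock", "ReadonlyFilesystem", "DiskPressure"}}
	current := map[string][]string{"worker-a": {"KernelDeadlock", "ReadonlyFilesystem"}}
	fmt.Println(workerNodeConditionsStale(existing, current)) // true: the prober would be restarted
}
```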