
Merge pull request #124 from rishabh-11/refactor
Prober should not consider `Failed` or `Terminating` Machines, `Unhealthy` Nodes.
unmarshall authored Nov 26, 2024
2 parents 2f9de7f + b5fca63 commit e54d69f
Showing 58 changed files with 2,205 additions and 3,431 deletions.
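Most of the filtering logic named in the commit title lives in the prober internals, which are outside the files excerpted below; this excerpt covers the wiring (pprof flag, scheme registration, cluster controller refactoring). Purely as an illustration of the stated intent, a filter in this spirit could look like the following sketch. The helper names, the use of only the Ready condition, and the exact MCM phase constants are assumptions, not the repository's actual implementation.

```go
package main

import (
	"fmt"

	machinev1alpha1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
	corev1 "k8s.io/api/core/v1"
)

// isMachineRelevant is a hypothetical helper (not the repository's code):
// machines in the Failed or Terminating phase are skipped when the prober
// counts workers, per the intent stated in the commit title.
func isMachineRelevant(m *machinev1alpha1.Machine) bool {
	phase := m.Status.CurrentStatus.Phase
	return phase != machinev1alpha1.MachineFailed && phase != machinev1alpha1.MachineTerminating
}

// isNodeHealthy is a hypothetical helper (not the repository's code): a node
// whose Ready condition is not True is treated as unhealthy and ignored.
func isNodeHealthy(n *corev1.Node) bool {
	for _, cond := range n.Status.Conditions {
		if cond.Type == corev1.NodeReady {
			return cond.Status == corev1.ConditionTrue
		}
	}
	return false
}

func main() {
	failed := &machinev1alpha1.Machine{
		Status: machinev1alpha1.MachineStatus{
			CurrentStatus: machinev1alpha1.CurrentStatus{Phase: machinev1alpha1.MachineFailed},
		},
	}
	notReady := &corev1.Node{
		Status: corev1.NodeStatus{
			Conditions: []corev1.NodeCondition{{Type: corev1.NodeReady, Status: corev1.ConditionFalse}},
		},
	}
	fmt.Println(isMachineRelevant(failed)) // false: Failed machines are not considered
	fmt.Println(isNodeHealthy(notReady))   // false: not-Ready nodes are not considered
}
```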
4 changes: 4 additions & 0 deletions cmd/command.go
@@ -18,6 +18,7 @@ const (
defaultConcurrentReconciles = 1
defaultMetricsBindAddress = ":9643"
defaultHealthBindAddress = ":9644"
defaultPprofBindAddress = ":8081"
defaultLeaseDuration = 15 * time.Second
defaultRenewDeadline = 10 * time.Second
defaultRetryPeriod = 2 * time.Second
@@ -57,6 +58,8 @@ type SharedOpts struct {
MetricsBindAddress string
// HealthBindAddress is the TCP address that the controller should bind to for serving health probes
HealthBindAddress string
// PprofBindAddress is the TCP address that the controller should bind to for serving profiling endpoint.
PprofBindAddress string
}

// LeaderElectionOpts defines the configuration of leader election
@@ -94,6 +97,7 @@ func SetSharedOpts(fs *flag.FlagSet, opts *SharedOpts) {
fs.Float64Var(&opts.KubeApiQps, "kube-api-qps", float64(rest.DefaultQPS), "Maximum QPS (queries per second) allowed from the client to the API server")
fs.StringVar(&opts.MetricsBindAddress, "metrics-bind-addr", defaultMetricsBindAddress, "The TCP address that the controller should bind to for serving prometheus metrics")
fs.StringVar(&opts.HealthBindAddress, "health-bind-addr", defaultHealthBindAddress, "The TCP address that the controller should bind to for serving health probes")
fs.StringVar(&opts.PprofBindAddress, "pprof-bind-addr", defaultPprofBindAddress, "The TCP address that the controller should bind to for serving profiling endpoint")
bindLeaderElectionFlags(fs, opts)
}

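For context on the new `--pprof-bind-addr` flag: controller-runtime serves the standard net/http/pprof handlers when `Options.PprofBindAddress` is set, which is how the value wired up above takes effect in the prober and weeder managers further down. A minimal standalone sketch, not taken from this repository:

```go
package main

import (
	"fmt"
	"os"

	ctrl "sigs.k8s.io/controller-runtime"
)

func main() {
	// With a non-empty PprofBindAddress, the manager serves the standard
	// net/http/pprof endpoints (e.g. /debug/pprof/heap, /debug/pprof/profile)
	// on that address alongside its other listeners.
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
		PprofBindAddress: ":8081", // matches defaultPprofBindAddress above
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "unable to create manager: %v\n", err)
		os.Exit(1)
	}
	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		fmt.Fprintf(os.Stderr, "manager exited: %v\n", err)
		os.Exit(1)
	}
}
```

Once the manager is running, profiles can be pulled with the usual tooling, for example `go tool pprof http://localhost:8081/debug/pprof/heap`.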
3 changes: 3 additions & 0 deletions cmd/probercmd.go
@@ -7,6 +7,7 @@ package cmd
import (
"flag"
"fmt"
machinev1alpha1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"

"github.com/gardener/dependency-watchdog/controllers/cluster"
@@ -76,6 +77,7 @@ func init() {
localSchemeBuilder := runtime.NewSchemeBuilder(
clientgoscheme.AddToScheme,
extensionsv1alpha1.AddToScheme,
machinev1alpha1.AddToScheme,
)
utilruntime.Must(localSchemeBuilder.AddToScheme(scheme))
}
@@ -107,6 +109,7 @@ func startClusterControllerMgr(logger logr.Logger) (manager.Manager, error) {
LeaderElectionResourceLock: resourcelock.LeasesResourceLock,
LeaderElectionID: proberLeaderElectionID,
Logger: proberLogger,
PprofBindAddress: proberOpts.SharedOpts.PprofBindAddress,
})
if err != nil {
return nil, fmt.Errorf("failed to start the prober controller manager %w", err)
5 changes: 3 additions & 2 deletions cmd/weedercmd.go
@@ -76,8 +76,8 @@ func startEndpointsControllerMgr(logger logr.Logger) (manager.Manager, error) {
}

restConf := ctrl.GetConfigOrDie()
restConf.QPS = float32(proberOpts.KubeApiQps)
restConf.Burst = proberOpts.KubeApiBurst
restConf.QPS = float32(weederOpts.KubeApiQps)
restConf.Burst = weederOpts.KubeApiBurst
mgr, err := ctrl.NewManager(restConf, ctrl.Options{
Scheme: scheme,
Metrics: server.Options{BindAddress: weederOpts.SharedOpts.MetricsBindAddress},
@@ -90,6 +90,7 @@ func startEndpointsControllerMgr(logger logr.Logger) (manager.Manager, error) {
LeaderElectionResourceLock: resourcelock.LeasesResourceLock,
LeaderElectionID: weederLeaderElectionID,
Logger: weederLogger,
PprofBindAddress: weederOpts.SharedOpts.PprofBindAddress,
})
if err != nil {
return nil, fmt.Errorf("failed to start the weeder controller manager %w", err)
147 changes: 84 additions & 63 deletions controllers/cluster/cluster_controller.go
@@ -8,8 +8,11 @@ import (
"context"
"fmt"

"github.com/gardener/dependency-watchdog/internal/util"

papi "github.com/gardener/dependency-watchdog/api/prober"
"github.com/gardener/dependency-watchdog/internal/prober/scaler"
shootclient "github.com/gardener/dependency-watchdog/internal/prober/shoot"
"github.com/go-logr/logr"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/handler"
@@ -34,7 +37,7 @@ const controllerName = "cluster"

// Reconciler reconciles a Cluster object
type Reconciler struct {
client.Client
Client client.Client
// Scheme is the controller-runtime scheme used to initialize the controller manager and to validate the probe config
Scheme *runtime.Scheme
// ProberMgr is interface to manage lifecycle of probers.
@@ -73,51 +76,25 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
return ctrl.Result{}, fmt.Errorf("error extracting shoot from cluster: %w", err)
}

// If shoot is marked for deletion then any existing probes will be unregistered
if shoot.DeletionTimestamp != nil {
if r.ProberMgr.Unregister(req.Name) {
log.Info("Cluster has been marked for deletion, existing prober has been removed")
}
return ctrl.Result{}, nil
}

// if hibernation is enabled then we will remove any existing prober. Any resource scaling that is required in case of hibernation will now be handled as part of worker reconciliation in extension controllers.
if v1beta1helper.HibernationIsEnabled(shoot) {
if r.ProberMgr.Unregister(req.Name) {
log.Info("Cluster hibernation is enabled, existing prober has been removed")
}
return ctrl.Result{}, nil
}

// if control plane migration has started for a shoot, then any existing probe should be removed as it is no longer needed.
if shoot.Status.LastOperation != nil && shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeMigrate {
if r.ProberMgr.Unregister(req.Name) {
log.Info("Cluster migration is enabled, existing prober has been removed")
}
return ctrl.Result{}, nil
}
shootControlNamespace := cluster.Name

// if a shoot is created without any workers (this can only happen for control-plane-as-a-service use case), then if there is a probe registered then
// unregister the probe and return early. If there is no existing probe registered then return early.
if len(shoot.Spec.Provider.Workers) == 0 {
if r.ProberMgr.Unregister(req.Name) {
log.Info("Cluster does not have any workers. An existing probe has been removed")
} else {
log.Info("Cluster does not have any workers. No probe will be created")
if shouldStopProber(shoot, log) {
if r.ProberMgr.Unregister(shootControlNamespace) {
log.Info("Existing prober has been removed")
}
return ctrl.Result{}, nil
}

if canStartProber(shoot) {
r.startProber(ctx, shoot, log, req.Name)
if canStartProber(shoot, log) {
r.startProber(ctx, shootControlNamespace, shoot, log)
}
return ctrl.Result{}, nil
}

// getCluster will retrieve the cluster object given the namespace and name Not found is not treated as an error and is handled differently in the caller
// getCluster will retrieve the cluster object given the namespace and name. Cluster not found is not treated as an error and is handled differently in the caller
func (r *Reconciler) getCluster(ctx context.Context, namespace string, name string) (cluster *extensionsv1alpha1.Cluster, notFound bool, err error) {
cluster = &extensionsv1alpha1.Cluster{}
if err := r.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, cluster); err != nil {
if err := r.Client.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, cluster); err != nil {
if errors.IsNotFound(err) {
return nil, true, nil
}
@@ -126,40 +103,32 @@ func (r *Reconciler) getCluster(ctx context.Context, namespace string, name stri
return cluster, false, nil
}

// canStartProber checks if a probe can be registered and started.
// shoot.Status.LastOperation.Type provides an insight into the current state of the cluster. It is important to identify the following cases:
// 1. Cluster has been created successfully => This will ensure that the current state of shoot Kube API Server can be acted upon to decide on scaling operations. If the cluster
// is in the process of creation, then it is possible that the control plane components have not completely come up. If the probe starts prematurely then it could start to scale down resources.
// 2. During control plane migration, the value of shoot.Status.LastOperation.Type will be "Restore" => During this time it is imperative that probe is started early to ensure
// that MCM is scaled down in case connectivity to the Kube API server of the shoot on the destination seed is broken, else it will try and recreate machines.
// If the shoot.Status.LastOperation.Type == "Reconcile" then it is assumed that the cluster has been successfully created at-least once, and it is safe to start the probe.
func canStartProber(shoot *v1beta1.Shoot) bool {
if shoot.Status.IsHibernated || shoot.Status.LastOperation == nil {
return false
}
if shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeReconcile ||
(shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeRestore && shoot.Status.LastOperation.State == v1beta1.LastOperationStateSucceeded) ||
(shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeCreate && shoot.Status.LastOperation.State == v1beta1.LastOperationStateSucceeded) {
return true
}
return false
}

// startProber sets up a new probe against a given key which uniquely identifies the probe.
// Typically, the key in case of a shoot cluster is the shoot namespace
func (r *Reconciler) startProber(ctx context.Context, shoot *v1beta1.Shoot, logger logr.Logger, key string) {
_, ok := r.ProberMgr.GetProber(key)
func (r *Reconciler) startProber(ctx context.Context, shootControlNs string, shoot *v1beta1.Shoot, logger logr.Logger) {
workerNodeConditions := util.GetEffectiveNodeConditionsForWorkers(shoot)
existingProber, ok := r.ProberMgr.GetProber(shootControlNs)
if !ok {
probeConfig := r.getEffectiveProbeConfig(shoot, logger)
deploymentScaler := scaler.NewScaler(key, probeConfig.DependentResourceInfos, r.Client, r.ScaleGetter, logger)
shootClientCreator := prober.NewShootClientCreator(r.Client)
p := prober.NewProber(ctx, key, probeConfig, deploymentScaler, shootClientCreator, logger)
r.ProberMgr.Register(*p)
logger.Info("Starting a new prober")
go p.Run()
r.createAndRunProber(ctx, shootControlNs, shoot, workerNodeConditions, logger)
} else {
if existingProber.AreWorkerNodeConditionsStale(workerNodeConditions) {
logger.Info("Restarting prober due to change in node conditions for workers")
_ = r.ProberMgr.Unregister(shootControlNs)
r.createAndRunProber(ctx, shootControlNs, shoot, workerNodeConditions, logger)
}
}
}

func (r *Reconciler) createAndRunProber(ctx context.Context, shootNamespace string, shoot *v1beta1.Shoot, workerNodeConditions map[string][]string, logger logr.Logger) {
probeConfig := r.getEffectiveProbeConfig(shoot, logger)
deploymentScaler := scaler.NewScaler(shootNamespace, probeConfig.DependentResourceInfos, r.Client, r.ScaleGetter, logger)
shootClientCreator := shootclient.NewClientCreator(shootNamespace, probeConfig.KubeConfigSecretName, r.Client)
p := prober.NewProber(ctx, r.Client, shootNamespace, probeConfig, workerNodeConditions, deploymentScaler, shootClientCreator, logger)
r.ProberMgr.Register(*p)
logger.Info("Starting a new prober")
go p.Run()
}

// SetupWithManager sets up the controller with the Manager.
func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
c, err := controller.New(
@@ -186,3 +155,55 @@ func (r *Reconciler) getEffectiveProbeConfig(shoot *v1beta1.Shoot, logger logr.L
}
return &probeConfig
}

func shouldStopProber(shoot *v1beta1.Shoot, logger logr.Logger) bool {
// If shoot is marked for deletion then any existing probes will be unregistered
if shoot.DeletionTimestamp != nil {
logger.Info("Cluster has been marked for deletion, existing prober if any will be removed")
return true
}

// if hibernation is enabled then we will remove any existing prober. Any resource scaling that is required in case of hibernation will now be handled as part of worker reconciliation in extension controllers.
if v1beta1helper.HibernationIsEnabled(shoot) {
logger.Info("Cluster hibernation is enabled, existing prober if any will be removed")
return true
}

// if control plane migration has started for a shoot, then any existing probe should be removed as it is no longer needed.
if shoot.Status.LastOperation != nil && shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeMigrate {
logger.Info("Cluster migration is enabled, existing prober if any will be removed")
return true
}

// if a shoot is created without any workers (this can only happen for control-plane-as-a-service use case), then any existing probe should be removed as it is no longer needed.
if len(shoot.Spec.Provider.Workers) == 0 {
logger.Info("Cluster does not have any workers, existing prober if any will be removed")
return true
}
return false
}

// canStartProber checks if a probe can be registered and started.
// shoot.Status.LastOperation.Type provides an insight into the current state of the cluster. It is important to identify the following cases:
// 1. Cluster has been created successfully => This will ensure that the current state of shoot Kube API Server can be acted upon to decide on scaling operations. If the cluster
// is in the process of creation, then it is possible that the control plane components have not completely come up. If the probe starts prematurely then it could start to scale down resources.
// 2. During control plane migration, the value of shoot.Status.LastOperation.Type will be "Restore" => During this time it is imperative that probe is started early to ensure
// that MCM is scaled down in case connectivity to the Kube API server of the shoot on the destination seed is broken, else it will try and recreate machines.
// If the shoot.Status.LastOperation.Type == "Reconcile" then it is assumed that the cluster has been successfully created at-least once, and it is safe to start the probe.
func canStartProber(shoot *v1beta1.Shoot, logger logr.Logger) bool {
if !v1beta1helper.HibernationIsEnabled(shoot) && shoot.Status.IsHibernated {
logger.Info("Cannot start probe. Cluster is waking up from hibernation")
return false
}
if shoot.Status.LastOperation == nil {
logger.Info("Cannot start probe. Cluster is creation phase")
return false
}
if shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeReconcile ||
(shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeRestore && shoot.Status.LastOperation.State == v1beta1.LastOperationStateSucceeded) ||
(shoot.Status.LastOperation.Type == v1beta1.LastOperationTypeCreate && shoot.Status.LastOperation.State == v1beta1.LastOperationStateSucceeded) {
return true
}
logger.Info("Cannot start probe. Cluster is either in migration/restore or in creation phase")
return false
}
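
The restart path added in startProber above hinges on comparing the worker-to-node-conditions mapping that the running prober was created with against the one freshly derived from the shoot spec via util.GetEffectiveNodeConditionsForWorkers. The comparison itself lives in the prober's AreWorkerNodeConditionsStale method, which is not part of this file; the sketch below only illustrates the idea, and the helper name and example condition values are hypothetical.

```go
package main

import (
	"fmt"
	"reflect"
)

// workerNodeConditionsStale is a hypothetical stand-in for the prober's
// AreWorkerNodeConditionsStale check: it reports whether the per-worker-pool
// node conditions derived from the current shoot spec differ from the ones the
// running prober was started with. When they differ, the controller
// unregisters the existing prober and starts a new one with the fresh mapping.
func workerNodeConditionsStale(existing, current map[string][]string) bool {
	return !reflect.DeepEqual(existing, current)
}

func main() {
	existing := map[string][]string{"worker-a": {"KernelDeadlock", "ReadonlyFilesystem", "DiskPressure"}}
	current := map[string][]string{"worker-a": {"KernelDeadlock", "ReadonlyFilesystem"}}
	fmt.Println(workerNodeConditionsStale(existing, current)) // true: the prober would be restarted
}
```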