From c6eed8707c8f163e1bd560d9b54437f9a497445a Mon Sep 17 00:00:00 2001 From: Abbas Ahmed Date: Wed, 12 Oct 2022 13:54:37 -0400 Subject: [PATCH] Add Allocation Service Metrics (#414) * added allocation time taken metric * fixed syntax errors * added grafana panel for allocations time taken metric * Added metric for allocation retries. * added grafana widgets for game state based metrics * Added 429, 404, and 500 error metrics. * Added 409 error metric. * addressing comments * Revert dashboard.json Co-authored-by: abbasahmed Co-authored-by: abbasahmed Co-authored-by: ghov Co-authored-by: abbasahmed --- .../controllers/allocation_api_server.go | 11 +++++ pkg/operator/controllers/metrics.go | 48 +++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/pkg/operator/controllers/allocation_api_server.go b/pkg/operator/controllers/allocation_api_server.go index 44496fdb..2e6cdf3e 100644 --- a/pkg/operator/controllers/allocation_api_server.go +++ b/pkg/operator/controllers/allocation_api_server.go @@ -280,6 +280,8 @@ func (s *AllocationApiServer) handleAllocationRequest(w http.ResponseWriter, r * return } + timeToAllocateStartTime := time.Now() + // allocation using the heap for i := 0; i < allocationTries; i++ { if i > 0 { @@ -289,6 +291,7 @@ func (s *AllocationApiServer) handleAllocationRequest(w http.ResponseWriter, r * if gs == nil { // pop from queue returned nil, this means no more game servers in this build tooManyRequestsError(w, s.logger, fmt.Errorf("not enough standingBy"), "there are not enough standingBy servers") + Allocations429ErrorsCounter.WithLabelValues(args.BuildID).Inc() return } @@ -316,10 +319,13 @@ func (s *AllocationApiServer) handleAllocationRequest(w http.ResponseWriter, r * if err != nil { if apierrors.IsConflict(err) { s.logger.Info("conflict error patching game server", "error", err, "sessionID", args.SessionID, "buildID", args.BuildID, "retry", i) + Allocations409ErrorsCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc() } else if apierrors.IsNotFound(err) { s.logger.Info("error not found patching game server", "error", err, "sessionID", args.SessionID, "buildID", args.BuildID, "retry", i) + Allocations404ErrorsCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc() } else { s.logger.Error(err, "uknown error patching game server", "sessionID", args.SessionID, "buildID", args.BuildID, "retry", i) + Allocations500ErrorsCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc() } // in case of any error, trigger a reconciliation for this GameServer object // so it's re-added to the queue @@ -339,10 +345,15 @@ func (s *AllocationApiServer) handleAllocationRequest(w http.ResponseWriter, r * err = json.NewEncoder(w).Encode(rs) if err != nil { internalServerError(w, s.logger, err, "encode json response") + Allocations500ErrorsCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc() return } s.logger.Info("Allocated GameServer", "name", gs2.Name, "sessionID", args.SessionID, "buildID", args.BuildID, "ip", gs2.Status.PublicIP, "ports", gs2.Status.Ports) AllocationsCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc() + if i > 0 { + AllocationsRetriesCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc() + } + AllocationsTimeTakenDuration.WithLabelValues(gs2.Labels[LabelBuildName]).Set(float64(time.Since(timeToAllocateStartTime).Milliseconds())) return } diff --git a/pkg/operator/controllers/metrics.go b/pkg/operator/controllers/metrics.go index 13f14aa7..cf2371d9 100644 --- a/pkg/operator/controllers/metrics.go +++ b/pkg/operator/controllers/metrics.go @@ -72,4 +72,52 @@ var ( }, []string{"BuildName"}, ) + AllocationsTimeTakenDuration = registry.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "thundernetes", + Name: "allocations_time_taken_duration", + Help: "Average time it took to allocate a GameServer", + }, + []string{"BuildName"}, + ) + AllocationsRetriesCounter = registry.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "thundernetes", + Name: "allocations_retried", + Help: "The number of times allocation had to be retried", + }, + []string{"BuildName"}, + ) + Allocations429ErrorsCounter = registry.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "thundernetes", + Name: "allocations_429", + Help: "The number of 429 (too many requests) errors during allocation", + }, + []string{"BuildName"}, + ) + Allocations404ErrorsCounter = registry.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "thundernetes", + Name: "allocations_404", + Help: "The number of 404 (not found) errors during allocation", + }, + []string{"BuildName"}, + ) + Allocations500ErrorsCounter = registry.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "thundernetes", + Name: "allocations_500", + Help: "The number of 500 (internal) errors during allocation", + }, + []string{"BuildName"}, + ) + Allocations409ErrorsCounter = registry.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "thundernetes", + Name: "allocations_409", + Help: "The number of 409 (request conflict) errors during allocation", + }, + []string{"BuildName"}, + ) )