diff --git a/components/supervisor/pkg/metrics/metrics.go b/components/supervisor/pkg/metrics/metrics.go index f188c398904d6f..850a838ca01a8f 100644 --- a/components/supervisor/pkg/metrics/metrics.go +++ b/components/supervisor/pkg/metrics/metrics.go @@ -13,6 +13,8 @@ import ( type SupervisorMetrics struct { IDEReadyDurationTotal *prometheus.HistogramVec InitializerHistogram *prometheus.HistogramVec + SSHTunnelOpenedTotal *prometheus.CounterVec + SSHTunnelClosedTotal *prometheus.CounterVec } func NewMetrics() *SupervisorMetrics { @@ -27,6 +29,14 @@ func NewMetrics() *SupervisorMetrics { Help: "initializer speed in bytes per second", Buckets: prometheus.ExponentialBuckets(1024*1024, 2, 12), }, []string{"kind"}), + SSHTunnelOpenedTotal: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "supervisor_ssh_tunnel_opened_total", + Help: "Total number of SSH tunnels opened by the supervisor", + }, []string{}), + SSHTunnelClosedTotal: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "supervisor_ssh_tunnel_closed_total", + Help: "Total number of SSH tunnels closed by the supervisor", + }, []string{"code"}), } } @@ -34,6 +44,8 @@ func (m *SupervisorMetrics) Register(registry *prometheus.Registry) error { metrics := []prometheus.Collector{ m.IDEReadyDurationTotal, m.InitializerHistogram, + m.SSHTunnelOpenedTotal, + m.SSHTunnelClosedTotal, } for _, metric := range metrics { diff --git a/components/supervisor/pkg/metrics/reporter.go b/components/supervisor/pkg/metrics/reporter.go index f7cc0e8759d2b6..03e3d4aa46fc95 100644 --- a/components/supervisor/pkg/metrics/reporter.go +++ b/components/supervisor/pkg/metrics/reporter.go @@ -43,6 +43,8 @@ func NewGrpcMetricsReporter(gitpodHost string) *GrpcMetricsReporter { "supervisor_initializer_bytes_second": true, "supervisor_client_handled_total": true, "supervisor_client_handling_seconds": true, + "supervisor_ssh_tunnel_opened_total": true, + "supervisor_ssh_tunnel_closed_total": true, }, values: make(map[string]float64), addCounter: func(name string, labels map[string]string, value uint64) { diff --git a/components/supervisor/pkg/supervisor/supervisor.go b/components/supervisor/pkg/supervisor/supervisor.go index 87a09edd38480e..15c6e371868cdc 100644 --- a/components/supervisor/pkg/supervisor/supervisor.go +++ b/components/supervisor/pkg/supervisor/supervisor.go @@ -23,6 +23,7 @@ import ( "os/exec" "os/signal" "path/filepath" + "regexp" "runtime" "runtime/debug" "strconv" @@ -418,7 +419,7 @@ func Run(options ...RunOption) { } wg.Add(1) - go startAPIEndpoint(ctx, cfg, &wg, apiServices, tunneledPortsService, metricsReporter, apiEndpointOpts...) + go startAPIEndpoint(ctx, cfg, &wg, apiServices, tunneledPortsService, metricsReporter, supervisorMetrics, topService, apiEndpointOpts...) wg.Add(1) go startSSHServer(ctx, cfg, &wg) @@ -1187,7 +1188,28 @@ func isBlacklistedEnvvar(name string) bool { return false } -func startAPIEndpoint(ctx context.Context, cfg *Config, wg *sync.WaitGroup, services []RegisterableService, tunneled *ports.TunneledPortsService, metricsReporter *metrics.GrpcMetricsReporter, opts ...grpc.ServerOption) { +var websocketCloseErrorPattern = regexp.MustCompile(`websocket: close (\d+)`) + +func extractCloseErrorCode(errStr string) string { + matches := websocketCloseErrorPattern.FindStringSubmatch(errStr) + if len(matches) < 2 { + return "unknown" + } + + return matches[1] +} + +func startAPIEndpoint( + ctx context.Context, + cfg *Config, + wg *sync.WaitGroup, + services []RegisterableService, + tunneled *ports.TunneledPortsService, + metricsReporter *metrics.GrpcMetricsReporter, + supervisorMetrics *metrics.SupervisorMetrics, + topService *TopService, + opts ...grpc.ServerOption, +) { defer wg.Done() defer log.Debug("startAPIEndpoint shutdown") @@ -1308,6 +1330,17 @@ func startAPIEndpoint(ctx context.Context, cfg *Config, wg *sync.WaitGroup, serv tunnelOverWebSocket(tunneled, conn) })) routes.Handle("/_supervisor/tunnel/ssh", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + var err error + supervisorMetrics.SSHTunnelOpenedTotal.WithLabelValues().Inc() + defer func() { + code := "unknown" + if err != nil { + code = extractCloseErrorCode(err.Error()) + } + supervisorMetrics.SSHTunnelClosedTotal.WithLabelValues(code).Inc() + }() + startTime := time.Now() + log := log.WithField("userAgent", r.Header.Get("user-agent")).WithField("remoteAddr", r.RemoteAddr) wsConn, err := upgrader.Upgrade(rw, r, nil) if err != nil { log.WithError(err).Error("tunnel ssh: upgrade to the WebSocket protocol failed") @@ -1331,13 +1364,21 @@ func startAPIEndpoint(ctx context.Context, cfg *Config, wg *sync.WaitGroup, serv go io.Copy(conn, conn2) _, err = io.Copy(conn2, conn) - if err != nil && !websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) { - log.WithError(err).Error("tunnel ssh: error returned from io.copy") + if err != nil { + var usedCpu, usedMemory int64 + data := topService.data + if data != nil && data.Cpu != nil { + usedCpu = data.Cpu.Used + } + if data != nil && data.Memory != nil { + usedMemory = data.Memory.Used + } + log.WithField("usedCpu", usedCpu).WithField("usedMemory", usedMemory).WithError(err).Error("tunnel ssh: error returned from io.copy") } conn.Close() conn2.Close() - log.Infof("tunnel ssh: Disconnect from %s", conn.RemoteAddr()) + log.WithField("duration", time.Since(startTime).Seconds()).Infof("tunnel ssh: Disconnect from %s", conn.RemoteAddr()) })) if cfg.DebugEnable { routes.Handle("/_supervisor/debug/tunnels", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { diff --git a/install/installer/pkg/components/ide-metrics/configmap.go b/install/installer/pkg/components/ide-metrics/configmap.go index 8c556cf793dd9f..f170f2d701eceb 100644 --- a/install/installer/pkg/components/ide-metrics/configmap.go +++ b/install/installer/pkg/components/ide-metrics/configmap.go @@ -336,6 +336,22 @@ func configmap(ctx *common.RenderContext) ([]runtime.Object, error) { }, }, }, + { + Name: "supervisor_ssh_tunnel_opened_total", + Help: "Total number of SSH tunnels opened by the supervisor", + Labels: []config.LabelAllowList{}, + }, + { + Name: "supervisor_ssh_tunnel_closed_total", + Help: "Total number of SSH tunnels closed by the supervisor", + Labels: []config.LabelAllowList{ + { + Name: "code", + AllowValues: []string{"*"}, + DefaultValue: "unknown", + }, + }, + }, } histogramMetrics := []config.HistogramMetricsConfiguration{