From b97d2b6e6c61ff86c7fa27a81b3f5174ea31263b Mon Sep 17 00:00:00 2001 From: Paulin Todev Date: Wed, 11 Dec 2024 15:54:01 +0000 Subject: [PATCH] Add a `/-/healthy` endpoint for monitoring component health (#2197) * Add an endpoint for monitoring health --------- Co-authored-by: Clayton Cornell <131809008+clayton-cornell@users.noreply.github.com> --- CHANGELOG.md | 2 + docs/sources/reference/_index.md | 2 +- docs/sources/reference/http/_index.md | 78 +++++++++++++++++++++++++ internal/service/http/http.go | 26 +++++++++ internal/service/http/http_test.go | 82 +++++++++++++++++++++++++-- 5 files changed, 184 insertions(+), 6 deletions(-) create mode 100644 docs/sources/reference/http/_index.md diff --git a/CHANGELOG.md b/CHANGELOG.md index ec8ba4d78d..439e9610cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ Main (unreleased) - Add `otelcol.receiver.influxdb` to convert influx metric into OTEL. (@EHSchmitt4395) +- Add a new `/-/healthy` endpoint which returns HTTP 500 if one or more components are unhealthy. 
(@ptodev) + ### Enhancements - Add second metrics sample to the support bundle to provide delta information (@dehaansa) diff --git a/docs/sources/reference/_index.md b/docs/sources/reference/_index.md index 7969d0ea58..d98ca48aa0 100644 --- a/docs/sources/reference/_index.md +++ b/docs/sources/reference/_index.md @@ -1,6 +1,6 @@ --- canonical: https://grafana.com/docs/alloy/latest/reference/ -description: The reference-level documentaiton for Grafana Aloy +description: The reference-level documentation for Grafana Alloy menuTitle: Reference title: Grafana Alloy Reference weight: 600 diff --git a/docs/sources/reference/http/_index.md b/docs/sources/reference/http/_index.md new file mode 100644 index 0000000000..f2f433e7b2 --- /dev/null +++ b/docs/sources/reference/http/_index.md @@ -0,0 +1,78 @@ +--- +canonical: https://grafana.com/docs/alloy/latest/reference/http/ +description: Learn about HTTP endpoints exposed by Grafana Alloy +title: The Grafana Alloy HTTP endpoints +menuTitle: HTTP endpoints +weight: 700 +--- + +# The {{% param "FULL_PRODUCT_NAME" %}} HTTP endpoints + +{{< param "FULL_PRODUCT_NAME" >}} has several HTTP endpoints that are available by default, regardless of which components you have configured. +You can use these HTTP endpoints to monitor, health check, and troubleshoot {{< param "PRODUCT_NAME" >}}. + +The HTTP server which exposes them is configured via the [http block](../config-blocks/http) +and the `--server.` [command line arguments](../cli/run). +For example, if you set the `--server.http.listen-addr` command line argument to `127.0.0.1:12345`, +you can query the `127.0.0.1:12345/metrics` endpoint to see the internal metrics of {{< param "PRODUCT_NAME" >}}. + +### /metrics + +The `/metrics` endpoint returns the internal metrics of {{< param "PRODUCT_NAME" >}} in the Prometheus exposition format. + +### /-/ready + +An {{< param "PRODUCT_NAME" >}} instance is ready once it has loaded its initial configuration.
+If the instance is ready, the `/-/ready` endpoint returns `HTTP 200 OK` and the message `Alloy is ready.` +Otherwise, if the instance is not ready, the `/-/ready` endpoint returns `HTTP 503 Service Unavailable` and the message `Alloy is not ready.` + +### /-/healthy + +When all {{< param "PRODUCT_NAME" >}} components are working correctly, all components are considered healthy. +If all components are healthy, the `/-/healthy` endpoint returns `HTTP 200 OK` and the message `All Alloy components are healthy.` +Otherwise, if any of the components are not working correctly, the `/-/healthy` endpoint returns `HTTP 500 Internal Server Error` and an error message. +You can also monitor component health through the {{< param "PRODUCT_NAME" >}} [UI](../../troubleshoot/debug#alloy-ui). + +```shell +$ curl localhost:12345/-/healthy +All Alloy components are healthy. +``` + +```shell +$ curl localhost:12345/-/healthy +unhealthy components: math.add +``` + +{{< admonition type="note" >}} +The `/-/healthy` endpoint isn't suitable for a [Kubernetes liveness probe][k8s-liveness]. + +An {{< param "PRODUCT_NAME" >}} instance that reports as unhealthy should not necessarily be restarted. +For example, a component may be unhealthy due to an invalid configuration or an unavailable external resource. +In this case, restarting {{< param "PRODUCT_NAME" >}} would not fix the problem. +A restart may make it worse, because it could stop the flow of telemetry in healthy pipelines. + +[k8s-liveness]: https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/ +{{< /admonition >}} + +### /-/reload + +The `/-/reload` endpoint reloads the {{< param "PRODUCT_NAME" >}} configuration file. +If the configuration file can't be reloaded, the `/-/reload` endpoint returns `HTTP 400 Bad Request` and an error message.
+ +```shell +$ curl localhost:12345/-/reload +config reloaded +``` + +```shell +$ curl localhost:12345/-/reload +error during the initial load: /Users/user1/Desktop/git.alloy:13:1: Failed to build component: loading custom component controller: custom component config not found in the registry, namespace: "math", componentName: "add" +``` + +### /-/support + +The `/-/support` endpoint returns a [support bundle](../../troubleshoot/support_bundle) that contains information about your {{< param "PRODUCT_NAME" >}} instance. You can use this information as a baseline when debugging an issue. + +### /debug/pprof + +The `/debug/pprof` endpoint returns a pprof Go [profile](../../troubleshoot/profile) that you can use to visualize and analyze profiling data. diff --git a/internal/service/http/http.go b/internal/service/http/http.go index 61425033ec..b20d0442cc 100644 --- a/internal/service/http/http.go +++ b/internal/service/http/http.go @@ -188,6 +188,32 @@ func (s *Service) Run(ctx context.Context, host service.Host) error { otelmux.WithTracerProvider(s.tracer), )) + // The implementation for "/-/healthy" is inspired by + // the "/components" web API endpoint in /internal/web/api/api.go + r.HandleFunc("/-/healthy", func(w http.ResponseWriter, r *http.Request) { + components, err := host.ListComponents("", component.InfoOptions{ + GetHealth: true, + }) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + unhealthyComponents := []string{} + for _, c := range components { + if c.Health.Health == component.HealthTypeUnhealthy { + unhealthyComponents = append(unhealthyComponents, c.ComponentName) + } + } + if len(unhealthyComponents) > 0 { + http.Error(w, "unhealthy components: "+strings.Join(unhealthyComponents, ", "), http.StatusInternalServerError) + return + } + + w.WriteHeader(http.StatusOK) // must precede the first body write + fmt.Fprintln(w, "All Alloy components are healthy.") + }) + r.Handle( "/metrics", promhttp.HandlerFor(s.gatherer, promhttp.HandlerOpts{}),
diff --git a/internal/service/http/http_test.go b/internal/service/http/http_test.go index 2481fcd6cb..52da9adfcd 100644 --- a/internal/service/http/http_test.go +++ b/internal/service/http/http_test.go @@ -3,6 +3,7 @@ package http import ( "context" "fmt" + "io" "net/http" "testing" @@ -43,6 +44,26 @@ func TestHTTP(t *testing.T) { require.NoError(t, err) defer resp.Body.Close() + buf, err := io.ReadAll(resp.Body) + require.Equal(t, "Alloy is ready.\n", string(buf)) + + require.Equal(t, http.StatusOK, resp.StatusCode) + }) + + util.Eventually(t, func(t require.TestingT) { + cli, err := config.NewClientFromConfig(config.HTTPClientConfig{}, "test") + require.NoError(t, err) + + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s/-/healthy", env.ListenAddr()), nil) + require.NoError(t, err) + + resp, err := cli.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + buf, err := io.ReadAll(resp.Body) + require.Equal(t, "All Alloy components are healthy.\n", string(buf)) + require.Equal(t, http.StatusOK, resp.StatusCode) }) } @@ -157,9 +178,53 @@ func Test_Toggle_TLS(t *testing.T) { } } +func TestUnhealthy(t *testing.T) { + ctx := componenttest.TestContext(t) + + env, err := newTestEnvironment(t) + require.NoError(t, err) + + env.components = []*component.Info{ + { + ID: component.ID{ + ModuleID: "", + LocalID: "testCompId", + }, + Label: "testCompLabel", + ComponentName: "testCompName", + Health: component.Health{ + Health: component.HealthTypeUnhealthy, + }, + }, + } + require.NoError(t, env.ApplyConfig("")) + + go func() { + require.NoError(t, env.Run(ctx)) + }() + + util.Eventually(t, func(t require.TestingT) { + cli, err := config.NewClientFromConfig(config.HTTPClientConfig{}, "test") + require.NoError(t, err) + + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s/-/healthy", env.ListenAddr()), nil) + require.NoError(t, err) + + resp, err := cli.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + buf, err := 
io.ReadAll(resp.Body) + require.Equal(t, "unhealthy components: testCompName\n", string(buf)) + + require.Equal(t, http.StatusInternalServerError, resp.StatusCode) + }) +} + type testEnvironment struct { - svc *Service - addr string + svc *Service + addr string + components []*component.Info } func newTestEnvironment(t *testing.T) (*testEnvironment, error) { @@ -196,12 +261,16 @@ func (env *testEnvironment) ApplyConfig(config string) error { } func (env *testEnvironment) Run(ctx context.Context) error { - return env.svc.Run(ctx, fakeHost{}) + return env.svc.Run(ctx, fakeHost{ + components: env.components, + }) } func (env *testEnvironment) ListenAddr() string { return env.addr } -type fakeHost struct{} +type fakeHost struct { + components []*component.Info +} var _ service.Host = (fakeHost{}) @@ -209,7 +278,10 @@ func (fakeHost) GetComponent(id component.ID, opts component.InfoOptions) (*comp return nil, fmt.Errorf("no such component %s", id) } -func (fakeHost) ListComponents(moduleID string, opts component.InfoOptions) ([]*component.Info, error) { +func (f fakeHost) ListComponents(moduleID string, opts component.InfoOptions) ([]*component.Info, error) { + if f.components != nil { + return f.components, nil + } if moduleID == "" { return nil, nil }