From c6980488264946d05bccbb56fd9181d829cb1988 Mon Sep 17 00:00:00 2001 From: Paulin Todev Date: Fri, 29 Nov 2024 13:06:13 +0000 Subject: [PATCH] Add an endpoint for monitoring health --- CHANGELOG.md | 2 + docs/sources/reference/_index.md | 2 +- docs/sources/reference/http/_index.md | 65 +++++++++++++++++++++ internal/service/http/http.go | 26 +++++++++ internal/service/http/http_test.go | 82 +++++++++++++++++++++++++-- 5 files changed, 171 insertions(+), 6 deletions(-) create mode 100644 docs/sources/reference/http/_index.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e0fe4427f..eff4a6a5e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,8 @@ Main (unreleased) - (_Experimental_) Add a `database_observability.mysql` component to collect mysql performance data. +- Add a new `/-/healthy` endpoint which returns HTTP 500 if one or more components are unhealthy. (@ptodev) + ### Enhancements - Add second metrics sample to the support bundle to provide delta information (@dehaansa) diff --git a/docs/sources/reference/_index.md b/docs/sources/reference/_index.md index 7969d0ea58..d98ca48aa0 100644 --- a/docs/sources/reference/_index.md +++ b/docs/sources/reference/_index.md @@ -1,6 +1,6 @@ --- canonical: https://grafana.com/docs/alloy/latest/reference/ -description: The reference-level documentaiton for Grafana Aloy +description: The reference-level documentation for Grafana Alloy menuTitle: Reference title: Grafana Alloy Reference weight: 600 diff --git a/docs/sources/reference/http/_index.md b/docs/sources/reference/http/_index.md new file mode 100644 index 0000000000..b2db8ed6bd --- /dev/null +++ b/docs/sources/reference/http/_index.md @@ -0,0 +1,65 @@ +--- +canonical: https://grafana.com/docs/alloy/latest/reference/http/ +description: Learn about HTTP endpoints exposed by Grafana Alloy +title: HTTP endpoints +weight: 700 +--- + +# The {{% param "FULL_PRODUCT_NAME" %}} HTTP endpoints + +There are HTTP endpoints which are enabled by default on every instance 
of {{% param "FULL_PRODUCT_NAME" %}}, +regardless of which components are configured. +They can be used for monitoring, health checking, and troubleshooting. + +The HTTP server which exposes them is configured via the [http block](../config-blocks/http) +and the `--server.` [command line arguments](../cli/run). +For example, if the `--server.http.listen-addr` command line argument is set to `127.0.0.1:12345`, +you can query the `127.0.0.1:12345/metrics` endpoint to see the internal metrics of {{% param "FULL_PRODUCT_NAME" %}}. + +### /metrics + +Displays the internal metrics of {{% param "FULL_PRODUCT_NAME" %}} in the Prometheus exposition format. + +### /-/ready + +A {{% param "FULL_PRODUCT_NAME" %}} instance is "ready" once it has loaded its initial configuration. +If it is ready, HTTP 200 and the message `Alloy is ready.` are returned. +Otherwise, HTTP 503 and the message `Alloy is not ready.` are returned. + +### /-/healthy + +If all components are healthy, HTTP 200 and the message `Alloy is healthy.` are returned. +Otherwise, {{% param "FULL_PRODUCT_NAME" %}} will return HTTP 500 and an error message. +You can also monitor component health through the [UI](../../troubleshoot/debug#alloy-ui). + +``` +$ curl localhost:12345/-/healthy +Alloy is healthy. +``` + +``` +$ curl localhost:12345/-/healthy +unhealthy components: math.add +``` + +### /-/reload + +Reloads the {{% param "FULL_PRODUCT_NAME" %}} configuration file. Returns HTTP 400 and an error message if an issue with the reload was encountered. + +``` +$ curl localhost:12345/-/reload +config reloaded +``` + +``` +$ curl localhost:12345/-/reload +error during the initial load: /Users/user1/Desktop/git.alloy:13:1: Failed to build component: loading custom component controller: custom component config not found in the registry, namespace: "math", componentName: "add" +``` + +### /-/support + +Generates a [support bundle](../../troubleshoot/support_bundle). 
+ +### /debug/pprof + +Generates a [profile](../../troubleshoot/profile). \ No newline at end of file diff --git a/internal/service/http/http.go b/internal/service/http/http.go index 61425033ec..c8a0b57d51 100644 --- a/internal/service/http/http.go +++ b/internal/service/http/http.go @@ -188,6 +188,32 @@ func (s *Service) Run(ctx context.Context, host service.Host) error { otelmux.WithTracerProvider(s.tracer), )) + // The implementation for "/-/healthy" is inspired by + // the "/components" web API endpoint in /internal/web/api/api.go + r.HandleFunc("/-/healthy", func(w http.ResponseWriter, r *http.Request) { + components, err := host.ListComponents("", component.InfoOptions{ + GetHealth: true, + }) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + unhealthyComponents := []string{} + for _, c := range components { + if c.Health.Health == component.HealthTypeUnhealthy { + unhealthyComponents = append(unhealthyComponents, c.ComponentName) + } + } + if len(unhealthyComponents) > 0 { + http.Error(w, "unhealthy components: "+strings.Join(unhealthyComponents, ", "), http.StatusInternalServerError) + return + } + + w.WriteHeader(http.StatusOK) // Set the status before the body: a WriteHeader call after the first write is ignored and logged as superfluous. + fmt.Fprintln(w, "Alloy is healthy.") + }) + r.Handle( "/metrics", promhttp.HandlerFor(s.gatherer, promhttp.HandlerOpts{}), diff --git a/internal/service/http/http_test.go b/internal/service/http/http_test.go index 2481fcd6cb..4a47b77964 100644 --- a/internal/service/http/http_test.go +++ b/internal/service/http/http_test.go @@ -3,6 +3,7 @@ package http import ( "context" "fmt" + "io" "net/http" "testing" @@ -43,6 +44,26 @@ func TestHTTP(t *testing.T) { require.NoError(t, err) defer resp.Body.Close() + buf, err := io.ReadAll(resp.Body) + require.Equal(t, "Alloy is ready.\n", string(buf)) + + require.Equal(t, http.StatusOK, resp.StatusCode) + }) + + util.Eventually(t, func(t require.TestingT) { + cli, err := config.NewClientFromConfig(config.HTTPClientConfig{}, "test") 
require.NoError(t, err) + + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s/-/healthy", env.ListenAddr()), nil) + require.NoError(t, err) + + resp, err := cli.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + buf, err := io.ReadAll(resp.Body) + require.Equal(t, "Alloy is healthy.\n", string(buf)) + require.Equal(t, http.StatusOK, resp.StatusCode) }) } @@ -157,9 +178,53 @@ func Test_Toggle_TLS(t *testing.T) { } } +func TestUnhealthy(t *testing.T) { + ctx := componenttest.TestContext(t) + + env, err := newTestEnvironment(t) + require.NoError(t, err) + + env.components = []*component.Info{ + { + ID: component.ID{ + ModuleID: "", + LocalID: "testCompId", + }, + Label: "testCompLabel", + ComponentName: "testCompName", + Health: component.Health{ + Health: component.HealthTypeUnhealthy, + }, + }, + } + require.NoError(t, env.ApplyConfig("")) + + go func() { + require.NoError(t, env.Run(ctx)) + }() + + util.Eventually(t, func(t require.TestingT) { + cli, err := config.NewClientFromConfig(config.HTTPClientConfig{}, "test") + require.NoError(t, err) + + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s/-/healthy", env.ListenAddr()), nil) + require.NoError(t, err) + + resp, err := cli.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + buf, err := io.ReadAll(resp.Body) + require.Equal(t, "unhealthy components: testCompName\n", string(buf)) + + require.Equal(t, http.StatusInternalServerError, resp.StatusCode) + }) +} + type testEnvironment struct { - svc *Service - addr string + svc *Service + addr string + components []*component.Info } func newTestEnvironment(t *testing.T) (*testEnvironment, error) { @@ -196,12 +261,16 @@ func (env *testEnvironment) ApplyConfig(config string) error { } func (env *testEnvironment) Run(ctx context.Context) error { - return env.svc.Run(ctx, fakeHost{}) + return env.svc.Run(ctx, fakeHost{ + components: env.components, + }) } func (env *testEnvironment) ListenAddr() string { return 
env.addr } -type fakeHost struct{} +type fakeHost struct { + components []*component.Info +} var _ service.Host = (fakeHost{}) @@ -209,7 +278,10 @@ func (fakeHost) GetComponent(id component.ID, opts component.InfoOptions) (*comp return nil, fmt.Errorf("no such component %s", id) } -func (fakeHost) ListComponents(moduleID string, opts component.InfoOptions) ([]*component.Info, error) { +func (f fakeHost) ListComponents(moduleID string, opts component.InfoOptions) ([]*component.Info, error) { + if f.components != nil { + return f.components, nil + } if moduleID == "" { return nil, nil }