Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Capture configuration files & runtime config in support bundle #2094

Merged
merged 11 commits into from
Nov 22, 2024
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ Main (unreleased)

- Add `otelcol.exporter.splunkhec`, which allows exporting OTel data to Splunk HEC (@adlotsof)

### Enhancements

- Add all raw configuration files & a copy of the latest remote config to the support bundle (@dehaansa)

### Bugfixes

- Fixed an issue in the `prometheus.exporter.postgres` component that would leak goroutines when the target was not reachable (@dehaansa)
Expand Down
5 changes: 4 additions & 1 deletion docs/sources/troubleshoot/support_bundle.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,11 @@ A support bundle contains the following data:
* `alloy-runtime-flags.txt` contains the values of the runtime flags available in {{< param "PRODUCT_NAME" >}}.
* The `pprof/` directory contains Go runtime profiling data (CPU, heap, goroutine, mutex, block profiles) as exported by the pprof package.
Refer to the [profile][profile] documentation for more details on how to use this information.
* The `sources/` directory contains copies of the local configuration files being used to configure {{< param "PRODUCT_NAME" >}}.
dehaansa marked this conversation as resolved.
Show resolved Hide resolved
* `sources/remote-config/remote.alloy` contains a copy of the last received [remote configuration][remotecfg].

[profile]: ../profile
[components]: ../../get-started/components/
[alloy-repo]: https://github.com/grafana/alloy/issues
[backward-compatibility]: ../../introduction/backward-compatibility
[backward-compatibility]: ../../introduction/backward-compatibility
[remotecfg]: ../../reference/config-blocks/remotecfg/
1 change: 1 addition & 0 deletions internal/alloycli/cmd_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ func (fr *alloyRun) Run(cmd *cobra.Command, configPath string) error {
if err != nil {
return nil, fmt.Errorf("reading config path %q: %w", configPath, err)
}
httpService.SetSources(alloySource.RawConfigs())
if err := f.LoadSource(alloySource, nil, configPath); err != nil {
return alloySource, fmt.Errorf("error during the initial load: %w", err)
}
Expand Down
114 changes: 70 additions & 44 deletions internal/service/http/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ type Service struct {
// Used to enforce single-flight requests to supportHandler
supportBundleMut sync.Mutex

// Track the raw config for use with the support bundle
sources map[string][]byte

// publicLis and tcpLis are used to lazily enable TLS, since TLS is
// optionally configurable at runtime.
//
Expand Down Expand Up @@ -225,7 +228,7 @@ func (s *Service) Run(ctx context.Context, host service.Host) error {
}

// Wire in support bundle generator
r.HandleFunc("/-/support", s.supportHandler).Methods("GET")
r.HandleFunc("/-/support", s.generateSupportBundleHandler(host)).Methods("GET")

// Wire custom service handlers for services which depend on the http
// service.
Expand Down Expand Up @@ -259,60 +262,75 @@ func (s *Service) Run(ctx context.Context, host service.Host) error {
return nil
}

func (s *Service) supportHandler(rw http.ResponseWriter, r *http.Request) {
s.supportBundleMut.Lock()
defer s.supportBundleMut.Unlock()
func (s *Service) generateSupportBundleHandler(host service.Host) func(rw http.ResponseWriter, r *http.Request) {
return func(rw http.ResponseWriter, r *http.Request) {
s.supportBundleMut.Lock()
defer s.supportBundleMut.Unlock()

// TODO(dehaansa) remove this check once the support bundle is generally available
if !s.opts.MinStability.Permits(featuregate.StabilityPublicPreview) {
rw.WriteHeader(http.StatusForbidden)
_, _ = rw.Write([]byte("support bundle generation is only available in public preview. Use" +
" --stability.level command-line flag to enable public-preview features"))
return
}
// TODO(dehaansa) remove this check once the support bundle is generally available
if !s.opts.MinStability.Permits(featuregate.StabilityPublicPreview) {
rw.WriteHeader(http.StatusForbidden)
_, _ = rw.Write([]byte("support bundle generation is only available in public preview. Use" +
" --stability.level command-line flag to enable public-preview features"))
return
}

if s.opts.BundleContext.DisableSupportBundle {
rw.WriteHeader(http.StatusForbidden)
_, _ = rw.Write([]byte("support bundle generation is disabled; it can be re-enabled by removing the --disable-support-bundle flag"))
return
}
if s.opts.BundleContext.DisableSupportBundle {
rw.WriteHeader(http.StatusForbidden)
_, _ = rw.Write([]byte("support bundle generation is disabled; it can be re-enabled by removing the --disable-support-bundle flag"))
return
}

duration := getServerWriteTimeout(r)
if r.URL.Query().Has("duration") {
d, err := strconv.Atoi(r.URL.Query().Get("duration"))
if err != nil {
http.Error(rw, fmt.Sprintf("duration value (in seconds) should be a positive integer: %s", err), http.StatusBadRequest)
return
}
if d < 1 {
http.Error(rw, "duration value (in seconds) should be larger than 1", http.StatusBadRequest)
return
}
if float64(d) > duration.Seconds() {
http.Error(rw, "duration value exceeds the server's write timeout", http.StatusBadRequest)
return
}
duration = time.Duration(d) * time.Second
}
ctx, cancel := context.WithTimeout(context.Background(), duration)
defer cancel()

duration := getServerWriteTimeout(r)
if r.URL.Query().Has("duration") {
d, err := strconv.Atoi(r.URL.Query().Get("duration"))
var logsBuffer bytes.Buffer
syncBuff := log.NewSyncWriter(&logsBuffer)
s.globalLogger.SetTemporaryWriter(syncBuff)
defer func() {
s.globalLogger.RemoveTemporaryWriter()
}()

cachedConfig, err := remoteCfgCachedConfig(host)
if err != nil {
http.Error(rw, fmt.Sprintf("duration value (in seconds) should be a positive integer: %s", err), http.StatusBadRequest)
return
level.Debug(s.log).Log("msg", "failed to get cached remote config", "err", err)
}
if d < 1 {
http.Error(rw, "duration value (in seconds) should be larger than 1", http.StatusBadRequest)

bundle, err := ExportSupportBundle(ctx, s.opts.BundleContext.RuntimeFlags, s.opts.HTTPListenAddr, s.sources, cachedConfig, s.Data().(Data).DialFunc)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if float64(d) > duration.Seconds() {
http.Error(rw, "duration value exceeds the server's write timeout", http.StatusBadRequest)
if err := ServeSupportBundle(rw, bundle, &logsBuffer); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
duration = time.Duration(d) * time.Second
}
ctx, cancel := context.WithTimeout(context.Background(), duration)
defer cancel()

var logsBuffer bytes.Buffer
syncBuff := log.NewSyncWriter(&logsBuffer)
s.globalLogger.SetTemporaryWriter(syncBuff)
defer func() {
s.globalLogger.RemoveTemporaryWriter()
}()
}

bundle, err := ExportSupportBundle(ctx, s.opts.BundleContext.RuntimeFlags, s.opts.HTTPListenAddr, s.Data().(Data).DialFunc)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if err := ServeSupportBundle(rw, bundle, &logsBuffer); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
// SetSources records the raw configuration sources that will be shipped with
// the next support bundle. It is meant to be called on every (re)load so the
// bundle reflects the configuration currently in use.
//
// The map is stored as-is (not copied); callers should not mutate it after
// handing it over. Keys are treated as file paths when the bundle is written.
//
// NOTE(review): this takes supportBundleMut, the same mutex the support bundle
// handler holds for the whole duration of a request, so a call made while a
// bundle is being generated waits until that request finishes.
func (s *Service) SetSources(sources map[string][]byte) {
	s.supportBundleMut.Lock()
	s.sources = sources
	s.supportBundleMut.Unlock()
}

func getServerWriteTimeout(r *http.Request) time.Duration {
Expand Down Expand Up @@ -582,6 +600,14 @@ func (lis *lazyListener) Addr() net.Addr {
return lis.inner.Addr()
}

// remoteCfgCachedConfig returns the cached remote configuration held by the
// remotecfg service registered with host.
//
// It returns an error when the remotecfg service is not registered or when the
// registered service is not a *remotecfg.Service. Any error reported by
// GetCachedConfig is returned unchanged.
func remoteCfgCachedConfig(host service.Host) ([]byte, error) {
	svc, ok := host.GetService(remotecfg.ServiceName)
	if !ok {
		return nil, fmt.Errorf("failed to get the remotecfg service")
	}

	// Checked assertion: this helper is called from the support bundle HTTP
	// handler, where an unexpected service implementation should be reported
	// as an error rather than panic the request.
	rc, ok := svc.(*remotecfg.Service)
	if !ok {
		return nil, fmt.Errorf("unexpected remotecfg service type %T", svc)
	}
	return rc.GetCachedConfig()
}

func remoteCfgHostProvider(host service.Host) func() (service.Host, error) {
return func() (service.Host, error) {
svc, ok := host.GetService(remotecfg.ServiceName)
Expand Down
33 changes: 21 additions & 12 deletions internal/service/http/supportbundle.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ type Bundle struct {
components []byte
peers []byte
runtimeFlags []byte
sources map[string][]byte
remoteCfg []byte
heapBuf *bytes.Buffer
goroutineBuf *bytes.Buffer
blockBuf *bytes.Buffer
Expand All @@ -49,7 +51,7 @@ type Metadata struct {
}

// ExportSupportBundle gathers the information required for the support bundle.
func ExportSupportBundle(ctx context.Context, runtimeFlags []string, srvAddress string, dialContext server.DialContextFunc) (*Bundle, error) {
func ExportSupportBundle(ctx context.Context, runtimeFlags []string, srvAddress string, sources map[string][]byte, remoteCfg []byte, dialContext server.DialContextFunc) (*Bundle, error) {
// The block profiler is disabled by default. Temporarily enable recording
// of all blocking events. Also, temporarily record all mutex contentions,
// and defer restoring of earlier mutex profiling fraction.
Expand Down Expand Up @@ -136,6 +138,8 @@ func ExportSupportBundle(ctx context.Context, runtimeFlags []string, srvAddress
alloyMetrics: alloyMetrics,
components: components,
peers: peers,
sources: sources,
remoteCfg: remoteCfg,
runtimeFlags: []byte(strings.Join(runtimeFlags, "\n")),
heapBuf: &heapBuf,
goroutineBuf: &goroutineBuf,
Expand Down Expand Up @@ -169,17 +173,22 @@ func ServeSupportBundle(rw http.ResponseWriter, b *Bundle, logsBuf *bytes.Buffer
rw.Header().Set("Content-Disposition", "attachment; filename=\"alloy-support-bundle.zip\"")

zipStructure := map[string][]byte{
"alloy-metadata.yaml": b.meta,
"alloy-components.json": b.components,
"alloy-peers.json": b.peers,
"alloy-metrics.txt": b.alloyMetrics,
"alloy-runtime-flags.txt": b.runtimeFlags,
"alloy-logs.txt": logsBuf.Bytes(),
"pprof/cpu.pprof": b.cpuBuf.Bytes(),
"pprof/heap.pprof": b.heapBuf.Bytes(),
"pprof/goroutine.pprof": b.goroutineBuf.Bytes(),
"pprof/mutex.pprof": b.mutexBuf.Bytes(),
"pprof/block.pprof": b.blockBuf.Bytes(),
"alloy-metadata.yaml": b.meta,
"alloy-components.json": b.components,
"alloy-peers.json": b.peers,
"alloy-metrics.txt": b.alloyMetrics,
"alloy-runtime-flags.txt": b.runtimeFlags,
"alloy-logs.txt": logsBuf.Bytes(),
"sources/remote-config/remote.alloy": b.remoteCfg,
"pprof/cpu.pprof": b.cpuBuf.Bytes(),
"pprof/heap.pprof": b.heapBuf.Bytes(),
"pprof/goroutine.pprof": b.goroutineBuf.Bytes(),
"pprof/mutex.pprof": b.mutexBuf.Bytes(),
"pprof/block.pprof": b.blockBuf.Bytes(),
}

for p, s := range b.sources {
zipStructure[filepath.Join("sources", filepath.Base(p))] = s
}

for fn, b := range zipStructure {
Expand Down
4 changes: 2 additions & 2 deletions internal/service/remotecfg/remotecfg.go
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ func (s *Service) fetchRemote() error {
}

func (s *Service) fetchLocal() {
b, err := s.getCachedConfig()
b, err := s.GetCachedConfig()
if err != nil {
level.Error(s.opts.Logger).Log("msg", "failed to read from cache", "err", err)
return
Expand Down Expand Up @@ -440,7 +440,7 @@ func (s *Service) getAPIConfig() ([]byte, error) {
return []byte(gcr.Msg.GetContent()), nil
}

func (s *Service) getCachedConfig() ([]byte, error) {
func (s *Service) GetCachedConfig() ([]byte, error) {
s.mut.RLock()
p := s.dataPath
s.mut.RUnlock()
Expand Down
Loading