From b72bda0a420f75ea0439cc0240dcf6d3363e5d48 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Wed, 25 Dec 2024 16:58:31 +0400 Subject: [PATCH] fix: talosctl support and race tests 1. Don't set max cgroups limit if race mode is enabled (only in test mode). When e.g. apid/trustd are built with race detector on, they consume 10x the memory. 2. Fix a data race in `talosctl support` when showing UI progress. 3. Fix an issue pulling `kubeconfig` in `talosctl support` - pull from endpoints (controlplanes) without setting any nodes. Fixes #10036 Signed-off-by: Andrey Smirnov --- .github/workflows/ci.yaml | 4 ++- .../workflows/integration-qemu-race-cron.yaml | 4 ++- .kres.yaml | 2 ++ cmd/talosctl/cmd/talos/support.go | 34 ++++++++++++------- go.mod | 2 +- go.sum | 4 +-- internal/app/machined/pkg/startup/cgroups.go | 17 ++++++++-- pkg/cluster/crashdump.go | 8 ++++- 8 files changed, 53 insertions(+), 22 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ff3947b3b2..fa2cf737d8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2024-12-24T15:00:58Z by kres fcff05e. +# Generated on 2024-12-25T15:13:54Z by kres fcff05e. name: default concurrency: @@ -3367,6 +3367,8 @@ jobs: QEMU_EXTRA_DISKS: "3" QEMU_EXTRA_DISKS_DRIVERS: ide,nvme QEMU_EXTRA_DISKS_SIZE: "10240" + QEMU_MEMORY_CONTROLPLANES: "4096" + QEMU_MEMORY_WORKERS: "4096" TAG_SUFFIX: -race WITH_CONFIG_PATCH_WORKER: '@hack/test/patches/ephemeral-nvme.yaml:@hack/test/patches/dm-raid-module.yaml' run: | diff --git a/.github/workflows/integration-qemu-race-cron.yaml b/.github/workflows/integration-qemu-race-cron.yaml index 1444c5564f..a24f797184 100644 --- a/.github/workflows/integration-qemu-race-cron.yaml +++ b/.github/workflows/integration-qemu-race-cron.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2024-11-28T13:53:18Z by kres 232fe63. +# Generated on 2024-12-25T15:13:54Z by kres fcff05e. name: integration-qemu-race-cron concurrency: @@ -94,6 +94,8 @@ jobs: QEMU_EXTRA_DISKS: "3" QEMU_EXTRA_DISKS_DRIVERS: ide,nvme QEMU_EXTRA_DISKS_SIZE: "10240" + QEMU_MEMORY_CONTROLPLANES: "4096" + QEMU_MEMORY_WORKERS: "4096" TAG_SUFFIX: -race WITH_CONFIG_PATCH_WORKER: '@hack/test/patches/ephemeral-nvme.yaml:@hack/test/patches/dm-raid-module.yaml' run: | diff --git a/.kres.yaml b/.kres.yaml index 6456d9bb2b..10a602c457 100644 --- a/.kres.yaml +++ b/.kres.yaml @@ -1283,6 +1283,8 @@ spec: QEMU_EXTRA_DISKS_SIZE: "10240" QEMU_EXTRA_DISKS_DRIVERS: "ide,nvme" WITH_CONFIG_PATCH_WORKER: "@hack/test/patches/ephemeral-nvme.yaml:@hack/test/patches/dm-raid-module.yaml" + QEMU_MEMORY_CONTROLPLANES: 4096 # race-enabled Talos consumes lots of RAM + QEMU_MEMORY_WORKERS: 4096 TAG_SUFFIX: -race IMAGE_REGISTRY: registry.dev.siderolabs.io - name: save-talos-logs diff --git a/cmd/talosctl/cmd/talos/support.go b/cmd/talosctl/cmd/talos/support.go index dad44e2b6b..b412b773bb 100644 --- a/cmd/talosctl/cmd/talos/support.go +++ b/cmd/talosctl/cmd/talos/support.go @@ -12,6 +12,7 @@ import ( "io" "os" "strings" + "sync" "text/tabwriter" "github.com/cosi-project/runtime/pkg/resource" @@ -111,7 +112,7 @@ var supportCmd = &cobra.Command{ } func collectData(dest *os.File, progress chan bundle.Progress) error { - return WithClient(func(ctx context.Context, c *client.Client) error { + return WithClientNoNodes(func(ctx context.Context, c *client.Client) error { clientset, err := getKubernetesClient(ctx, c) if err != nil { fmt.Fprintf(os.Stderr, "Failed to create kubernetes client %s\n", err) @@ -142,11 +143,7 @@ func collectData(dest *os.File, progress chan bundle.Progress) error { } func getKubernetesClient(ctx context.Context, c *client.Client) (*k8s.Clientset, error) { - if len(GlobalArgs.Endpoints) == 0 { - fmt.Fprintln(os.Stderr, "No endpoints set for the cluster, the command might not be able to get kubeconfig") - } - - kubeconfig, err := c.Kubeconfig(client.WithNodes(ctx, GlobalArgs.Endpoints...)) + kubeconfig, err := c.Kubeconfig(ctx) if err != nil { return nil, err } @@ -284,6 +281,7 @@ func showProgress(progress <-chan bundle.Progress, errors *supportBundleErrors) uiprogress.Start() type nodeProgress struct { + mu sync.Mutex state string bar *uiprogress.Bar } @@ -298,29 +296,39 @@ func showProgress(progress <-chan bundle.Progress, errors *supportBundleErrors) ok bool ) - if np, ok = nodes[p.Source]; !ok { + src := p.Source + + if _, ok = nodes[p.Source]; !ok { bar := uiprogress.AddBar(p.Total) bar = bar.AppendCompleted().PrependElapsed() - src := p.Source - np = &nodeProgress{ state: "initializing...", bar: bar, } - bar.AppendFunc(func(b *uiprogress.Bar) string { - return fmt.Sprintf("%s: %s", src, np.state) - }) + bar.AppendFunc( + func(src string, np *nodeProgress) func(b *uiprogress.Bar) string { + return func(b *uiprogress.Bar) string { + np.mu.Lock() + defer np.mu.Unlock() + + return fmt.Sprintf("%s: %s", src, np.state) + } + }(src, np), + ) bar.Width = 20 nodes[src] = np } else { - np = nodes[p.Source] + np = nodes[src] } + np.mu.Lock() np.state = p.State + np.mu.Unlock() + np.bar.Incr() } diff --git a/go.mod b/go.mod index 94dde12490..5c3d89d719 100644 --- a/go.mod +++ b/go.mod @@ -157,7 +157,7 @@ require ( github.com/siderolabs/go-retry v0.3.3 github.com/siderolabs/go-smbios v0.3.3 github.com/siderolabs/go-tail v0.1.1 - github.com/siderolabs/go-talos-support v0.1.1 + github.com/siderolabs/go-talos-support v0.1.2 github.com/siderolabs/grpc-proxy v0.5.1 github.com/siderolabs/kms-client v0.1.0 github.com/siderolabs/net v0.4.0 diff --git a/go.sum b/go.sum index a22a6029e0..bef57d2001 100644 --- a/go.sum +++ b/go.sum @@ -675,8 +675,8 @@ github.com/siderolabs/go-smbios v0.3.3 h1:rM3UKHQ8in1mqNRkpV75Ls3Wnk6rAhQJVYKUsK github.com/siderolabs/go-smbios v0.3.3/go.mod h1:kScnr0XSyzLfkRo/ChjITgI0rPRQnIi6PdgbxVCwA9U= github.com/siderolabs/go-tail v0.1.1 h1:3XeJgd97OHyFAIE7nQEMcRhOfnv7DvXbu0BRKbtT6u8= github.com/siderolabs/go-tail v0.1.1/go.mod h1:IihAL39acadXHfb5fEAOKK2DaDFIrG2+VD3b2H/ziZ0= -github.com/siderolabs/go-talos-support v0.1.1 h1:g51J0WQssQAycU/0cDliC2l4uX2H02yUs2+fa5pCvHg= -github.com/siderolabs/go-talos-support v0.1.1/go.mod h1:o4woiYS+2J3djCQgyHZRVZQm8XpazQr+XPcTXAZvamo= +github.com/siderolabs/go-talos-support v0.1.2 h1:xKFwT8emzxpmamIe3W35QlmadC54OaPNO9/Y+fL7WwM= +github.com/siderolabs/go-talos-support v0.1.2/go.mod h1:o9zRfWJQhW5j3PQxs7v0jmG4igD4peDatqbAGQFe4oo= github.com/siderolabs/grpc-proxy v0.5.1 h1:WTZYLMPTZPt43BzEJ02LT9kYA9qAfquWwCezc6NPPYE= github.com/siderolabs/grpc-proxy v0.5.1/go.mod h1:EQwE87LiWxhiIUPBeWmpjJb9DIWxWID8R6ARtdTC+8A= github.com/siderolabs/kms-client v0.1.0 h1:rCDWzcDDsNlp6zdyLngOuuhchVILn+vwUQy3tk6rQps= diff --git a/internal/app/machined/pkg/startup/cgroups.go b/internal/app/machined/pkg/startup/cgroups.go index a71114fe37..cce62e5ea1 100644 --- a/internal/app/machined/pkg/startup/cgroups.go +++ b/internal/app/machined/pkg/startup/cgroups.go @@ -14,6 +14,7 @@ import ( "github.com/containerd/cgroups/v3/cgroup1" "github.com/containerd/cgroups/v3/cgroup2" "github.com/opencontainers/runtime-spec/specs-go" + "github.com/siderolabs/go-debug" "github.com/siderolabs/go-pointer" "go.uber.org/zap" @@ -22,6 +23,16 @@ import ( "github.com/siderolabs/talos/pkg/machinery/constants" ) +func zeroIfRace[T any](v T) T { + if debug.RaceEnabled { + var zeroT T + + return zeroT + } + + return v +} + // CreateSystemCgroups creates system cgroups. // //nolint:gocyclo @@ -130,7 +141,7 @@ func CreateSystemCgroups(ctx context.Context, log *zap.Logger, rt runtime.Runtim name: constants.CgroupDashboard, resources: &cgroup2.Resources{ Memory: &cgroup2.Memory{ - Max: pointer.To[int64](constants.CgroupDashboardMaxMemory), + Max: zeroIfRace(pointer.To[int64](constants.CgroupDashboardMaxMemory)), }, CPU: &cgroup2.CPU{ Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupDashboardMillicores))), @@ -143,7 +154,7 @@ func CreateSystemCgroups(ctx context.Context, log *zap.Logger, rt runtime.Runtim Memory: &cgroup2.Memory{ Min: pointer.To[int64](constants.CgroupApidReservedMemory), Low: pointer.To[int64](constants.CgroupApidReservedMemory * 2), - Max: pointer.To[int64](constants.CgroupApidMaxMemory), + Max: zeroIfRace(pointer.To[int64](constants.CgroupApidMaxMemory)), }, CPU: &cgroup2.CPU{ Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupApidMillicores))), @@ -156,7 +167,7 @@ func CreateSystemCgroups(ctx context.Context, log *zap.Logger, rt runtime.Runtim Memory: &cgroup2.Memory{ Min: pointer.To[int64](constants.CgroupTrustdReservedMemory), Low: pointer.To[int64](constants.CgroupTrustdReservedMemory * 2), - Max: pointer.To[int64](constants.CgroupTrustdMaxMemory), + Max: zeroIfRace(pointer.To[int64](constants.CgroupTrustdMaxMemory)), }, CPU: &cgroup2.CPU{ Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupTrustdMillicores))), diff --git a/pkg/cluster/crashdump.go b/pkg/cluster/crashdump.go index c343cb86d1..38027b0f1b 100644 --- a/pkg/cluster/crashdump.go +++ b/pkg/cluster/crashdump.go @@ -9,6 +9,7 @@ import ( "fmt" "io" "os" + "time" "github.com/siderolabs/gen/xslices" "github.com/siderolabs/go-talos-support/support" @@ -33,6 +34,10 @@ func Crashdump(ctx context.Context, cluster provision.Cluster, logWriter io.Writ defer supportFile.Close() //nolint:errcheck + // limit support bundle generation time + ctx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + c, err := client.New(ctx, client.WithDefaultConfig()) if err != nil { fmt.Fprintf(logWriter, "error creating crashdump: %s\n", err) @@ -50,7 +55,8 @@ func Crashdump(ctx context.Context, cluster provision.Cluster, logWriter io.Writ bundle.WithArchiveOutput(supportFile), bundle.WithTalosClient(c), bundle.WithNodes(nodes...), - bundle.WithNumWorkers(1), + bundle.WithNumWorkers(4), + bundle.WithLogOutput(io.Discard), } kubeclient, err := getKubernetesClient(ctx, c, controlplane)