From 1f2d25d8ff94205ef853ce2dd9239483b658d7b4 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Fri, 25 Aug 2023 16:40:45 -0600 Subject: [PATCH 01/20] Add deterministic behavior for calls against a single host: By queuing requests per host we now provide deterministic behavior of calls. Signed-off-by: Jacob Weinstock --- go.sum | 10 +- grpc/rpc/bmc.go | 8 +- grpc/rpc/machine.go | 4 +- grpc/rpc/task_test.go | 2 +- grpc/server.go | 4 + grpc/taskrunner/queue.go | 44 ++++++ grpc/taskrunner/taskrunner.go | 211 +++++++++++++++++++---------- grpc/taskrunner/taskrunner_test.go | 2 +- pkg/task/task.go | 2 +- 9 files changed, 198 insertions(+), 89 deletions(-) create mode 100644 grpc/taskrunner/queue.go diff --git a/go.sum b/go.sum index 858bb36..9a77975 100644 --- a/go.sum +++ b/go.sum @@ -52,8 +52,6 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bmc-toolbox/bmclib v0.5.7 h1:v3CqOJCMUuH+kA+xi7CdY5EuzUhMH9gsBkYTQMYlbog= github.com/bmc-toolbox/bmclib v0.5.7/go.mod h1:jSCb2/o2bZhTTg3IgShckCfFxkX4yqQC065tuYh2pKk= -github.com/bmc-toolbox/bmclib/v2 v2.0.1-0.20230324092939-d39fb75b6aa9 h1:UNtiASZUNvhF7Dr2qdqQy63VjddnpxS4bH3f+SQc/yQ= -github.com/bmc-toolbox/bmclib/v2 v2.0.1-0.20230324092939-d39fb75b6aa9/go.mod h1:iRhgD8P0gvy95wYXA3FDCKbo/aRiKBaodBBgoUG/+Qg= github.com/bmc-toolbox/bmclib/v2 v2.0.1-0.20230515164712-2714c7479477 h1:2GKBUqU+hrthvhEJyvJMj473uUQ7ByufchSftLNLS8E= github.com/bmc-toolbox/bmclib/v2 v2.0.1-0.20230515164712-2714c7479477/go.mod h1:a3Ra0ce/LV3wAj7AHuphlHNTx5Sg67iQqtLGr1zoqio= github.com/bmc-toolbox/common v0.0.0-20230220061748-93ff001f4a1d h1:cQ30Wa8mhLzK1TSOG+g3FlneIsXtFgun61mmPwVPmD0= @@ -236,12 +234,8 @@ github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1: github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/jacobweinstock/iamt v0.0.0-20230304043040-a6b4a1001123 h1:Mh2eOcadGcu/6E0bq5FfaGlFYFe2oyNOeRjpgC1vOq0= -github.com/jacobweinstock/iamt v0.0.0-20230304043040-a6b4a1001123/go.mod h1:FgmiLTU6cJewV4Xgrq6m5o8CUlTQOJtqzaFLGA0mG+E= github.com/jacobweinstock/iamt v0.0.0-20230502042727-d7cdbe67d9ef h1:G4k02HGmBUfJFSNu3gfKJ+ki+B3qutKsYzYndkqqKc4= github.com/jacobweinstock/iamt v0.0.0-20230502042727-d7cdbe67d9ef/go.mod h1:FgmiLTU6cJewV4Xgrq6m5o8CUlTQOJtqzaFLGA0mG+E= -github.com/jacobweinstock/registrar v0.4.6 h1:0O3g2jT2Lx+Bf+yl4QsMUN48fVZxUpM3kS+NtIJ+ucw= -github.com/jacobweinstock/registrar v0.4.6/go.mod h1:IDx65tQ7DLJ2UqiVjE1zo74jMZZfel9YZW8VrC26m6o= github.com/jacobweinstock/registrar v0.4.7 h1:s4dOExccgD+Pc7rJC+f3Mc3D+NXHcXUaOibtcEsPxOc= github.com/jacobweinstock/registrar v0.4.7/go.mod h1:PWmkdGFG5/ZdCqgMo7pvB3pXABOLHc5l8oQ0sgmBNDU= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= @@ -344,8 +338,6 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.15.0 h1:js3yy885G8xwJa6iOISGFwd+qlUo5AvyXb7CiihdtiU= github.com/spf13/viper v1.15.0/go.mod h1:fFcTBJxvhhzSJiZy8n+PeW6t8l+KeT/uTARa0jHOQLA= -github.com/stmcginnis/gofish 
v0.13.1-0.20230130123602-c77017d5737a h1:FJWP5Gv6qUSa+hfkOMpWtVdpWEl/jl+QZfwRIA1mK9E= -github.com/stmcginnis/gofish v0.13.1-0.20230130123602-c77017d5737a/go.mod h1:BLDSFTp8pDlf/xDbLZa+F7f7eW0E/CHCboggsu8CznI= github.com/stmcginnis/gofish v0.14.0 h1:geECNAiG33JDB2x2xDkerpOOuXFqxp5YP3EFE3vd5iM= github.com/stmcginnis/gofish v0.14.0/go.mod h1:BLDSFTp8pDlf/xDbLZa+F7f7eW0E/CHCboggsu8CznI= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -396,7 +388,7 @@ go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= -go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk= +go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= diff --git a/grpc/rpc/bmc.go b/grpc/rpc/bmc.go index 3d5fe62..1e2ce7a 100644 --- a/grpc/rpc/bmc.go +++ b/grpc/rpc/bmc.go @@ -66,7 +66,7 @@ func (b *BmcService) Reset(ctx context.Context, in *v1.ResetRequest) (*v1.ResetR defer cancel() return "", t.BMCReset(taskCtx, in.ResetKind.String()) } - b.TaskRunner.Execute(ctx, l, "bmc reset", taskID, execFunc) + b.TaskRunner.Execute(ctx, l, "bmc reset", taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) return &v1.ResetResponse{TaskId: taskID}, nil } @@ -101,7 +101,7 @@ func (b *BmcService) CreateUser(ctx context.Context, in *v1.CreateUserRequest) ( defer cancel() return "", t.CreateUser(taskCtx) } - b.TaskRunner.Execute(ctx, l, "creating user", taskID, execFunc) + b.TaskRunner.Execute(ctx, l, "creating user", taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) return &v1.CreateUserResponse{TaskId: taskID}, nil } @@ -136,7 +136,7 @@ func (b *BmcService) UpdateUser(ctx context.Context, in *v1.UpdateUserRequest) ( defer cancel() return "", t.UpdateUser(taskCtx) } - b.TaskRunner.Execute(ctx, l, "updating user", taskID, execFunc) + b.TaskRunner.Execute(ctx, l, "updating user", taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) return &v1.UpdateUserResponse{TaskId: taskID}, nil } @@ -169,7 +169,7 @@ func (b *BmcService) DeleteUser(ctx context.Context, in *v1.DeleteUserRequest) ( defer cancel() return "", t.DeleteUser(taskCtx) } - b.TaskRunner.Execute(ctx, l, "deleting user", taskID, execFunc) + b.TaskRunner.Execute(ctx, l, "deleting user", taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) return &v1.DeleteUserResponse{TaskId: taskID}, nil } diff --git a/grpc/rpc/machine.go b/grpc/rpc/machine.go index ac475ce..01007b7 100644 --- a/grpc/rpc/machine.go +++ b/grpc/rpc/machine.go @@ -54,7 +54,7 @@ func (m *MachineService) BootDevice(ctx context.Context, in *v1.DeviceRequest) ( defer cancel() return mbd.BootDeviceSet(taskCtx, in.BootDevice.String(), in.Persistent, in.EfiBoot) } - m.TaskRunner.Execute(ctx, l, "setting boot device", taskID, execFunc) + m.TaskRunner.Execute(ctx, l, "setting boot device", taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) return &v1.DeviceResponse{TaskId: taskID}, nil } @@ -89,7 +89,7 @@ func (m *MachineService) Power(ctx context.Context, in *v1.PowerRequest) (*v1.Po defer cancel() return 
mp.PowerSet(taskCtx, in.PowerAction.String()) } - m.TaskRunner.Execute(ctx, l, "power action: "+in.GetPowerAction().String(), taskID, execFunc) + m.TaskRunner.Execute(ctx, l, "power action: "+in.GetPowerAction().String(), taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) return &v1.PowerResponse{TaskId: taskID}, nil } diff --git a/grpc/rpc/task_test.go b/grpc/rpc/task_test.go index d138415..f8533b0 100644 --- a/grpc/rpc/task_test.go +++ b/grpc/rpc/task_test.go @@ -34,7 +34,7 @@ func TestTaskFound(t *testing.T) { Ctx: ctx, } taskID := xid.New().String() - taskRunner.Execute(ctx, logger, "test", taskID, func(s chan string) (string, error) { + taskRunner.Execute(ctx, logger, "test", taskID, "123", func(s chan string) (string, error) { return "doing cool stuff", defaultError }) diff --git a/grpc/server.go b/grpc/server.go index d512f0a..0e547cb 100644 --- a/grpc/server.go +++ b/grpc/server.go @@ -21,6 +21,7 @@ import ( "github.com/tinkerbell/pbnj/pkg/repository" "google.golang.org/grpc" "google.golang.org/grpc/health/grpc_health_v1" + "google.golang.org/grpc/reflection" ) // Server options. @@ -80,7 +81,9 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po taskRunner := &taskrunner.Runner{ Repository: defaultServer.Actions, Ctx: ctx, + Dispatcher: taskrunner.NewDispatcher(), } + go taskRunner.Start(ctx) ms := rpc.MachineService{ TaskRunner: taskRunner, @@ -114,6 +117,7 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po } httpServer.WithTaskRunner(taskRunner) + reflection.Register(grpcServer) go func() { err := httpServer.Run() diff --git a/grpc/taskrunner/queue.go b/grpc/taskrunner/queue.go new file mode 100644 index 0000000..7e70713 --- /dev/null +++ b/grpc/taskrunner/queue.go @@ -0,0 +1,44 @@ +package taskrunner + +import ( + "sync" + + "github.com/pkg/errors" +) + +type IngestQueue struct { + mu sync.Mutex + queue []Ingest +} + +type Ingest struct { + ID string + Host string + Description string + Action func(chan string) (string, error) +} + +func NewIngestQueue() *IngestQueue { + return &IngestQueue{ + queue: []Ingest{}, + } +} + +// Enqueue inserts the item into the queue +func (i *IngestQueue) Enqueue(item Ingest) { + i.mu.Lock() + defer i.mu.Unlock() + i.queue = append(i.queue, item) +} + +// Dequeue removes the oldest element from the queue. FIFO. +func (i *IngestQueue) Dequeue() (Ingest, error) { + i.mu.Lock() + defer i.mu.Unlock() + if len(i.queue) > 0 { + item := i.queue[0] + i.queue = i.queue[1:] + return item, nil + } + return Ingest{}, errors.New("queue is empty") +} diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index b14d347..1f8dd5b 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -5,11 +5,11 @@ import ( "net" "net/url" "sync" + "sync/atomic" "syscall" "time" "github.com/go-logr/logr" - "github.com/hashicorp/go-multierror" "github.com/pkg/errors" "github.com/tinkerbell/pbnj/pkg/metrics" @@ -20,58 +20,108 @@ import ( type Runner struct { Repository repository.Actions Ctx context.Context - active int - total int - counterMu sync.RWMutex + active atomic.Int32 + total atomic.Int32 + Dispatcher *dispatcher +} + +type dispatcher struct { + // IngestQueue is a queue of jobs that is process synchronously. + // It's the entry point for all jobs. + IngestQueue *IngestQueue + // perID hold a queue per ID. + // jobs across different IDs are processed concurrently. + // jobs with the same ID are processed synchronously. 
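+	// Example (hypothetical hosts): two Execute calls for "10.0.0.1" run
+	// one after the other, in arrival order, while a call for "10.0.0.2"
+	// proceeds concurrently on its own worker.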
+ perID sync.Map + goroutinePerID atomic.Int32 + idleWorkerShutdown time.Duration + log logr.Logger +} + +type taskChannel struct { + ch chan Ingest +} + +func NewDispatcher() *dispatcher { + return &dispatcher{ + IngestQueue: NewIngestQueue(), + perID: sync.Map{}, + idleWorkerShutdown: time.Second * 10, + } } // ActiveWorkers returns a count of currently active worker jobs. func (r *Runner) ActiveWorkers() int { - r.counterMu.RLock() - defer r.counterMu.RUnlock() - return r.active + return int(r.active.Load()) } // TotalWorkers returns a count total workers executed. func (r *Runner) TotalWorkers() int { - r.counterMu.RLock() - defer r.counterMu.RUnlock() - return r.total + return int(r.total.Load()) } -// Execute a task, update repository with status. -func (r *Runner) Execute(ctx context.Context, l logr.Logger, description, taskID string, action func(chan string) (string, error)) { - go r.worker(ctx, l, description, taskID, action) +func (d *Runner) Start(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + default: + elem, err := d.Dispatcher.IngestQueue.Dequeue() + if err != nil { + break + } + + ch := make(chan Ingest) + v, loaded := d.Dispatcher.perID.LoadOrStore(elem.Host, taskChannel{ch: ch}) + if !loaded { + go d.worker1(ctx, elem.Host, ch) + d.Dispatcher.goroutinePerID.Add(1) + } + v.(taskChannel).ch <- elem + } + } } -// does the work, updates the repo record -// TODO handle retrys, use a timeout. -func (r *Runner) worker(_ context.Context, logger logr.Logger, description, taskID string, action func(chan string) (string, error)) { - logger = logger.WithValues("taskID", taskID, "description", description) - r.counterMu.Lock() - r.active++ - r.total++ - r.counterMu.Unlock() +// channelWorker is a worker that listens on a channel for jobs. +// It will shutdown the worker after gc duration of no elements in the channel or the context is canceled. +// worker is in charge of its own lifecycle. +func (q *Runner) worker1(ctx context.Context, id string, ch chan Ingest) { defer func() { - r.counterMu.Lock() - r.active-- - r.counterMu.Unlock() + // do i need to delete the channel if i delete the map entry it lives in? + // close(ch) + q.Dispatcher.perID.Delete(id) + q.Dispatcher.goroutinePerID.Add(-1) }() + for { + select { + case <-ctx.Done(): + return + case t := <-ch: + // execute the task synchronously + q.worker(ctx, q.Dispatcher.log, t.Description, t.ID, t.Action) + case <-time.After(q.Dispatcher.idleWorkerShutdown): + // shutdown the worker after this duration of no elements in the channel. + return + } + } +} - metrics.TasksTotal.Inc() - metrics.TasksActive.Inc() - defer metrics.TasksActive.Dec() +// Execute a task, update repository with status. 
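+// Execute only enqueues the task; it returns before the action runs. Callers
+// poll Status with the returned taskID. Sketch (host and fn are hypothetical):
+//
+//	r.Execute(ctx, log, "power action: on", taskID, "10.0.0.1", fn)
+//	record, _ := r.Status(ctx, taskID)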
+func (r *Runner) Execute(ctx context.Context, l logr.Logger, description, taskID, host string, action func(chan string) (string, error)) { + i := Ingest{ + ID: taskID, + Host: host, + Description: description, + Action: action, + } + r.Dispatcher.log = l + r.Dispatcher.IngestQueue.Enqueue(i) +} - messagesChan := make(chan string) - actionACK := make(chan bool, 1) - actionSyn := make(chan bool, 1) - defer close(messagesChan) - defer close(actionACK) - defer close(actionSyn) - repo := r.Repository +func (r *Runner) updateMessages(ctx context.Context, taskID, desc string, ch chan string) error { sessionRecord := repository.Record{ ID: taskID, - Description: description, + Description: desc, State: "running", Messages: []string{}, Error: &repository.Error{ @@ -80,60 +130,79 @@ func (r *Runner) worker(_ context.Context, logger logr.Logger, description, task Details: nil, }, } - - err := repo.Create(taskID, sessionRecord) + err := r.Repository.Create(taskID, sessionRecord) if err != nil { - // TODO how to handle unable to create record; ie network error, persistence error, etc? - logger.Error(err, "task complete", "complete", true) - return + return err } - - go func() { - for { - select { - case msg := <-messagesChan: - currStatus, _ := repo.Get(taskID) - sessionRecord.Messages = append(currStatus.Messages, msg) //nolint:gocritic // apparently this is the right slice - _ = repo.Update(taskID, sessionRecord) - case <-actionSyn: - actionACK <- true - return - default: + for { + select { + case <-ctx.Done(): + return nil + case msg := <-ch: + record, err := r.Repository.Get(taskID) + if err != nil { + return err + } + record.Messages = append(record.Messages, msg) + if err := r.Repository.Update(taskID, record); err != nil { + return err } - time.Sleep(10 * time.Millisecond) } + } +} + +// does the work, updates the repo record +// TODO handle retrys, use a timeout. 
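+// worker increments the active/total counters, streams progress messages to
+// the repository (record creation happens in updateMessages above), runs the
+// action, and persists the final state, result, and error on the task record.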
+func (r *Runner) worker(ctx context.Context, logger logr.Logger, description, taskID string, action func(chan string) (string, error)) { + logger = logger.WithValues("taskID", taskID, "description", description) + r.active.Add(1) + r.total.Add(1) + defer func() { + r.active.Add(-1) }() - sessionRecord.Result, err = action(messagesChan) - actionSyn <- true - <-actionACK - sessionRecord.State = "complete" - sessionRecord.Complete = true - var finalErr error + metrics.TasksTotal.Inc() + metrics.TasksActive.Inc() + defer metrics.TasksActive.Dec() + + messagesChan := make(chan string) + defer close(messagesChan) + go r.updateMessages(ctx, taskID, description, messagesChan) + + resultRecord := repository.Record{ + State: "complete", + Complete: true, + } + result, err := action(messagesChan) if err != nil { - finalErr = multierror.Append(finalErr, err) - sessionRecord.Result = "action failed" + resultRecord.Result = "action failed" re, ok := err.(*repository.Error) if ok { - sessionRecord.Error = re.StructuredError() + resultRecord.Error = re.StructuredError() } else { - sessionRecord.Error.Message = err.Error() + resultRecord.Error.Message = err.Error() } var foundErr *repository.Error if errors.As(err, &foundErr) { - sessionRecord.Error = foundErr.StructuredError() + resultRecord.Error = foundErr.StructuredError() } + logger.Error(err, "task completed with an error") } - // TODO handle unable to update record; ie network error, persistence error, etc - if err := repo.Update(taskID, sessionRecord); err != nil { - finalErr = multierror.Append(finalErr, err) + record, err := r.Repository.Get(taskID) + if err != nil { + return } - if finalErr != nil { - logger.Error(finalErr, "task complete", "complete", true) - } else { - logger.Info("task complete", "complete", true) + record.Complete = resultRecord.Complete + record.State = resultRecord.State + record.Result = result + record.Error = resultRecord.Error + + if err := r.Repository.Update(taskID, record); err != nil { + logger.Error(err, "failed to update record") } + + logger.Info("worker complete", "complete", true) } // Status returns the status record of a task. diff --git a/grpc/taskrunner/taskrunner_test.go b/grpc/taskrunner/taskrunner_test.go index 9b44330..c637fc3 100644 --- a/grpc/taskrunner/taskrunner_test.go +++ b/grpc/taskrunner/taskrunner_test.go @@ -32,7 +32,7 @@ func TestRoundTrip(t *testing.T) { } taskID := xid.New().String() - runner.Execute(ctx, logger, description, taskID, func(s chan string) (string, error) { + runner.Execute(ctx, logger, description, taskID, "123", func(s chan string) (string, error) { return "didnt do anything", defaultError }) diff --git a/pkg/task/task.go b/pkg/task/task.go index bcd63f3..8847b95 100644 --- a/pkg/task/task.go +++ b/pkg/task/task.go @@ -9,6 +9,6 @@ import ( // Task interface for doing BMC actions. type Task interface { - Execute(ctx context.Context, l logr.Logger, description, taskID string, action func(chan string) (string, error)) + Execute(ctx context.Context, l logr.Logger, description, taskID, host string, action func(chan string) (string, error)) Status(ctx context.Context, taskID string) (record repository.Record, err error) } From b9b43acd339ff40270a7049a82423a773d6f7e0a Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Fri, 25 Aug 2023 17:17:15 -0600 Subject: [PATCH 02/20] Update linting: Version was old. 
Signed-off-by: Jacob Weinstock --- .golangci.yml | 53 +++++++++++++++++++++++++++++++------------------- Makefile | 54 ++++++++++++++++++++++++++++----------------------- 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index f159c65..d3416a8 100755 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,3 +1,7 @@ +run: + # The default runtime timeout is 1m, which doesn't work well on Github Actions. + timeout: 4m + # NOTE: This file is populated by the lint-install tool. Local adjustments may be overwritten. linters-settings: cyclop: @@ -18,9 +22,19 @@ linters-settings: min-occurrences: 5 ignore-tests: true + gosec: + excludes: + - G107 # Potential HTTP request made with variable url + - G204 # Subprocess launched with function call as argument or cmd arguments + - G404 # Use of weak random number generator (math/rand instead of crypto/rand + errorlint: # these are still common in Go: for instance, exit errors. asserts: false + # Forcing %w in error wrapping forces authors to make errors part of their package APIs. The decision to make + # an error part of a package API should be a concious decision by the author. + # Also see Hyrums Law. + errorf: false exhaustive: default-signifies-exhaustive: true @@ -29,9 +43,9 @@ linters-settings: min-complexity: 8 nolintlint: - require-explanation: true - allow-unused: false - require-specific: true + require-explanation: true + allow-unused: false + require-specific: true revive: ignore-generated-header: true @@ -78,10 +92,10 @@ linters-settings: - name: waitgroup-by-value staticcheck: - go: "1.20" + go: "1.18" unused: - go: "1.20" + go: "1.18" output: sort-results: true @@ -96,8 +110,7 @@ linters: - dupl - durationcheck - errcheck - # errname is only available in golangci-lint v1.42.0+ - wait until v1.43 is available to settle - #- errname + - errname - errorlint - exhaustive - exportloopref @@ -107,6 +120,8 @@ linters: - gocritic - godot - gofmt + - gofumpt + - gosec - goheader - goimports - goprintffuncname @@ -123,11 +138,12 @@ linters: - nolintlint - predeclared # disabling for the initial iteration of the linting tool - #- promlinter + # - promlinter - revive - - rowserrcheck - - sqlclosecheck + # - rowserrcheck - disabled because of generics, https://github.com/golangci/golangci-lint/issues/2649 + # - sqlclosecheck - disabled because of generics, https://github.com/golangci/golangci-lint/issues/2649 - staticcheck + # - structcheck - disabled because of generics, https://github.com/golangci/golangci-lint/issues/2649 - stylecheck - thelper - tparallel @@ -135,7 +151,7 @@ linters: - unconvert - unparam - unused - - wastedassign + # - wastedassign - disabled because of generics, https://github.com/golangci/golangci-lint/issues/2649 - whitespace # Disabled linters, due to being misaligned with Go practices @@ -150,28 +166,25 @@ linters: # - nlreturn # - testpackage # - wsl - # Disabled linters, due to not being relevant to our code base: # - maligned # - prealloc "For most programs usage of prealloc will be a premature optimization." 
- # Disabled linters due to bad error messages or bugs - # - gofumpt - # - gosec # - tagliatelle - issues: # Excluding configuration per-path, per-linter, per-text and per-source exclude-rules: - path: _test\.go linters: - - gocyclo - - errcheck - dupl + - errcheck + - forcetypeassert + - gocyclo - gosec + - noctx - - path: cmd.* + - path: .*cmd.* linters: - noctx @@ -179,7 +192,7 @@ issues: linters: - noctx - - path: cmd.* + - path: .*cmd.* text: "deep-exit" - path: main\.go diff --git a/Makefile b/Makefile index c1530f3..d342952 100644 --- a/Makefile +++ b/Makefile @@ -101,14 +101,16 @@ pbs-docker-image: ## generate container image for building protocol buffers run-image: ## run PBnJ container image scripts/run-image.sh -# BEGIN: lint-install . +# BEGIN: lint-install github.com/tinkerbell/pbnj # http://github.com/tinkerbell/lint-install -GOLINT_VERSION ?= v1.52.2 -HADOLINT_VERSION ?= v2.7.0 -SHELLCHECK_VERSION ?= v0.7.2 -LINT_OS := $(shell uname) +.PHONY: lint +lint: _lint + LINT_ARCH := $(shell uname -m) +LINT_OS := $(shell uname) +LINT_OS_LOWER := $(shell echo $(LINT_OS) | tr '[:upper:]' '[:lower:]') +LINT_ROOT := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) # shellcheck and hadolint lack arm64 native binaries: rely on x86-64 emulation ifeq ($(LINT_OS),Darwin) @@ -117,30 +119,34 @@ ifeq ($(LINT_OS),Darwin) endif endif -LINT_LOWER_OS = $(shell echo $(LINT_OS) | tr '[:upper:]' '[:lower:]') -GOLINT_CONFIG:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))/.golangci.yml +LINTERS := +FIXERS := + +GOLANGCI_LINT_CONFIG := $(LINT_ROOT)/.golangci.yml +GOLANGCI_LINT_VERSION ?= v1.53.3 +GOLANGCI_LINT_BIN := $(LINT_ROOT)/out/linters/golangci-lint-$(GOLANGCI_LINT_VERSION)-$(LINT_ARCH) +$(GOLANGCI_LINT_BIN): + mkdir -p $(LINT_ROOT)/out/linters + rm -rf $(LINT_ROOT)/out/linters/golangci-lint-* + curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(LINT_ROOT)/out/linters $(GOLANGCI_LINT_VERSION) + mv $(LINT_ROOT)/out/linters/golangci-lint $@ + +LINTERS += golangci-lint-lint +golangci-lint-lint: $(GOLANGCI_LINT_BIN) + find . -name go.mod -execdir "$(GOLANGCI_LINT_BIN)" run -c "$(GOLANGCI_LINT_CONFIG)" \; -lint: out/linters/shellcheck-$(SHELLCHECK_VERSION)-$(LINT_ARCH)/shellcheck out/linters/hadolint-$(HADOLINT_VERSION)-$(LINT_ARCH) out/linters/golangci-lint-$(GOLINT_VERSION)-$(LINT_ARCH) - out/linters/golangci-lint-$(GOLINT_VERSION)-$(LINT_ARCH) run - out/linters/hadolint-$(HADOLINT_VERSION)-$(LINT_ARCH) -t info $(shell find . -name "*Dockerfile") - out/linters/shellcheck-$(SHELLCHECK_VERSION)-$(LINT_ARCH)/shellcheck $(shell find . -name "*.sh") +FIXERS += golangci-lint-fix +golangci-lint-fix: $(GOLANGCI_LINT_BIN) + find . 
-name go.mod -execdir "$(GOLANGCI_LINT_BIN)" run -c "$(GOLANGCI_LINT_CONFIG)" --fix \; -out/linters/shellcheck-$(SHELLCHECK_VERSION)-$(LINT_ARCH)/shellcheck: - mkdir -p out/linters - curl -sSfL https://github.com/koalaman/shellcheck/releases/download/$(SHELLCHECK_VERSION)/shellcheck-$(SHELLCHECK_VERSION).$(LINT_LOWER_OS).$(LINT_ARCH).tar.xz | tar -C out/linters -xJf - - mv out/linters/shellcheck-$(SHELLCHECK_VERSION) out/linters/shellcheck-$(SHELLCHECK_VERSION)-$(LINT_ARCH) +.PHONY: _lint $(LINTERS) +_lint: $(LINTERS) -out/linters/hadolint-$(HADOLINT_VERSION)-$(LINT_ARCH): - mkdir -p out/linters - curl -sfL https://github.com/hadolint/hadolint/releases/download/v2.6.1/hadolint-$(LINT_OS)-$(LINT_ARCH) > out/linters/hadolint-$(HADOLINT_VERSION)-$(LINT_ARCH) - chmod u+x out/linters/hadolint-$(HADOLINT_VERSION)-$(LINT_ARCH) +.PHONY: fix $(FIXERS) +fix: $(FIXERS) -out/linters/golangci-lint-$(GOLINT_VERSION)-$(LINT_ARCH): - mkdir -p out/linters - curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b out/linters $(GOLINT_VERSION) - mv out/linters/golangci-lint out/linters/golangci-lint-$(GOLINT_VERSION)-$(LINT_ARCH) +# END: lint-install github.com/tinkerbell/pbnj -# END: lint-install . ##@ Clients From 10e0e6509d45368245d755d60d0fcc5e3c67fd1e Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Fri, 25 Aug 2023 17:18:09 -0600 Subject: [PATCH 03/20] Fix linting and test failures: Fixes issue where the task is not found. This was happening because I was creating the record in the DB in a goroutine and mocking the action to immediately fail. This caused the worker to complete and close before any record could be created. Signed-off-by: Jacob Weinstock --- client/client.go | 2 +- grpc/rpc/bmc.go | 20 +++++------ grpc/rpc/bmc_test.go | 3 ++ grpc/rpc/machine.go | 10 +++--- grpc/rpc/machine_test.go | 4 +++ grpc/rpc/task_test.go | 4 +++ grpc/taskrunner/queue.go | 2 +- grpc/taskrunner/taskrunner.go | 55 ++++++++++++++++-------------- grpc/taskrunner/taskrunner_test.go | 9 ++--- pkg/http/http.go | 2 +- 10 files changed, 63 insertions(+), 48 deletions(-) diff --git a/client/client.go b/client/client.go index 3f37c96..af2a457 100644 --- a/client/client.go +++ b/client/client.go @@ -118,7 +118,7 @@ func Screenshot(ctx context.Context, client v1.DiagnosticClient, request *v1.Scr filename := fmt.Sprintf("%s.%s", time.Now().String(), screenshotResponse.Filetype) - if err := os.WriteFile(filename, screenshotResponse.Image, 0755); err != nil { + if err := os.WriteFile(filename, screenshotResponse.Image, 0o755); err != nil { //nolint:gosec // Can we make this 0600? 
return "", err } diff --git a/grpc/rpc/bmc.go b/grpc/rpc/bmc.go index 1e2ce7a..1a74627 100644 --- a/grpc/rpc/bmc.go +++ b/grpc/rpc/bmc.go @@ -45,7 +45,7 @@ func (b *BmcService) Reset(ctx context.Context, in *v1.ResetRequest) (*v1.ResetR l.Info( "start Reset request", - "username", in.Authn.GetDirectAuthn().GetUsername(), + "username", in.GetAuthn().GetDirectAuthn().GetUsername(), "vendor", in.Vendor.GetName(), "resetKind", in.GetResetKind().String(), ) @@ -66,20 +66,20 @@ func (b *BmcService) Reset(ctx context.Context, in *v1.ResetRequest) (*v1.ResetR defer cancel() return "", t.BMCReset(taskCtx, in.ResetKind.String()) } - b.TaskRunner.Execute(ctx, l, "bmc reset", taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) + b.TaskRunner.Execute(ctx, l, "bmc reset", taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) return &v1.ResetResponse{TaskId: taskID}, nil } // CreateUser sets the next boot device of a machine. -func (b *BmcService) CreateUser(ctx context.Context, in *v1.CreateUserRequest) (*v1.CreateUserResponse, error) { +func (b *BmcService) CreateUser(ctx context.Context, in *v1.CreateUserRequest) (*v1.CreateUserResponse, error) { //nolint:dupl // there is enough difference to not be a duplicate. l := logging.ExtractLogr(ctx) taskID := xid.New().String() l = l.WithValues("taskID", taskID) l.Info( "start CreateUser request", - "username", in.Authn.GetDirectAuthn().GetUsername(), + "username", in.GetAuthn().GetDirectAuthn().GetUsername(), "vendor", in.Vendor.GetName(), "userCreds.Username", in.UserCreds.Username, "userCreds.UserRole", in.UserCreds.UserRole, @@ -101,20 +101,20 @@ func (b *BmcService) CreateUser(ctx context.Context, in *v1.CreateUserRequest) ( defer cancel() return "", t.CreateUser(taskCtx) } - b.TaskRunner.Execute(ctx, l, "creating user", taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) + b.TaskRunner.Execute(ctx, l, "creating user", taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) return &v1.CreateUserResponse{TaskId: taskID}, nil } // UpdateUser updates a users credentials on a BMC. -func (b *BmcService) UpdateUser(ctx context.Context, in *v1.UpdateUserRequest) (*v1.UpdateUserResponse, error) { +func (b *BmcService) UpdateUser(ctx context.Context, in *v1.UpdateUserRequest) (*v1.UpdateUserResponse, error) { //nolint:dupl // there is enough difference to not be a duplicate. 
l := logging.ExtractLogr(ctx) taskID := xid.New().String() l = l.WithValues("taskID", taskID) l.Info( "start UpdateUser request", - "username", in.Authn.GetDirectAuthn().GetUsername(), + "username", in.GetAuthn().GetDirectAuthn().GetUsername(), "vendor", in.Vendor.GetName(), "userCreds.Username", in.UserCreds.Username, "userCreds.UserRole", in.UserCreds.UserRole, @@ -136,7 +136,7 @@ func (b *BmcService) UpdateUser(ctx context.Context, in *v1.UpdateUserRequest) ( defer cancel() return "", t.UpdateUser(taskCtx) } - b.TaskRunner.Execute(ctx, l, "updating user", taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) + b.TaskRunner.Execute(ctx, l, "updating user", taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) return &v1.UpdateUserResponse{TaskId: taskID}, nil } @@ -148,7 +148,7 @@ func (b *BmcService) DeleteUser(ctx context.Context, in *v1.DeleteUserRequest) ( l = l.WithValues("taskID", taskID) l.Info( "start DeleteUser request", - "username", in.Authn.GetDirectAuthn().GetUsername(), + "username", in.GetAuthn().GetDirectAuthn().GetUsername(), "vendor", in.Vendor.GetName(), "userCreds.Username", in.Username, ) @@ -169,7 +169,7 @@ func (b *BmcService) DeleteUser(ctx context.Context, in *v1.DeleteUserRequest) ( defer cancel() return "", t.DeleteUser(taskCtx) } - b.TaskRunner.Execute(ctx, l, "deleting user", taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) + b.TaskRunner.Execute(ctx, l, "deleting user", taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) return &v1.DeleteUserResponse{TaskId: taskID}, nil } diff --git a/grpc/rpc/bmc_test.go b/grpc/rpc/bmc_test.go index 5f009c5..c1cd2cf 100644 --- a/grpc/rpc/bmc_test.go +++ b/grpc/rpc/bmc_test.go @@ -43,7 +43,9 @@ func setup() { taskRunner = &taskrunner.Runner{ Repository: repo, Ctx: ctx, + Dispatcher: taskrunner.NewDispatcher(), } + go taskRunner.Start(ctx) bmcService = BmcService{ TaskRunner: taskRunner, UnimplementedBMCServer: v1.UnimplementedBMCServer{}, @@ -155,6 +157,7 @@ func TestReset(t *testing.T) { for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { + response, err := bmcService.Reset(ctx, tc.in) if err != nil { diff := cmp.Diff(tc.expectedErr.Error(), err.Error()) diff --git a/grpc/rpc/machine.go b/grpc/rpc/machine.go index 01007b7..69c1afa 100644 --- a/grpc/rpc/machine.go +++ b/grpc/rpc/machine.go @@ -31,7 +31,7 @@ func (m *MachineService) BootDevice(ctx context.Context, in *v1.DeviceRequest) ( l.Info( "start BootDevice request", - "username", in.Authn.GetDirectAuthn().GetUsername(), + "username", in.GetAuthn().GetDirectAuthn().GetUsername(), "vendor", in.Vendor.GetName(), "bootDevice", in.BootDevice.String(), "persistent", in.Persistent, @@ -54,7 +54,7 @@ func (m *MachineService) BootDevice(ctx context.Context, in *v1.DeviceRequest) ( defer cancel() return mbd.BootDeviceSet(taskCtx, in.BootDevice.String(), in.Persistent, in.EfiBoot) } - m.TaskRunner.Execute(ctx, l, "setting boot device", taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) + m.TaskRunner.Execute(ctx, l, "setting boot device", taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) return &v1.DeviceResponse{TaskId: taskID}, nil } @@ -63,10 +63,10 @@ func (m *MachineService) BootDevice(ctx context.Context, in *v1.DeviceRequest) ( func (m *MachineService) Power(ctx context.Context, in *v1.PowerRequest) (*v1.PowerResponse, error) { l := logging.ExtractLogr(ctx) taskID := xid.New().String() - l = l.WithValues("taskID", taskID, "bmcIP", 
in.Authn.GetDirectAuthn().GetHost().GetHost()) + l = l.WithValues("taskID", taskID, "bmcIP", in.GetAuthn().GetDirectAuthn().GetHost().GetHost()) l.Info( "start Power request", - "username", in.Authn.GetDirectAuthn().GetUsername(), + "username", in.GetAuthn().GetDirectAuthn().GetUsername(), "vendor", in.Vendor.GetName(), "powerAction", in.GetPowerAction().String(), "softTimeout", in.SoftTimeout, @@ -89,7 +89,7 @@ func (m *MachineService) Power(ctx context.Context, in *v1.PowerRequest) (*v1.Po defer cancel() return mp.PowerSet(taskCtx, in.PowerAction.String()) } - m.TaskRunner.Execute(ctx, l, "power action: "+in.GetPowerAction().String(), taskID, in.Authn.GetDirectAuthn().GetHost().Host, execFunc) + m.TaskRunner.Execute(ctx, l, "power action: "+in.GetPowerAction().String(), taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) return &v1.PowerResponse{TaskId: taskID}, nil } diff --git a/grpc/rpc/machine_test.go b/grpc/rpc/machine_test.go index 0fe8fe5..55f273a 100644 --- a/grpc/rpc/machine_test.go +++ b/grpc/rpc/machine_test.go @@ -69,7 +69,9 @@ func TestDevice(t *testing.T) { taskRunner := &taskrunner.Runner{ Repository: repo, Ctx: ctx, + Dispatcher: taskrunner.NewDispatcher(), } + go taskRunner.Start(ctx) machineSvc := MachineService{ TaskRunner: taskRunner, } @@ -168,7 +170,9 @@ func TestPower(t *testing.T) { taskRunner := &taskrunner.Runner{ Repository: repo, Ctx: ctx, + Dispatcher: taskrunner.NewDispatcher(), } + go taskRunner.Start(ctx) machineSvc := MachineService{ TaskRunner: taskRunner, } diff --git a/grpc/rpc/task_test.go b/grpc/rpc/task_test.go index f8533b0..ba7c0ca 100644 --- a/grpc/rpc/task_test.go +++ b/grpc/rpc/task_test.go @@ -32,7 +32,9 @@ func TestTaskFound(t *testing.T) { taskRunner := &taskrunner.Runner{ Repository: repo, Ctx: ctx, + Dispatcher: taskrunner.NewDispatcher(), } + go taskRunner.Start(ctx) taskID := xid.New().String() taskRunner.Execute(ctx, logger, "test", taskID, "123", func(s chan string) (string, error) { return "doing cool stuff", defaultError @@ -84,7 +86,9 @@ func TestRecordNotFound(t *testing.T) { taskRunner := &taskrunner.Runner{ Repository: repo, Ctx: ctx, + Dispatcher: taskrunner.NewDispatcher(), } + go taskRunner.Start(ctx) taskSvc := TaskService{ TaskRunner: taskRunner, } diff --git a/grpc/taskrunner/queue.go b/grpc/taskrunner/queue.go index 7e70713..63d3d31 100644 --- a/grpc/taskrunner/queue.go +++ b/grpc/taskrunner/queue.go @@ -24,7 +24,7 @@ func NewIngestQueue() *IngestQueue { } } -// Enqueue inserts the item into the queue +// Enqueue inserts the item into the queue. 
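+// It is safe for concurrent callers; access is serialized by the mutex.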
func (i *IngestQueue) Enqueue(item Ingest) { i.mu.Lock() defer i.mu.Unlock() diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index 1f8dd5b..661ba45 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -60,22 +60,22 @@ func (r *Runner) TotalWorkers() int { return int(r.total.Load()) } -func (d *Runner) Start(ctx context.Context) { +func (r *Runner) Start(ctx context.Context) { for { select { case <-ctx.Done(): return default: - elem, err := d.Dispatcher.IngestQueue.Dequeue() + elem, err := r.Dispatcher.IngestQueue.Dequeue() if err != nil { break } ch := make(chan Ingest) - v, loaded := d.Dispatcher.perID.LoadOrStore(elem.Host, taskChannel{ch: ch}) + v, loaded := r.Dispatcher.perID.LoadOrStore(elem.Host, taskChannel{ch: ch}) if !loaded { - go d.worker1(ctx, elem.Host, ch) - d.Dispatcher.goroutinePerID.Add(1) + go r.worker1(ctx, elem.Host, ch) + r.Dispatcher.goroutinePerID.Add(1) } v.(taskChannel).ch <- elem } @@ -85,12 +85,12 @@ func (d *Runner) Start(ctx context.Context) { // channelWorker is a worker that listens on a channel for jobs. // It will shutdown the worker after gc duration of no elements in the channel or the context is canceled. // worker is in charge of its own lifecycle. -func (q *Runner) worker1(ctx context.Context, id string, ch chan Ingest) { +func (r *Runner) worker1(ctx context.Context, id string, ch chan Ingest) { defer func() { // do i need to delete the channel if i delete the map entry it lives in? // close(ch) - q.Dispatcher.perID.Delete(id) - q.Dispatcher.goroutinePerID.Add(-1) + r.Dispatcher.perID.Delete(id) + r.Dispatcher.goroutinePerID.Add(-1) }() for { select { @@ -98,8 +98,8 @@ func (q *Runner) worker1(ctx context.Context, id string, ch chan Ingest) { return case t := <-ch: // execute the task synchronously - q.worker(ctx, q.Dispatcher.log, t.Description, t.ID, t.Action) - case <-time.After(q.Dispatcher.idleWorkerShutdown): + r.worker(ctx, r.Dispatcher.log, t.Description, t.ID, t.Action) + case <-time.After(r.Dispatcher.idleWorkerShutdown): // shutdown the worker after this duration of no elements in the channel. return } @@ -107,33 +107,21 @@ func (q *Runner) worker1(ctx context.Context, id string, ch chan Ingest) { } // Execute a task, update repository with status. 
-func (r *Runner) Execute(ctx context.Context, l logr.Logger, description, taskID, host string, action func(chan string) (string, error)) { +func (r *Runner) Execute(_ context.Context, l logr.Logger, description, taskID, host string, action func(chan string) (string, error)) { i := Ingest{ ID: taskID, Host: host, Description: description, Action: action, } + if r.Dispatcher == nil { + r.Dispatcher = NewDispatcher() + } r.Dispatcher.log = l r.Dispatcher.IngestQueue.Enqueue(i) } func (r *Runner) updateMessages(ctx context.Context, taskID, desc string, ch chan string) error { - sessionRecord := repository.Record{ - ID: taskID, - Description: desc, - State: "running", - Messages: []string{}, - Error: &repository.Error{ - Code: 0, - Message: "", - Details: nil, - }, - } - err := r.Repository.Create(taskID, sessionRecord) - if err != nil { - return err - } for { select { case <-ctx.Done(): @@ -167,6 +155,21 @@ func (r *Runner) worker(ctx context.Context, logger logr.Logger, description, ta messagesChan := make(chan string) defer close(messagesChan) + sessionRecord := repository.Record{ + ID: taskID, + Description: description, + State: "running", + Messages: []string{}, + Error: &repository.Error{ + Code: 0, + Message: "", + Details: nil, + }, + } + err := r.Repository.Create(taskID, sessionRecord) + if err != nil { + return + } go r.updateMessages(ctx, taskID, description, messagesChan) resultRecord := repository.Record{ diff --git a/grpc/taskrunner/taskrunner_test.go b/grpc/taskrunner/taskrunner_test.go index c637fc3..0c5add9 100644 --- a/grpc/taskrunner/taskrunner_test.go +++ b/grpc/taskrunner/taskrunner_test.go @@ -29,16 +29,17 @@ func TestRoundTrip(t *testing.T) { runner := Runner{ Repository: repo, Ctx: ctx, + Dispatcher: NewDispatcher(), } + go runner.Start(ctx) taskID := xid.New().String() - runner.Execute(ctx, logger, description, taskID, "123", func(s chan string) (string, error) { - return "didnt do anything", defaultError - }) - if len(taskID) != 20 { t.Fatalf("expected id of length 20, got: %v (%v)", len(taskID), taskID) } + runner.Execute(ctx, logger, description, taskID, "123", func(s chan string) (string, error) { + return "didnt do anything", defaultError + }) // must be min of 3 because we sleep 2 seconds in worker function to allow final status messages to be written time.Sleep(500 * time.Millisecond) diff --git a/pkg/http/http.go b/pkg/http/http.go index f43ea76..54007cf 100644 --- a/pkg/http/http.go +++ b/pkg/http/http.go @@ -34,7 +34,7 @@ func (h *Server) init() { } func (h *Server) Run() error { - return http.ListenAndServe(h.address, h.mux) + return http.ListenAndServe(h.address, h.mux) //nolint:gosec // TODO: add handle timeouts } func NewServer(addr string) *Server { From a14862818975a257535ba85ff06e767c064bb892 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Sun, 27 Aug 2023 21:04:18 -0600 Subject: [PATCH 04/20] Add elastic worker pool Signed-off-by: Jacob Weinstock --- go.mod | 5 ++ go.sum | 16 +++++ grpc/server.go | 3 + grpc/taskrunner/queue.go | 34 ++++------ grpc/taskrunner/taskrunner.go | 124 ++++++++++++++++++++++++---------- 5 files changed, 125 insertions(+), 57 deletions(-) diff --git a/go.mod b/go.mod index e6b5afd..14d0e2d 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/tinkerbell/pbnj go 1.20 require ( + github.com/adrianbrad/queue v1.2.1 github.com/bmc-toolbox/bmclib v0.5.7 github.com/bmc-toolbox/bmclib/v2 v2.0.1-0.20230515164712-2714c7479477 github.com/cristalhq/jwt/v3 v3.1.0 @@ -17,6 +18,8 @@ require ( 
github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 github.com/hashicorp/go-multierror v1.1.1 + github.com/jedib0t/go-pretty/v6 v6.4.6 + github.com/lnquy/elastic-worker-pool v0.0.1 github.com/manifoldco/promptui v0.9.0 github.com/mwitkow/go-proto-validators v0.3.2 github.com/onsi/gomega v1.27.6 @@ -66,6 +69,7 @@ require ( github.com/magiconair/properties v1.8.7 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.17 // indirect + github.com/mattn/go-runewidth v0.0.13 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/pelletier/go-toml/v2 v2.0.6 // indirect @@ -74,6 +78,7 @@ require ( github.com/prometheus/client_model v0.3.0 // indirect github.com/prometheus/common v0.42.0 // indirect github.com/prometheus/procfs v0.9.0 // indirect + github.com/rivo/uniseg v0.2.0 // indirect github.com/satori/go.uuid v1.2.0 // indirect github.com/sirupsen/logrus v1.9.0 // indirect github.com/spf13/afero v1.9.3 // indirect diff --git a/go.sum b/go.sum index 9a77975..79aafec 100644 --- a/go.sum +++ b/go.sum @@ -46,6 +46,8 @@ github.com/VictorLowther/simplexml v0.0.0-20180716164440-0bff93621230 h1:t95Grn2 github.com/VictorLowther/simplexml v0.0.0-20180716164440-0bff93621230/go.mod h1:t2EzW1qybnPDQ3LR/GgeF0GOzHUXT5IVMLP2gkW1cmc= github.com/VictorLowther/soap v0.0.0-20150314151524-8e36fca84b22 h1:a0MBqYm44o0NcthLKCljZHe1mxlN6oahCQHHThnSwB4= github.com/VictorLowther/soap v0.0.0-20150314151524-8e36fca84b22/go.mod h1:/B7V22rcz4860iDqstGvia/2+IYWXf3/JdQCVd/1D2A= +github.com/adrianbrad/queue v1.2.1 h1:CEVsjFQyuR0s5Hty/HJGWBZHsJ3KMmii0kEgLeam/mk= +github.com/adrianbrad/queue v1.2.1/go.mod h1:wYiPC/3MPbyT45QHLrPR4zcqJWPePubM1oEP/xTwhUs= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -238,6 +240,8 @@ github.com/jacobweinstock/iamt v0.0.0-20230502042727-d7cdbe67d9ef h1:G4k02HGmBUf github.com/jacobweinstock/iamt v0.0.0-20230502042727-d7cdbe67d9ef/go.mod h1:FgmiLTU6cJewV4Xgrq6m5o8CUlTQOJtqzaFLGA0mG+E= github.com/jacobweinstock/registrar v0.4.7 h1:s4dOExccgD+Pc7rJC+f3Mc3D+NXHcXUaOibtcEsPxOc= github.com/jacobweinstock/registrar v0.4.7/go.mod h1:PWmkdGFG5/ZdCqgMo7pvB3pXABOLHc5l8oQ0sgmBNDU= +github.com/jedib0t/go-pretty/v6 v6.4.6 h1:v6aG9h6Uby3IusSSEjHaZNXpHFhzqMmjXcPq1Rjl9Jw= +github.com/jedib0t/go-pretty/v6 v6.4.6/go.mod h1:Ndk3ase2CkQbXLLNf5QDHoYb6J9WtVfmHZu9n8rk2xs= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= @@ -257,6 +261,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/leodido/go-urn v1.2.1 h1:BqpAaACuzVSgi/VLzGZIobT2z4v53pjosyNd9Yv6n/w= github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY= +github.com/lnquy/elastic-worker-pool v0.0.1 h1:XmFImqsFAVkqPAb+BvAtdqn32lL/D445AlUn+r36MwI= +github.com/lnquy/elastic-worker-pool v0.0.1/go.mod h1:yw7YmwSJ1mlY7w8qLvhsXegi3gp4s19XmocsmlViyv4= 
github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/manifoldco/promptui v0.9.0 h1:3V4HzJk1TtXW1MTZMP7mdlwbBpIinw3HztaIlYthEiA= @@ -268,6 +274,8 @@ github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27k github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU= +github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= @@ -298,6 +306,7 @@ github.com/philippgille/gokv/util v0.6.0/go.mod h1:ovoDHZ2Svr7YX972SPPJQRXbhHEy3 github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/profile v1.6.0/go.mod h1:qBsxPvzyUincmltOk6iyRVxHYg4adc0OFOv72ZdLa18= github.com/pkg/sftp v1.13.1/go.mod h1:3HaPG6Dq1ILlpPZRO0HVMrsydcdLt6HRDccSgb87qRg= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -310,6 +319,8 @@ github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= +github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= @@ -321,6 +332,7 @@ github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= +github.com/sirupsen/logrus v1.4.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= @@ -351,6 +363,7 @@ github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify 
v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.4/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= @@ -395,6 +408,7 @@ go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.18.1/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= goa.design/goa v2.2.5+incompatible h1:mjAtiy7ZdZIkj974hpFxCR6bL69qprfV00Veu3Vybts= goa.design/goa v2.2.5+incompatible/go.mod h1:NnzBwdNktihbNek+pPiFMQP9PPFsUt8MMPPyo9opDSo= +golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -499,6 +513,7 @@ golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181122145206-62eef0e2fa9b/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -546,6 +561,7 @@ golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= diff --git a/grpc/server.go b/grpc/server.go index 0e547cb..7a4deeb 100644 --- a/grpc/server.go +++ b/grpc/server.go @@ -143,6 +143,9 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po log.Info("ctx cancelled, shutting down PBnJ") grpcServer.GracefulStop() }() + defer func() { + log.Info("stats", "stats", taskRunner.GetStatistics(), "totalProcesses", taskRunner.Dispatcher.TotalProcessed.Load()) + }() log.Info("starting PBnJ gRPC server") return grpcServer.Serve(listen) diff --git a/grpc/taskrunner/queue.go b/grpc/taskrunner/queue.go index 63d3d31..8b12f8a 100644 --- a/grpc/taskrunner/queue.go +++ b/grpc/taskrunner/queue.go @@ -1,44 +1,36 @@ package taskrunner 
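+// NOTE: the mutex-guarded slice queue is replaced here by a blocking queue
+// from github.com/adrianbrad/queue: Enqueue and Dequeue now wrap OfferWait
+// and GetWait, so Dequeue blocks until an item is available instead of
+// returning a "queue is empty" error.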
import ( - "sync" - - "github.com/pkg/errors" + "github.com/adrianbrad/queue" + "github.com/go-logr/logr" ) type IngestQueue struct { - mu sync.Mutex - queue []Ingest + q *queue.Blocking[*Ingest] } type Ingest struct { - ID string - Host string - Description string - Action func(chan string) (string, error) + ID string `json:"id"` + Host string `json:"host"` + Description string `json:"description"` + Action func(chan string) (string, error) `json:"-"` + Log logr.Logger `json:"-"` } func NewIngestQueue() *IngestQueue { return &IngestQueue{ - queue: []Ingest{}, + q: queue.NewBlocking([]*Ingest{}, queue.WithCapacity(10000)), } } // Enqueue inserts the item into the queue. func (i *IngestQueue) Enqueue(item Ingest) { - i.mu.Lock() - defer i.mu.Unlock() - i.queue = append(i.queue, item) + i.q.OfferWait(&item) } // Dequeue removes the oldest element from the queue. FIFO. func (i *IngestQueue) Dequeue() (Ingest, error) { - i.mu.Lock() - defer i.mu.Unlock() - if len(i.queue) > 0 { - item := i.queue[0] - i.queue = i.queue[1:] - return item, nil - } - return Ingest{}, errors.New("queue is empty") + item := i.q.GetWait() + + return *item, nil } diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index 661ba45..7624b51 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -4,14 +4,18 @@ import ( "context" "net" "net/url" + "os" "sync" "sync/atomic" "syscall" "time" "github.com/go-logr/logr" + "github.com/go-logr/zerologr" "github.com/pkg/errors" + "github.com/rs/zerolog" + ewp "github.com/lnquy/elastic-worker-pool" "github.com/tinkerbell/pbnj/pkg/metrics" "github.com/tinkerbell/pbnj/pkg/repository" ) @@ -32,21 +36,25 @@ type dispatcher struct { // perID hold a queue per ID. // jobs across different IDs are processed concurrently. // jobs with the same ID are processed synchronously. - perID sync.Map - goroutinePerID atomic.Int32 - idleWorkerShutdown time.Duration - log logr.Logger + perID sync.Map + maxWorkers int32 + TotalProcessed atomic.Int32 + emptyMsg atomic.Int32 + pool *ewp.ElasticWorkerPool } -type taskChannel struct { - ch chan Ingest +func NewRunner(repo repository.Actions) *Runner { + return &Runner{ + Repository: repo, + Dispatcher: NewDispatcher(), + } } func NewDispatcher() *dispatcher { return &dispatcher{ - IngestQueue: NewIngestQueue(), - perID: sync.Map{}, - idleWorkerShutdown: time.Second * 10, + IngestQueue: NewIngestQueue(), + perID: sync.Map{}, + maxWorkers: 500, } } @@ -60,10 +68,55 @@ func (r *Runner) TotalWorkers() int { return int(r.total.Load()) } +func (r *Runner) startWorkerPool(min, max int32) *ewp.ElasticWorkerPool { + ewpConfig := ewp.Config{ + MinWorker: min, + MaxWorker: max, + PoolControlInterval: time.Second, + BufferLength: 1000, + } + myPool, _ := ewp.New(ewpConfig, nil, nil) // error is always nil + myPool.Start() + + return myPool +} + +func (r *Runner) TotalProcessed(ctx context.Context, log logr.Logger) { + ticker := time.NewTicker(1 * time.Second) + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + log.Info("total processed", "total", r.Dispatcher.TotalProcessed.Load()) + } + } +} + +// defaultLogger is a zerolog logr implementation. 
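+// As of this patch, Execute attaches this logger to every ingested job in
+// place of the caller-supplied one.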
+func defaultLogger() logr.Logger { + zl := zerolog.New(os.Stdout) + zl = zl.With().Caller().Timestamp().Logger() + zl = zl.Level(zerolog.DebugLevel) + + return zerologr.New(&zl) +} + +func (r *Runner) GetStatistics() *ewp.Statistics { + return r.Dispatcher.pool.GetStatistics() +} + func (r *Runner) Start(ctx context.Context) { + if r.Dispatcher == nil { + r.Dispatcher = NewDispatcher() + } + //go r.TotalProcessed(ctx, defaultLogger()) + myPool := r.startWorkerPool(10, 1000) + r.Dispatcher.pool = myPool for { select { case <-ctx.Done(): + myPool.Close() return default: elem, err := r.Dispatcher.IngestQueue.Dequeue() @@ -71,13 +124,16 @@ func (r *Runner) Start(ctx context.Context) { break } - ch := make(chan Ingest) - v, loaded := r.Dispatcher.perID.LoadOrStore(elem.Host, taskChannel{ch: ch}) - if !loaded { - go r.worker1(ctx, elem.Host, ch) - r.Dispatcher.goroutinePerID.Add(1) + perIDQueue := NewIngestQueue() + perIDQueue.Enqueue(elem) + v, loaded := r.Dispatcher.perID.LoadOrStore(elem.Host, perIDQueue) + if loaded { + v.(*IngestQueue).Enqueue(elem) } - v.(taskChannel).ch <- elem + myPool.Enqueue(func() { + r.worker1(ctx, elem.Host) + r.Dispatcher.TotalProcessed.Add(1) + }) } } } @@ -85,24 +141,14 @@ func (r *Runner) Start(ctx context.Context) { // channelWorker is a worker that listens on a channel for jobs. // It will shutdown the worker after gc duration of no elements in the channel or the context is canceled. // worker is in charge of its own lifecycle. -func (r *Runner) worker1(ctx context.Context, id string, ch chan Ingest) { - defer func() { - // do i need to delete the channel if i delete the map entry it lives in? - // close(ch) - r.Dispatcher.perID.Delete(id) - r.Dispatcher.goroutinePerID.Add(-1) - }() - for { - select { - case <-ctx.Done(): - return - case t := <-ch: - // execute the task synchronously - r.worker(ctx, r.Dispatcher.log, t.Description, t.ID, t.Action) - case <-time.After(r.Dispatcher.idleWorkerShutdown): - // shutdown the worker after this duration of no elements in the channel. 
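+// In the pool-based version below, worker1 no longer owns a channel: it looks
+// up this host's queue in perID (stored there by Start) and drains one job.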
+func (r *Runner) worker1(ctx context.Context, id string) { + elem, ok := r.Dispatcher.perID.Load(id) + if ok { + t, err := elem.(*IngestQueue).Dequeue() + if err != nil { return } + r.worker(ctx, t.Log, t.Description, t.ID, t.Action) } } @@ -113,11 +159,9 @@ func (r *Runner) Execute(_ context.Context, l logr.Logger, description, taskID, Host: host, Description: description, Action: action, + Log: defaultLogger(), } - if r.Dispatcher == nil { - r.Dispatcher = NewDispatcher() - } - r.Dispatcher.log = l + r.Dispatcher.IngestQueue.Enqueue(i) } @@ -170,11 +214,19 @@ func (r *Runner) worker(ctx context.Context, logger logr.Logger, description, ta if err != nil { return } - go r.updateMessages(ctx, taskID, description, messagesChan) + cctx, done := context.WithCancel(ctx) + defer done() + go r.updateMessages(cctx, taskID, description, messagesChan) + logger.Info("worker start") resultRecord := repository.Record{ State: "complete", Complete: true, + Error: &repository.Error{ + Code: 0, + Message: "", + Details: nil, + }, } result, err := action(messagesChan) if err != nil { From d4597418e67b79559c026c01c4ac17a9f10d2d7c Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Sun, 27 Aug 2023 21:14:29 -0600 Subject: [PATCH 05/20] Fix linting issues Signed-off-by: Jacob Weinstock --- grpc/rpc/bmc_test.go | 13 +++----- grpc/rpc/machine_test.go | 20 ++++-------- grpc/rpc/task_test.go | 22 ++++--------- grpc/server.go | 18 ++++------- grpc/taskrunner/taskrunner.go | 51 ++++++++---------------------- grpc/taskrunner/taskrunner_test.go | 2 +- 6 files changed, 39 insertions(+), 87 deletions(-) diff --git a/grpc/rpc/bmc_test.go b/grpc/rpc/bmc_test.go index c1cd2cf..c1df518 100644 --- a/grpc/rpc/bmc_test.go +++ b/grpc/rpc/bmc_test.go @@ -20,7 +20,7 @@ const tempIPMITool = "/tmp/ipmitool" var ( ctx context.Context - taskRunner *taskrunner.Runner + tr *taskrunner.Runner bmcService BmcService ) @@ -40,14 +40,10 @@ func setup() { Ctx: ctx, } - taskRunner = &taskrunner.Runner{ - Repository: repo, - Ctx: ctx, - Dispatcher: taskrunner.NewDispatcher(), - } - go taskRunner.Start(ctx) + tr = taskrunner.NewRunner(repo) + go tr.Start(ctx) bmcService = BmcService{ - TaskRunner: taskRunner, + TaskRunner: tr, UnimplementedBMCServer: v1.UnimplementedBMCServer{}, } _, err := exec.LookPath("ipmitool") @@ -157,7 +153,6 @@ func TestReset(t *testing.T) { for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { - response, err := bmcService.Reset(ctx, tc.in) if err != nil { diff := cmp.Diff(tc.expectedErr.Error(), err.Error()) diff --git a/grpc/rpc/machine_test.go b/grpc/rpc/machine_test.go index 55f273a..f10d534 100644 --- a/grpc/rpc/machine_test.go +++ b/grpc/rpc/machine_test.go @@ -66,14 +66,10 @@ func TestDevice(t *testing.T) { Ctx: ctx, } - taskRunner := &taskrunner.Runner{ - Repository: repo, - Ctx: ctx, - Dispatcher: taskrunner.NewDispatcher(), - } - go taskRunner.Start(ctx) + tr := taskrunner.NewRunner(repo) + go tr.Start(ctx) machineSvc := MachineService{ - TaskRunner: taskRunner, + TaskRunner: tr, } response, err := machineSvc.BootDevice(ctx, testCase.req) @@ -167,14 +163,10 @@ func TestPower(t *testing.T) { Ctx: ctx, } - taskRunner := &taskrunner.Runner{ - Repository: repo, - Ctx: ctx, - Dispatcher: taskrunner.NewDispatcher(), - } - go taskRunner.Start(ctx) + tr := taskrunner.NewRunner(repo) + go tr.Start(ctx) machineSvc := MachineService{ - TaskRunner: taskRunner, + TaskRunner: tr, } response, err := machineSvc.Power(ctx, testCase.req) diff --git a/grpc/rpc/task_test.go b/grpc/rpc/task_test.go index 
ba7c0ca..f256473 100644 --- a/grpc/rpc/task_test.go +++ b/grpc/rpc/task_test.go @@ -29,21 +29,17 @@ func TestTaskFound(t *testing.T) { s := gokv.Store(f) repo := &persistence.GoKV{Store: s, Ctx: ctx} - taskRunner := &taskrunner.Runner{ - Repository: repo, - Ctx: ctx, - Dispatcher: taskrunner.NewDispatcher(), - } - go taskRunner.Start(ctx) + tr := taskrunner.NewRunner(repo) + go tr.Start(ctx) taskID := xid.New().String() - taskRunner.Execute(ctx, logger, "test", taskID, "123", func(s chan string) (string, error) { + tr.Execute(ctx, logger, "test", taskID, "123", func(s chan string) (string, error) { return "doing cool stuff", defaultError }) taskReq := &v1.StatusRequest{TaskId: taskID} taskSvc := TaskService{ - TaskRunner: taskRunner, + TaskRunner: tr, } time.Sleep(10 * time.Millisecond) @@ -83,14 +79,10 @@ func TestRecordNotFound(t *testing.T) { s := gokv.Store(f) repo := &persistence.GoKV{Store: s, Ctx: ctx} - taskRunner := &taskrunner.Runner{ - Repository: repo, - Ctx: ctx, - Dispatcher: taskrunner.NewDispatcher(), - } - go taskRunner.Start(ctx) + tr := taskrunner.NewRunner(repo) + go tr.Start(ctx) taskSvc := TaskService{ - TaskRunner: taskRunner, + TaskRunner: tr, } response, err := taskSvc.Status(ctx, testCase.req) diff --git a/grpc/server.go b/grpc/server.go index 7a4deeb..e77d806 100644 --- a/grpc/server.go +++ b/grpc/server.go @@ -78,21 +78,17 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po opt(defaultServer) } - taskRunner := &taskrunner.Runner{ - Repository: defaultServer.Actions, - Ctx: ctx, - Dispatcher: taskrunner.NewDispatcher(), - } - go taskRunner.Start(ctx) + tr := taskrunner.NewRunner(repo) + go tr.Start(ctx) ms := rpc.MachineService{ - TaskRunner: taskRunner, + TaskRunner: tr, Timeout: defaultServer.bmcTimeout, } v1.RegisterMachineServer(grpcServer, &ms) bs := rpc.BmcService{ - TaskRunner: taskRunner, + TaskRunner: tr, Timeout: defaultServer.bmcTimeout, SkipRedfishVersions: defaultServer.skipRedfishVersions, } @@ -102,7 +98,7 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po v1.RegisterDiagnosticServer(grpcServer, &ds) ts := rpc.TaskService{ - TaskRunner: taskRunner, + TaskRunner: tr, } v1.RegisterTaskServer(grpcServer, &ts) @@ -116,7 +112,7 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po return err } - httpServer.WithTaskRunner(taskRunner) + httpServer.WithTaskRunner(tr) reflection.Register(grpcServer) go func() { @@ -144,7 +140,7 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po grpcServer.GracefulStop() }() defer func() { - log.Info("stats", "stats", taskRunner.GetStatistics(), "totalProcesses", taskRunner.Dispatcher.TotalProcessed.Load()) + log.Info("stats", "stats", tr.GetStatistics(), "totalProcesses", tr.Dispatcher.TotalProcessed.Load()) }() log.Info("starting PBnJ gRPC server") diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index 7624b51..7b67869 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -4,16 +4,13 @@ import ( "context" "net" "net/url" - "os" "sync" "sync/atomic" "syscall" "time" "github.com/go-logr/logr" - "github.com/go-logr/zerologr" "github.com/pkg/errors" - "github.com/rs/zerolog" ewp "github.com/lnquy/elastic-worker-pool" "github.com/tinkerbell/pbnj/pkg/metrics" @@ -39,18 +36,17 @@ type dispatcher struct { perID sync.Map maxWorkers int32 TotalProcessed atomic.Int32 - emptyMsg atomic.Int32 pool *ewp.ElasticWorkerPool } func NewRunner(repo 
repository.Actions) *Runner { return &Runner{ Repository: repo, - Dispatcher: NewDispatcher(), + Dispatcher: newDispatcher(), } } -func NewDispatcher() *dispatcher { +func newDispatcher() *dispatcher { return &dispatcher{ IngestQueue: NewIngestQueue(), perID: sync.Map{}, @@ -81,36 +77,14 @@ func (r *Runner) startWorkerPool(min, max int32) *ewp.ElasticWorkerPool { return myPool } -func (r *Runner) TotalProcessed(ctx context.Context, log logr.Logger) { - ticker := time.NewTicker(1 * time.Second) - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - log.Info("total processed", "total", r.Dispatcher.TotalProcessed.Load()) - } - } -} - -// defaultLogger is a zerolog logr implementation. -func defaultLogger() logr.Logger { - zl := zerolog.New(os.Stdout) - zl = zl.With().Caller().Timestamp().Logger() - zl = zl.Level(zerolog.DebugLevel) - - return zerologr.New(&zl) -} - func (r *Runner) GetStatistics() *ewp.Statistics { return r.Dispatcher.pool.GetStatistics() } func (r *Runner) Start(ctx context.Context) { if r.Dispatcher == nil { - r.Dispatcher = NewDispatcher() + r.Dispatcher = newDispatcher() } - //go r.TotalProcessed(ctx, defaultLogger()) myPool := r.startWorkerPool(10, 1000) r.Dispatcher.pool = myPool for { @@ -128,9 +102,12 @@ func (r *Runner) Start(ctx context.Context) { perIDQueue.Enqueue(elem) v, loaded := r.Dispatcher.perID.LoadOrStore(elem.Host, perIDQueue) if loaded { - v.(*IngestQueue).Enqueue(elem) + b, ok := v.(*IngestQueue) + if ok { + b.Enqueue(elem) + } } - myPool.Enqueue(func() { + _ = myPool.Enqueue(func() { r.worker1(ctx, elem.Host) r.Dispatcher.TotalProcessed.Add(1) }) @@ -159,25 +136,25 @@ func (r *Runner) Execute(_ context.Context, l logr.Logger, description, taskID, Host: host, Description: description, Action: action, - Log: defaultLogger(), + Log: l, } r.Dispatcher.IngestQueue.Enqueue(i) } -func (r *Runner) updateMessages(ctx context.Context, taskID, desc string, ch chan string) error { +func (r *Runner) updateMessages(ctx context.Context, taskID string, ch chan string) { for { select { case <-ctx.Done(): - return nil + return case msg := <-ch: record, err := r.Repository.Get(taskID) if err != nil { - return err + return } record.Messages = append(record.Messages, msg) if err := r.Repository.Update(taskID, record); err != nil { - return err + return } } } @@ -216,7 +193,7 @@ func (r *Runner) worker(ctx context.Context, logger logr.Logger, description, ta } cctx, done := context.WithCancel(ctx) defer done() - go r.updateMessages(cctx, taskID, description, messagesChan) + go r.updateMessages(cctx, taskID, messagesChan) logger.Info("worker start") resultRecord := repository.Record{ diff --git a/grpc/taskrunner/taskrunner_test.go b/grpc/taskrunner/taskrunner_test.go index 0c5add9..f3a7ed9 100644 --- a/grpc/taskrunner/taskrunner_test.go +++ b/grpc/taskrunner/taskrunner_test.go @@ -29,7 +29,7 @@ func TestRoundTrip(t *testing.T) { runner := Runner{ Repository: repo, Ctx: ctx, - Dispatcher: NewDispatcher(), + Dispatcher: newDispatcher(), } go runner.Start(ctx) From 6d432f9e7a9316d133192cecee464bda5c589076 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Sun, 27 Aug 2023 21:17:07 -0600 Subject: [PATCH 06/20] Use NewRunner func Signed-off-by: Jacob Weinstock --- grpc/taskrunner/taskrunner_test.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/grpc/taskrunner/taskrunner_test.go b/grpc/taskrunner/taskrunner_test.go index f3a7ed9..3c2919c 100644 --- a/grpc/taskrunner/taskrunner_test.go +++ b/grpc/taskrunner/taskrunner_test.go @@ -26,11 
+26,7 @@ func TestRoundTrip(t *testing.T) { defer s.Close() repo := &persistence.GoKV{Store: s, Ctx: ctx} logger := logr.Discard() - runner := Runner{ - Repository: repo, - Ctx: ctx, - Dispatcher: newDispatcher(), - } + runner := NewRunner(repo) go runner.Start(ctx) taskID := xid.New().String() From f1cc9f8d5e024c5c289289afc12c340f71cdc875 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Sun, 3 Sep 2023 11:13:05 -0600 Subject: [PATCH 07/20] [WIP]: update the design: Signed-off-by: Jacob Weinstock --- go.mod | 4 +- go.sum | 11 +- grpc/taskrunner/manager.go | 121 +++++++++++++++ grpc/taskrunner/queue.go | 42 ++++- grpc/taskrunner/run.go | 281 ++++++++++++++++++++++++++++++++++ grpc/taskrunner/run_test.go | 21 +++ grpc/taskrunner/taskrunner.go | 39 +++-- 7 files changed, 493 insertions(+), 26 deletions(-) create mode 100644 grpc/taskrunner/manager.go create mode 100644 grpc/taskrunner/run.go create mode 100644 grpc/taskrunner/run_test.go diff --git a/go.mod b/go.mod index 14d0e2d..9371e03 100644 --- a/go.mod +++ b/go.mod @@ -15,10 +15,10 @@ require ( github.com/golang-jwt/jwt/v4 v4.5.0 github.com/golang/protobuf v1.5.3 github.com/google/go-cmp v0.5.9 + github.com/gosuri/uilive v0.0.4 github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 github.com/hashicorp/go-multierror v1.1.1 - github.com/jedib0t/go-pretty/v6 v6.4.6 github.com/lnquy/elastic-worker-pool v0.0.1 github.com/manifoldco/promptui v0.9.0 github.com/mwitkow/go-proto-validators v0.3.2 @@ -69,7 +69,6 @@ require ( github.com/magiconair/properties v1.8.7 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.17 // indirect - github.com/mattn/go-runewidth v0.0.13 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/pelletier/go-toml/v2 v2.0.6 // indirect @@ -78,7 +77,6 @@ require ( github.com/prometheus/client_model v0.3.0 // indirect github.com/prometheus/common v0.42.0 // indirect github.com/prometheus/procfs v0.9.0 // indirect - github.com/rivo/uniseg v0.2.0 // indirect github.com/satori/go.uuid v1.2.0 // indirect github.com/sirupsen/logrus v1.9.0 // indirect github.com/spf13/afero v1.9.3 // indirect diff --git a/go.sum b/go.sum index 79aafec..cc5ce5b 100644 --- a/go.sum +++ b/go.sum @@ -215,6 +215,8 @@ github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+ github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= +github.com/gosuri/uilive v0.0.4 h1:hUEBpQDj8D8jXgtCdBu7sWsy5sbW/5GhuO8KBwJ2jyY= +github.com/gosuri/uilive v0.0.4/go.mod h1:V/epo5LjjlDE5RJUcqx8dbw+zc93y5Ya3yg8tfZ74VI= github.com/grpc-ecosystem/go-grpc-middleware v1.2.2/go.mod h1:EaizFBKfUKtMIF5iaDEhniwNedqGo9FuLFzppDr3uwI= github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 h1:UH//fgunKIs4JdUbpDl1VZCDaL56wXCB/5+wF6uHfaI= github.com/grpc-ecosystem/go-grpc-middleware v1.4.0/go.mod h1:g5qyo/la0ALbONm6Vbp88Yd8NsDy6rZz+RcrMPxvld8= @@ -240,8 +242,6 @@ github.com/jacobweinstock/iamt v0.0.0-20230502042727-d7cdbe67d9ef h1:G4k02HGmBUf github.com/jacobweinstock/iamt v0.0.0-20230502042727-d7cdbe67d9ef/go.mod h1:FgmiLTU6cJewV4Xgrq6m5o8CUlTQOJtqzaFLGA0mG+E= github.com/jacobweinstock/registrar v0.4.7 
h1:s4dOExccgD+Pc7rJC+f3Mc3D+NXHcXUaOibtcEsPxOc= github.com/jacobweinstock/registrar v0.4.7/go.mod h1:PWmkdGFG5/ZdCqgMo7pvB3pXABOLHc5l8oQ0sgmBNDU= -github.com/jedib0t/go-pretty/v6 v6.4.6 h1:v6aG9h6Uby3IusSSEjHaZNXpHFhzqMmjXcPq1Rjl9Jw= -github.com/jedib0t/go-pretty/v6 v6.4.6/go.mod h1:Ndk3ase2CkQbXLLNf5QDHoYb6J9WtVfmHZu9n8rk2xs= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= @@ -274,8 +274,6 @@ github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27k github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU= -github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= @@ -306,7 +304,6 @@ github.com/philippgille/gokv/util v0.6.0/go.mod h1:ovoDHZ2Svr7YX972SPPJQRXbhHEy3 github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pkg/profile v1.6.0/go.mod h1:qBsxPvzyUincmltOk6iyRVxHYg4adc0OFOv72ZdLa18= github.com/pkg/sftp v1.13.1/go.mod h1:3HaPG6Dq1ILlpPZRO0HVMrsydcdLt6HRDccSgb87qRg= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -319,8 +316,6 @@ github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= -github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= -github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= @@ -363,7 +358,6 @@ github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.4/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= @@ -561,7 +555,6 @@ golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= diff --git a/grpc/taskrunner/manager.go b/grpc/taskrunner/manager.go new file mode 100644 index 0000000..4391a3e --- /dev/null +++ b/grpc/taskrunner/manager.go @@ -0,0 +1,121 @@ +package taskrunner + +import "sync/atomic" + +type concurrencyManager struct { + // The number of goroutines that are allowed to run concurrently + max int + + // The manager channel to coordinate the number of concurrent goroutines. + managerCh chan interface{} + + // The done channel indicates when a single goroutine has finished its job. + doneCh chan bool + + // This channel indicates when all goroutines have finished their job. + allDoneCh chan bool + + // The closed channel is closed which controller should close + closed chan bool + + // The running count allows we know the number of goroutines are running + runningCount atomic.Int32 +} + +// New concurrencyManager +func New(maxGoRoutines int) *concurrencyManager { + // Initiate the manager object + c := concurrencyManager{ + max: maxGoRoutines, + managerCh: make(chan interface{}, maxGoRoutines), + doneCh: make(chan bool), + allDoneCh: make(chan bool), + closed: make(chan bool), + } + + // Fill the manager channel by placeholder values + for i := 0; i < c.max; i++ { + c.managerCh <- nil + } + + // Start the controller to collect all the jobs + go c.controller() + + return &c +} + +// Create the controller to collect all the jobs. +// When a goroutine is finished, we can release a slot for another goroutine. +func (c *concurrencyManager) controller() { + for { + // This will block until a goroutine is finished + <-c.doneCh + + // Say that another goroutine can now start + c.managerCh <- nil + + // When the closed flag is set, + // we need to close the manager if it doesn't have any running goroutine + if c.isClosed() && c.RunningCount() == 0 { + break + } + } + + // Say that all goroutines are finished, we can close the manager + c.allDoneCh <- true +} + +// Wait until a slot is available for the new goroutine. +// A goroutine have to start after this function. +func (c *concurrencyManager) Wait() { + + // Try to receive from the manager channel. When we have something, + // it means a slot is available and we can start a new goroutine. + // Otherwise, it will block until a slot is available. + <-c.managerCh + + // Increase the running count to help we know how many goroutines are running. 
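+	// Taken together, managerCh and runningCount behave like a counting
+	// semaphore: the slot consumed by the receive above is handed back by
+	// controller() once Done() signals on doneCh.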
+ c.runningCount.Add(1) +} + +// Done Mark a goroutine as finished +func (c *concurrencyManager) Done() { + // Decrease the number of running count + c.runningCount.Add(-1) + c.doneCh <- true +} + +// Close the manager manually +// terminate if channel is already closed +func (c *concurrencyManager) Close() { + // terminate if channel is already closed + select { + case <-c.closed: + return + default: + close(c.closed) + } +} + +func (c *concurrencyManager) isClosed() bool { + select { + case <-c.closed: + return true + default: + return false + } +} + +// WaitAllDone Wait for all goroutines are done +func (c *concurrencyManager) WaitAllDone() { + // Close the manager automatic + c.Close() + + // This will block until allDoneCh was marked + <-c.allDoneCh +} + +// RunningCount Returns the number of goroutines which are running +func (c *concurrencyManager) RunningCount() int32 { + return c.runningCount.Load() +} diff --git a/grpc/taskrunner/queue.go b/grpc/taskrunner/queue.go index 8b12f8a..7fbdacb 100644 --- a/grpc/taskrunner/queue.go +++ b/grpc/taskrunner/queue.go @@ -6,10 +6,10 @@ import ( ) type IngestQueue struct { - q *queue.Blocking[*Ingest] + q *queue.Blocking[*Task] } -type Ingest struct { +type Task struct { ID string `json:"id"` Host string `json:"host"` Description string `json:"description"` @@ -19,18 +19,50 @@ type Ingest struct { func NewIngestQueue() *IngestQueue { return &IngestQueue{ - q: queue.NewBlocking([]*Ingest{}, queue.WithCapacity(10000)), + q: queue.NewBlocking([]*Task{}, queue.WithCapacity(10000)), } } // Enqueue inserts the item into the queue. -func (i *IngestQueue) Enqueue(item Ingest) { +func (i *IngestQueue) Enqueue(item Task) { i.q.OfferWait(&item) } // Dequeue removes the oldest element from the queue. FIFO. -func (i *IngestQueue) Dequeue() (Ingest, error) { +func (i *IngestQueue) Dequeue() (Task, error) { item := i.q.GetWait() return *item, nil } + +func (i *IngestQueue) Size() int { + return i.q.Size() +} + +func NewHostQueue() *hostQueue { + return &hostQueue{ + q: queue.NewBlocking[host]([]host{}, queue.WithCapacity(10000)), + } +} + +type host string + +type hostQueue struct { + q *queue.Blocking[host] +} + +// Enqueue inserts the item into the queue. +func (i *hostQueue) Enqueue(item host) { + i.q.OfferWait(item) +} + +// Dequeue removes the oldest element from the queue. FIFO. +func (i *hostQueue) Dequeue() (host, error) { + item := i.q.GetWait() + + return item, nil +} + +func (i *hostQueue) Size() int { + return i.q.Size() +} diff --git a/grpc/taskrunner/run.go b/grpc/taskrunner/run.go new file mode 100644 index 0000000..f5c8ad9 --- /dev/null +++ b/grpc/taskrunner/run.go @@ -0,0 +1,281 @@ +package taskrunner + +import ( + "context" + "fmt" + "log" + "math/rand" + "os" + "runtime" + "strconv" + "strings" + "sync" + "time" + + "github.com/go-logr/logr" + "github.com/gosuri/uilive" +) + +type orchestrator struct { + workers sync.Map + manager *concurrencyManager + + fifoQueue *hostQueue + ingestionQueue *IngestQueue + // perIDQueue is a map of hostID to a channel of tasks. + perIDQueue sync.Map + + writeMu sync.Mutex +} + +func Start(ctx context.Context) *orchestrator { + o := &orchestrator{ + fifoQueue: NewHostQueue(), + ingestionQueue: NewIngestQueue(), + // perIDQueue is a map of hostID to a channel of tasks. + perIDQueue: sync.Map{}, + manager: New(12), + } + // 1. start the ingestor + // 2. 
start the orchestrator + go o.ingest(ctx) + go o.orchestrate(ctx) + go o.observe(ctx) + go o.load() + + return o +} + +func lenSyncMap(m *sync.Map) int { + var i int + m.Range(func(k, v interface{}) bool { + i++ + return true + }) + return i +} + +func (o *orchestrator) observe(ctx context.Context) { + writer := uilive.New() + + // start listening for updates and render + writer.Start() + defer writer.Stop() + workers := func() string { + s := []string{} + for _, h := range hostList { + v, ok := o.perIDQueue.Load(h) + if ok { + s = append(s, fmt.Sprintf("worker (%v) queue len: %v", h, v.(*IngestQueue).Size())) + } + } + return strings.Join(s, "\n") + "\n" + } + for { + select { + case <-ctx.Done(): + writer.Stop() + return + default: + one := fmt.Sprintf("running workers count: %v\n", o.manager.RunningCount()) + two := fmt.Sprintf("fifoQueue queue length: %v\n", o.fifoQueue.q.Size()) + three := fmt.Sprintf("goroutines: %v\n", runtime.NumGoroutine()) + four := fmt.Sprintf("worker map: %v\n", lenSyncMap(&o.workers)) + all := fmt.Sprintf("%v%v%v%v%v", one, two, three, four, workers()) + fmt.Fprint(writer, all) + <-time.After(time.Millisecond * 50) + } + } +} + +func (o *orchestrator) load() { + for i := 0; i < 1000; i++ { + i := i + go func() { + o.ingestionQueue.Enqueue(Task{ + ID: strconv.Itoa(i), + Host: hostList[rand.Intn(len(hostList))], + Description: "", + Action: func(chan string) (string, error) { + time.Sleep(time.Millisecond * 250) + return "", nil + }, + Log: logr.Discard(), + }) + }() + } +} + +// ingest take a task off the fcfs queue and put it on the perID queue +// and adds the host ID to the fcfs queue. +func (o *orchestrator) ingest(ctx context.Context) { + // dequeue from ingestion queue + // enqueue to perID queue + // enqueue to fcfs queue + for { + select { + case <-ctx.Done(): + return + default: + // 1. dequeue from ingestion queue + // 2. enqueue to perID queue + // 3. enqueue to fcfs queue + // --- + // 1. dequeue from ingestion queue + t, err := o.ingestionQueue.Dequeue() + if err != nil { + break + } + + // 2. enqueue to perID queue + // hostCh := make(chan Task) + que := NewIngestQueue() + q, _ := o.perIDQueue.LoadOrStore(t.Host, que) + q.(*IngestQueue).Enqueue(t) + + // 3. enqueue to fcfs queue + o.fifoQueue.Enqueue(host(t.Host)) + } + } +} + +// 1. dequeue from fcfs queue +// 2. dequeue from perID queue +// 3. if worker id exists, send task to worker. else continue. +// 4. if maxWorkers is reached, wait for available worker. else create worker and send task to worker. +func (o *orchestrator) orchestrate(ctx context.Context) { + // 1. dequeue from fcfs queue + // 2. start workers + for { + select { + case <-ctx.Done(): + return + default: + // 1. dequeue from fcfs queue + // 2. if worker id exists in o.workers, then move on + // 2a. if len(o.workers) < o.maxWorkers then create worker and move on, else recursion. + h, err := o.fifoQueue.Dequeue() + if err != nil { + continue + } + + // check queue length for perID queue, if 0, then continue + if elem, ok := o.perIDQueue.Load(string(h)); ok { + if elem.(*IngestQueue).Size() == 0 { + continue + } + } + + // if worker id exists in o.workers, then move on because the worker is already running. 
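+	// The running worker is still draining this host's perIDQueue, so the
+	// host token dequeued above can be dropped without losing the task.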
+ _, ok := o.workers.Load(h) + if ok { + continue + } + + // wait for a worker to become available + o.manager.Wait() + + o.workers.Store(h, struct{}{}) + go o.worker(ctx, string(h)) + } + } +} + +func (o *orchestrator) worker(ctx context.Context, hostID string) { + defer o.manager.Done() + defer func() { + o.workers.Range(func(key, value interface{}) bool { + if key.(host) == host(hostID) { + o.workers.Delete(key) + return true + } + return true + }) + }() + elem, ok := o.perIDQueue.Load(hostID) + if !ok { + return + } + + tChan := make(chan Task) + cctx, stop := context.WithCancel(ctx) + defer stop() + + go func() { + for { + select { + case <-cctx.Done(): + return + default: + task, err := elem.(*IngestQueue).Dequeue() + if err != nil { + continue + } + tChan <- task + } + } + }() + + for { + select { + case <-ctx.Done(): + stop() + return + case t := <-tChan: + msgCh := make(chan string) + t.Action(msgCh) + case <-time.After(2 * time.Second): + stop() + o.write(fmt.Sprintf("%v: worker stopped\n", hostID)) + return + } + } +} + +func (o *orchestrator) write(s string) { + o.writeMu.Lock() + defer o.writeMu.Unlock() + f, err := os.OpenFile("/home/tink/repos/tinkerbell/pbnj/text.log", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + log.Println(err) + } + defer f.Close() + if _, err := f.WriteString(s); err != nil { + log.Println(err) + } +} + +var hostList = []string{ + "127.0.0.1", + "127.0.0.2", + "127.0.0.3", + "127.0.0.4", + "127.0.0.5", + "127.0.0.6", + "127.0.0.7", + "127.0.0.8", + "127.0.0.9", + "127.0.0.10", + "127.0.0.11", + "127.0.0.12", + "127.0.0.13", + "127.0.0.14", + "127.0.0.15", + "127.0.0.16", + "127.0.0.17", + "127.0.0.18", + "127.0.0.19", + "127.0.0.20", + "127.0.0.21", + "127.0.0.22", + "127.0.0.23", + "127.0.0.24", + "127.0.0.25", + "127.0.0.26", + "127.0.0.27", + "127.0.0.28", + "127.0.0.29", + "127.0.0.30", + "127.0.0.31", + "127.0.0.32", +} diff --git a/grpc/taskrunner/run_test.go b/grpc/taskrunner/run_test.go new file mode 100644 index 0000000..1ef1b5b --- /dev/null +++ b/grpc/taskrunner/run_test.go @@ -0,0 +1,21 @@ +package taskrunner + +import ( + "context" + "runtime" + "testing" + "time" +) + +func TestStart(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + o := Start(ctx) + + o.manager.WaitAllDone() + + + time.Sleep(3 * time.Second) + t.Log("=======", runtime.NumGoroutine()) +} diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index 7b67869..7c17c72 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -81,11 +81,11 @@ func (r *Runner) GetStatistics() *ewp.Statistics { return r.Dispatcher.pool.GetStatistics() } -func (r *Runner) Start(ctx context.Context) { +func (r *Runner) old(ctx context.Context) { if r.Dispatcher == nil { r.Dispatcher = newDispatcher() } - myPool := r.startWorkerPool(10, 1000) + myPool := r.startWorkerPool(10, 10) r.Dispatcher.pool = myPool for { select { @@ -115,23 +115,44 @@ func (r *Runner) Start(ctx context.Context) { } } +func (r *Runner) Start(ctx context.Context) { + o := &orchestrator{ + workers: sync.Map{}, + fifoQueue: NewHostQueue(), + ingestionQueue: NewIngestQueue(), + perIDQueue: sync.Map{}, + } + // 1. start the ingestor + // 2. start the orchestrator + go o.ingest(ctx) + go o.orchestrate(ctx) +} + // channelWorker is a worker that listens on a channel for jobs. // It will shutdown the worker after gc duration of no elements in the channel or the context is canceled. // worker is in charge of its own lifecycle. 
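
One hazard worth spelling out in the worker above: its inner goroutine blocks inside Dequeue, which at this point wraps the queue's blocking GetWait, and a goroutine parked in a blocking call outlives the function that spawned it. The next patch in the series addresses exactly this; a distilled repro of the leak (illustrative, not project code):

package main

import (
	"fmt"
	"runtime"
	"time"
)

func parent(feed chan int) {
	go func() {
		<-feed // parks forever if nothing is ever sent
	}()
	time.Sleep(100 * time.Millisecond) // parent "idles out" and returns
}

func main() {
	before := runtime.NumGoroutine()
	parent(make(chan int))
	time.Sleep(200 * time.Millisecond)
	// One goroutine more than before: the orphan is still parked in <-feed.
	fmt.Println(runtime.NumGoroutine() - before)
}
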
 func (r *Runner) worker1(ctx context.Context, id string) {
-	elem, ok := r.Dispatcher.perID.Load(id)
-	if ok {
-		t, err := elem.(*IngestQueue).Dequeue()
-		if err != nil {
+	for {
+		select {
+		case <-ctx.Done():
 			return
+		default:
+			elem, ok := r.Dispatcher.perID.Load(id)
+			if ok {
+				t, err := elem.(*IngestQueue).Dequeue()
+				if err != nil {
+					return
+				}
+				r.process(ctx, t.Log, t.Description, t.ID, t.Action)
+			}
 		}
-		r.worker(ctx, t.Log, t.Description, t.ID, t.Action)
 	}
+
 }
 
 // Execute a task, update repository with status.
 func (r *Runner) Execute(_ context.Context, l logr.Logger, description, taskID, host string, action func(chan string) (string, error)) {
-	i := Ingest{
+	i := Task{
 		ID:          taskID,
 		Host:        host,
 		Description: description,

From bf1fe485a87147fa09040d2b909468e7329a7c34 Mon Sep 17 00:00:00 2001
From: Jacob Weinstock
Date: Mon, 4 Sep 2023 11:25:34 -0600
Subject: [PATCH 08/20] Fix orphaned goroutine in worker: The goroutine (in
 worker) dequeuing tasks used a blocking call, so once the queue was empty
 the dequeue would sit waiting. When the rest of the worker exited, this
 goroutine did not and was orphaned. Workers are per host ID, so the number
 of orphans would grow to the number of unique hosts that had requested
 tasks and stay there. The resolution was to make the dequeue call
 non-blocking.

Signed-off-by: Jacob Weinstock

---
 grpc/server.go                | 3 --
 grpc/taskrunner/queue.go      | 14 ++++++-
 grpc/taskrunner/run.go        | 34 +++++-----------
 grpc/taskrunner/run_test.go   | 7 +---
 grpc/taskrunner/taskrunner.go | 75 ++++++-----------------------------
 5 files changed, 36 insertions(+), 97 deletions(-)

diff --git a/grpc/server.go b/grpc/server.go
index e77d806..35658e4 100644
--- a/grpc/server.go
+++ b/grpc/server.go
@@ -139,9 +139,6 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po
 		log.Info("ctx cancelled, shutting down PBnJ")
 		grpcServer.GracefulStop()
 	}()
-	defer func() {
-		log.Info("stats", "stats", tr.GetStatistics(), "totalProcesses", tr.Dispatcher.TotalProcessed.Load())
-	}()
 
 	log.Info("starting PBnJ gRPC server")
 	return grpcServer.Serve(listen)
diff --git a/grpc/taskrunner/queue.go b/grpc/taskrunner/queue.go
index 7fbdacb..37e91cf 100644
--- a/grpc/taskrunner/queue.go
+++ b/grpc/taskrunner/queue.go
@@ -30,7 +30,10 @@ func (i *IngestQueue) Enqueue(item Task) {
 
 // Dequeue removes the oldest element from the queue. FIFO.
 func (i *IngestQueue) Dequeue() (Task, error) {
-	item := i.q.GetWait()
+	item, err := i.q.Get()
+	if err != nil {
+		return Task{}, err
+	}
 
 	return *item, nil
 }
@@ -47,6 +50,10 @@ func NewHostQueue() *hostQueue {
 
 type host string
 
+func (h host) String() string {
+	return string(h)
+}
+
 type hostQueue struct {
 	q *queue.Blocking[host]
 }
 
 // Enqueue inserts the item into the queue.
 func (i *hostQueue) Enqueue(item host) {
 	i.q.OfferWait(item)
 }
 
 // Dequeue removes the oldest element from the queue. FIFO.
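
With Dequeue now non-blocking, a worker can poll its queue and treat an empty result as a cue to idle out instead of parking inside the queue. A sketch of the drain loop this enables, written against the IngestQueue and Task types above (assumes the taskrunner package context and a time import; the backoff interval and helper name are made up):

// drainUntilIdle runs tasks for one host until the queue stays empty
// for `idle`, then returns so the worker can be torn down cleanly.
func drainUntilIdle(q *IngestQueue, idle time.Duration, run func(Task)) {
	deadline := time.Now().Add(idle)
	for time.Now().Before(deadline) {
		t, err := q.Dequeue()
		if err != nil {
			// Non-blocking: an empty queue returns an error instead of waiting.
			time.Sleep(10 * time.Millisecond)
			continue
		}
		run(t)
		deadline = time.Now().Add(idle) // reset the idle window after each task
	}
}
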
func (i *hostQueue) Dequeue() (host, error) { - item := i.q.GetWait() + item, err := i.q.Get() + if err != nil { + return "", err + } return item, nil } diff --git a/grpc/taskrunner/run.go b/grpc/taskrunner/run.go index f5c8ad9..1b6307b 100644 --- a/grpc/taskrunner/run.go +++ b/grpc/taskrunner/run.go @@ -3,9 +3,7 @@ package taskrunner import ( "context" "fmt" - "log" "math/rand" - "os" "runtime" "strconv" "strings" @@ -24,8 +22,6 @@ type orchestrator struct { ingestionQueue *IngestQueue // perIDQueue is a map of hostID to a channel of tasks. perIDQueue sync.Map - - writeMu sync.Mutex } func Start(ctx context.Context) *orchestrator { @@ -34,7 +30,7 @@ func Start(ctx context.Context) *orchestrator { ingestionQueue: NewIngestQueue(), // perIDQueue is a map of hostID to a channel of tasks. perIDQueue: sync.Map{}, - manager: New(12), + manager: New(17), } // 1. start the ingestor // 2. start the orchestrator @@ -57,6 +53,7 @@ func lenSyncMap(m *sync.Map) int { func (o *orchestrator) observe(ctx context.Context) { writer := uilive.New() + //writer.Out = fileWriter() // start listening for updates and render writer.Start() @@ -74,7 +71,7 @@ func (o *orchestrator) observe(ctx context.Context) { for { select { case <-ctx.Done(): - writer.Stop() + writer.Flush() return default: one := fmt.Sprintf("running workers count: %v\n", o.manager.RunningCount()) @@ -106,7 +103,7 @@ func (o *orchestrator) load() { } } -// ingest take a task off the fcfs queue and put it on the perID queue +// ingest take a task off the ingestion queue and puts it on the perID queue // and adds the host ID to the fcfs queue. func (o *orchestrator) ingest(ctx context.Context) { // dequeue from ingestion queue @@ -124,7 +121,7 @@ func (o *orchestrator) ingest(ctx context.Context) { // 1. dequeue from ingestion queue t, err := o.ingestionQueue.Dequeue() if err != nil { - break + continue } // 2. 
enqueue to perID queue @@ -160,7 +157,7 @@ func (o *orchestrator) orchestrate(ctx context.Context) { } // check queue length for perID queue, if 0, then continue - if elem, ok := o.perIDQueue.Load(string(h)); ok { + if elem, ok := o.perIDQueue.Load(h.String()); ok { if elem.(*IngestQueue).Size() == 0 { continue } @@ -175,8 +172,8 @@ func (o *orchestrator) orchestrate(ctx context.Context) { // wait for a worker to become available o.manager.Wait() - o.workers.Store(h, struct{}{}) - go o.worker(ctx, string(h)) + o.workers.Store(h, true) + go o.worker(ctx, h.String()) } } } @@ -192,6 +189,7 @@ func (o *orchestrator) worker(ctx context.Context, hostID string) { return true }) }() + elem, ok := o.perIDQueue.Load(hostID) if !ok { return @@ -200,6 +198,7 @@ func (o *orchestrator) worker(ctx context.Context, hostID string) { tChan := make(chan Task) cctx, stop := context.WithCancel(ctx) defer stop() + defer close(tChan) go func() { for { @@ -226,23 +225,10 @@ func (o *orchestrator) worker(ctx context.Context, hostID string) { t.Action(msgCh) case <-time.After(2 * time.Second): stop() - o.write(fmt.Sprintf("%v: worker stopped\n", hostID)) return } } -} -func (o *orchestrator) write(s string) { - o.writeMu.Lock() - defer o.writeMu.Unlock() - f, err := os.OpenFile("/home/tink/repos/tinkerbell/pbnj/text.log", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - log.Println(err) - } - defer f.Close() - if _, err := f.WriteString(s); err != nil { - log.Println(err) - } } var hostList = []string{ diff --git a/grpc/taskrunner/run_test.go b/grpc/taskrunner/run_test.go index 1ef1b5b..7a1d792 100644 --- a/grpc/taskrunner/run_test.go +++ b/grpc/taskrunner/run_test.go @@ -2,9 +2,7 @@ package taskrunner import ( "context" - "runtime" "testing" - "time" ) func TestStart(t *testing.T) { @@ -14,8 +12,5 @@ func TestStart(t *testing.T) { o := Start(ctx) o.manager.WaitAllDone() - - - time.Sleep(3 * time.Second) - t.Log("=======", runtime.NumGoroutine()) + cancel() } diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index 7c17c72..8a61d91 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -7,23 +7,22 @@ import ( "sync" "sync/atomic" "syscall" - "time" "github.com/go-logr/logr" "github.com/pkg/errors" - ewp "github.com/lnquy/elastic-worker-pool" "github.com/tinkerbell/pbnj/pkg/metrics" "github.com/tinkerbell/pbnj/pkg/repository" ) // Runner for executing a task. 
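
Before the Runner internals, a usage sketch of the contract this series is after: Execute calls for the same host run in submission order, while distinct hosts proceed concurrently. The setup mirrors the tests in this series; the hosts, descriptions, and action body are made up:

package main

import (
	"context"
	"time"

	"github.com/go-logr/logr"
	"github.com/philippgille/gokv"
	"github.com/philippgille/gokv/freecache"
	"github.com/rs/xid"
	"github.com/tinkerbell/pbnj/grpc/persistence"
	"github.com/tinkerbell/pbnj/grpc/taskrunner"
)

func main() {
	ctx := context.Background()
	f := freecache.NewStore(freecache.DefaultOptions)
	repo := &persistence.GoKV{Store: gokv.Store(f), Ctx: ctx}

	r := taskrunner.NewRunner(repo)
	go r.Start(ctx)
	time.Sleep(100 * time.Millisecond) // the tests in this series pause like this while Start spins up

	action := func(_ chan string) (string, error) { return "ok", nil }

	// Same host: the second task runs only after the first completes.
	r.Execute(ctx, logr.Discard(), "power on", xid.New().String(), "10.0.0.1", action)
	r.Execute(ctx, logr.Discard(), "reset", xid.New().String(), "10.0.0.1", action)
	// Different host: free to run concurrently with the two above.
	r.Execute(ctx, logr.Discard(), "power on", xid.New().String(), "10.0.0.2", action)
}
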
type Runner struct { - Repository repository.Actions - Ctx context.Context - active atomic.Int32 - total atomic.Int32 - Dispatcher *dispatcher + Repository repository.Actions + Ctx context.Context + active atomic.Int32 + total atomic.Int32 + Dispatcher *dispatcher + orchestrator *orchestrator } type dispatcher struct { @@ -36,7 +35,6 @@ type dispatcher struct { perID sync.Map maxWorkers int32 TotalProcessed atomic.Int32 - pool *ewp.ElasticWorkerPool } func NewRunner(repo repository.Actions) *Runner { @@ -64,68 +62,20 @@ func (r *Runner) TotalWorkers() int { return int(r.total.Load()) } -func (r *Runner) startWorkerPool(min, max int32) *ewp.ElasticWorkerPool { - ewpConfig := ewp.Config{ - MinWorker: min, - MaxWorker: max, - PoolControlInterval: time.Second, - BufferLength: 1000, - } - myPool, _ := ewp.New(ewpConfig, nil, nil) // error is always nil - myPool.Start() - - return myPool -} - -func (r *Runner) GetStatistics() *ewp.Statistics { - return r.Dispatcher.pool.GetStatistics() -} - -func (r *Runner) old(ctx context.Context) { - if r.Dispatcher == nil { - r.Dispatcher = newDispatcher() - } - myPool := r.startWorkerPool(10, 10) - r.Dispatcher.pool = myPool - for { - select { - case <-ctx.Done(): - myPool.Close() - return - default: - elem, err := r.Dispatcher.IngestQueue.Dequeue() - if err != nil { - break - } - - perIDQueue := NewIngestQueue() - perIDQueue.Enqueue(elem) - v, loaded := r.Dispatcher.perID.LoadOrStore(elem.Host, perIDQueue) - if loaded { - b, ok := v.(*IngestQueue) - if ok { - b.Enqueue(elem) - } - } - _ = myPool.Enqueue(func() { - r.worker1(ctx, elem.Host) - r.Dispatcher.TotalProcessed.Add(1) - }) - } - } -} - func (r *Runner) Start(ctx context.Context) { o := &orchestrator{ - workers: sync.Map{}, fifoQueue: NewHostQueue(), ingestionQueue: NewIngestQueue(), - perIDQueue: sync.Map{}, + // perIDQueue is a map of hostID to a channel of tasks. + perIDQueue: sync.Map{}, + manager: New(395), } + r.orchestrator = o // 1. start the ingestor // 2. start the orchestrator go o.ingest(ctx) go o.orchestrate(ctx) + // go o.observe(ctx) } // channelWorker is a worker that listens on a channel for jobs. 
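
A design note on the worker-dedup step in the orchestrate loop this Start feeds: it checks workers.Load(h) and claims the slot later with workers.Store(h, true), which holds up while a single orchestrate goroutine owns the map. If orchestration were ever fanned out, the check and the claim could collapse into one atomic LoadOrStore. A sketch of that variant (not what the patch does; drainHost is hypothetical, and it assumes an import of "sync"):

var workers sync.Map

// maybeStartWorker starts at most one drain goroutine per host, even
// with concurrent callers: only the caller that wins LoadOrStore spawns.
func maybeStartWorker(h string, drainHost func(string)) {
	if _, loaded := workers.LoadOrStore(h, true); loaded {
		return // a worker for h is already running
	}
	go func() {
		defer workers.Delete(h)
		drainHost(h)
	}()
}
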
@@ -160,7 +110,8 @@ func (r *Runner) Execute(_ context.Context, l logr.Logger, description, taskID, Log: l, } - r.Dispatcher.IngestQueue.Enqueue(i) + r.orchestrator.ingestionQueue.Enqueue(i) + //r.Dispatcher.IngestQueue.Enqueue(i) } func (r *Runner) updateMessages(ctx context.Context, taskID string, ch chan string) { From 17f982a81ccd5fe1010af079daccb796f509105e Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Mon, 4 Sep 2023 22:38:07 -0600 Subject: [PATCH 09/20] Clean up: Fix tests, updating for linting issues, etc Signed-off-by: Jacob Weinstock --- grpc/rpc/machine_test.go | 3 + grpc/rpc/task_test.go | 3 +- grpc/taskrunner/manager.go | 13 ++- grpc/taskrunner/queue.go | 2 +- grpc/taskrunner/run.go | 168 +++++------------------------ grpc/taskrunner/run_test.go | 114 +++++++++++++++++++- grpc/taskrunner/taskrunner.go | 67 ++---------- grpc/taskrunner/taskrunner_test.go | 1 + 8 files changed, 162 insertions(+), 209 deletions(-) diff --git a/grpc/rpc/machine_test.go b/grpc/rpc/machine_test.go index f10d534..0add5cf 100644 --- a/grpc/rpc/machine_test.go +++ b/grpc/rpc/machine_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/onsi/gomega" @@ -71,6 +72,7 @@ func TestDevice(t *testing.T) { machineSvc := MachineService{ TaskRunner: tr, } + time.Sleep(time.Millisecond * 100) response, err := machineSvc.BootDevice(ctx, testCase.req) t.Log("Got : ", response) @@ -165,6 +167,7 @@ func TestPower(t *testing.T) { tr := taskrunner.NewRunner(repo) go tr.Start(ctx) + time.Sleep(time.Millisecond * 100) machineSvc := MachineService{ TaskRunner: tr, } diff --git a/grpc/rpc/task_test.go b/grpc/rpc/task_test.go index f256473..f0720e7 100644 --- a/grpc/rpc/task_test.go +++ b/grpc/rpc/task_test.go @@ -31,6 +31,7 @@ func TestTaskFound(t *testing.T) { tr := taskrunner.NewRunner(repo) go tr.Start(ctx) + time.Sleep(200 * time.Millisecond) taskID := xid.New().String() tr.Execute(ctx, logger, "test", taskID, "123", func(s chan string) (string, error) { return "doing cool stuff", defaultError @@ -42,7 +43,7 @@ func TestTaskFound(t *testing.T) { TaskRunner: tr, } - time.Sleep(10 * time.Millisecond) + time.Sleep(50 * time.Millisecond) taskResp, err := taskSvc.Status(ctx, taskReq) if err != nil { t.Fatal(err) diff --git a/grpc/taskrunner/manager.go b/grpc/taskrunner/manager.go index 4391a3e..77e725c 100644 --- a/grpc/taskrunner/manager.go +++ b/grpc/taskrunner/manager.go @@ -22,8 +22,8 @@ type concurrencyManager struct { runningCount atomic.Int32 } -// New concurrencyManager -func New(maxGoRoutines int) *concurrencyManager { +// newManager concurrencyManager. +func newManager(maxGoRoutines int) *concurrencyManager { // Initiate the manager object c := concurrencyManager{ max: maxGoRoutines, @@ -68,7 +68,6 @@ func (c *concurrencyManager) controller() { // Wait until a slot is available for the new goroutine. // A goroutine have to start after this function. func (c *concurrencyManager) Wait() { - // Try to receive from the manager channel. When we have something, // it means a slot is available and we can start a new goroutine. // Otherwise, it will block until a slot is available. @@ -78,7 +77,7 @@ func (c *concurrencyManager) Wait() { c.runningCount.Add(1) } -// Done Mark a goroutine as finished +// Done Mark a goroutine as finished. 
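
For orientation while reading these manager fixes: concurrencyManager is, at its core, a counting semaphore plus an all-done signal. The conventional channel idiom covers the Wait/Done/WaitAllDone trio in a few lines (a simplification for comparison, not a drop-in replacement, since it keeps no running count):

package main

import "sync"

func main() {
	const maxWorkers = 3
	sem := make(chan struct{}, maxWorkers) // capacity = max concurrent goroutines
	var wg sync.WaitGroup

	for i := 0; i < 10; i++ {
		sem <- struct{}{} // Wait(): acquire a slot; blocks when all are taken
		wg.Add(1)
		go func() {
			defer func() { <-sem; wg.Done() }() // Done(): release the slot
			// ... job body ...
		}()
	}
	wg.Wait() // WaitAllDone() analogue
}
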
func (c *concurrencyManager) Done() { // Decrease the number of running count c.runningCount.Add(-1) @@ -86,7 +85,7 @@ func (c *concurrencyManager) Done() { } // Close the manager manually -// terminate if channel is already closed +// terminate if channel is already closed. func (c *concurrencyManager) Close() { // terminate if channel is already closed select { @@ -106,7 +105,7 @@ func (c *concurrencyManager) isClosed() bool { } } -// WaitAllDone Wait for all goroutines are done +// WaitAllDone Wait for all goroutines are done. func (c *concurrencyManager) WaitAllDone() { // Close the manager automatic c.Close() @@ -115,7 +114,7 @@ func (c *concurrencyManager) WaitAllDone() { <-c.allDoneCh } -// RunningCount Returns the number of goroutines which are running +// RunningCount Returns the number of goroutines which are running. func (c *concurrencyManager) RunningCount() int32 { return c.runningCount.Load() } diff --git a/grpc/taskrunner/queue.go b/grpc/taskrunner/queue.go index 37e91cf..69d4375 100644 --- a/grpc/taskrunner/queue.go +++ b/grpc/taskrunner/queue.go @@ -42,7 +42,7 @@ func (i *IngestQueue) Size() int { return i.q.Size() } -func NewHostQueue() *hostQueue { +func newHostQueue() *hostQueue { return &hostQueue{ q: queue.NewBlocking[host]([]host{}, queue.WithCapacity(10000)), } diff --git a/grpc/taskrunner/run.go b/grpc/taskrunner/run.go index 1b6307b..46d1c19 100644 --- a/grpc/taskrunner/run.go +++ b/grpc/taskrunner/run.go @@ -2,16 +2,8 @@ package taskrunner import ( "context" - "fmt" - "math/rand" - "runtime" - "strconv" - "strings" "sync" "time" - - "github.com/go-logr/logr" - "github.com/gosuri/uilive" ) type orchestrator struct { @@ -24,85 +16,6 @@ type orchestrator struct { perIDQueue sync.Map } -func Start(ctx context.Context) *orchestrator { - o := &orchestrator{ - fifoQueue: NewHostQueue(), - ingestionQueue: NewIngestQueue(), - // perIDQueue is a map of hostID to a channel of tasks. - perIDQueue: sync.Map{}, - manager: New(17), - } - // 1. start the ingestor - // 2. 
start the orchestrator - go o.ingest(ctx) - go o.orchestrate(ctx) - go o.observe(ctx) - go o.load() - - return o -} - -func lenSyncMap(m *sync.Map) int { - var i int - m.Range(func(k, v interface{}) bool { - i++ - return true - }) - return i -} - -func (o *orchestrator) observe(ctx context.Context) { - writer := uilive.New() - //writer.Out = fileWriter() - - // start listening for updates and render - writer.Start() - defer writer.Stop() - workers := func() string { - s := []string{} - for _, h := range hostList { - v, ok := o.perIDQueue.Load(h) - if ok { - s = append(s, fmt.Sprintf("worker (%v) queue len: %v", h, v.(*IngestQueue).Size())) - } - } - return strings.Join(s, "\n") + "\n" - } - for { - select { - case <-ctx.Done(): - writer.Flush() - return - default: - one := fmt.Sprintf("running workers count: %v\n", o.manager.RunningCount()) - two := fmt.Sprintf("fifoQueue queue length: %v\n", o.fifoQueue.q.Size()) - three := fmt.Sprintf("goroutines: %v\n", runtime.NumGoroutine()) - four := fmt.Sprintf("worker map: %v\n", lenSyncMap(&o.workers)) - all := fmt.Sprintf("%v%v%v%v%v", one, two, three, four, workers()) - fmt.Fprint(writer, all) - <-time.After(time.Millisecond * 50) - } - } -} - -func (o *orchestrator) load() { - for i := 0; i < 1000; i++ { - i := i - go func() { - o.ingestionQueue.Enqueue(Task{ - ID: strconv.Itoa(i), - Host: hostList[rand.Intn(len(hostList))], - Description: "", - Action: func(chan string) (string, error) { - time.Sleep(time.Millisecond * 250) - return "", nil - }, - Log: logr.Discard(), - }) - }() - } -} - // ingest take a task off the ingestion queue and puts it on the perID queue // and adds the host ID to the fcfs queue. func (o *orchestrator) ingest(ctx context.Context) { @@ -128,7 +41,11 @@ func (o *orchestrator) ingest(ctx context.Context) { // hostCh := make(chan Task) que := NewIngestQueue() q, _ := o.perIDQueue.LoadOrStore(t.Host, que) - q.(*IngestQueue).Enqueue(t) + v, ok := q.(*IngestQueue) + if !ok { + continue + } + v.Enqueue(t) // 3. enqueue to fcfs queue o.fifoQueue.Enqueue(host(t.Host)) @@ -140,7 +57,7 @@ func (o *orchestrator) ingest(ctx context.Context) { // 2. dequeue from perID queue // 3. if worker id exists, send task to worker. else continue. // 4. if maxWorkers is reached, wait for available worker. else create worker and send task to worker. -func (o *orchestrator) orchestrate(ctx context.Context) { +func (r *Runner) orchestrate(ctx context.Context) { // 1. dequeue from fcfs queue // 2. start workers for { @@ -151,46 +68,50 @@ func (o *orchestrator) orchestrate(ctx context.Context) { // 1. dequeue from fcfs queue // 2. if worker id exists in o.workers, then move on // 2a. if len(o.workers) < o.maxWorkers then create worker and move on, else recursion. - h, err := o.fifoQueue.Dequeue() + h, err := r.orchestrator.fifoQueue.Dequeue() if err != nil { continue } // check queue length for perID queue, if 0, then continue - if elem, ok := o.perIDQueue.Load(h.String()); ok { - if elem.(*IngestQueue).Size() == 0 { + if elem, ok := r.orchestrator.perIDQueue.Load(h.String()); ok { + v, ok := elem.(*IngestQueue) + if !ok { + continue + } + if v.Size() == 0 { continue } } // if worker id exists in o.workers, then move on because the worker is already running. 
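 	// Note: the Load check below and the later Store claim are separate
 	// steps; that stays safe here because this orchestrate loop is the only
 	// goroutine that creates entries in the workers map.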
- _, ok := o.workers.Load(h) + _, ok := r.orchestrator.workers.Load(h) if ok { continue } // wait for a worker to become available - o.manager.Wait() + r.orchestrator.manager.Wait() - o.workers.Store(h, true) - go o.worker(ctx, h.String()) + r.orchestrator.workers.Store(h, true) + go r.worker(ctx, h.String()) } } } -func (o *orchestrator) worker(ctx context.Context, hostID string) { - defer o.manager.Done() +func (r *Runner) worker(ctx context.Context, hostID string) { + defer r.orchestrator.manager.Done() defer func() { - o.workers.Range(func(key, value interface{}) bool { - if key.(host) == host(hostID) { - o.workers.Delete(key) - return true + r.orchestrator.workers.Range(func(key, value interface{}) bool { + if key.(host) == host(hostID) { //nolint:forcetypeassert // good + r.orchestrator.workers.Delete(key) + return true //nolint:revive // this is needed to satisfy the func parameter } - return true + return true //nolint:revive // this is needed to satisfy the func parameter }) }() - elem, ok := o.perIDQueue.Load(hostID) + elem, ok := r.orchestrator.perIDQueue.Load(hostID) if !ok { return } @@ -221,47 +142,10 @@ func (o *orchestrator) worker(ctx context.Context, hostID string) { stop() return case t := <-tChan: - msgCh := make(chan string) - t.Action(msgCh) + r.process(ctx, t.Log, t.Description, t.ID, t.Action) case <-time.After(2 * time.Second): stop() return } } - -} - -var hostList = []string{ - "127.0.0.1", - "127.0.0.2", - "127.0.0.3", - "127.0.0.4", - "127.0.0.5", - "127.0.0.6", - "127.0.0.7", - "127.0.0.8", - "127.0.0.9", - "127.0.0.10", - "127.0.0.11", - "127.0.0.12", - "127.0.0.13", - "127.0.0.14", - "127.0.0.15", - "127.0.0.16", - "127.0.0.17", - "127.0.0.18", - "127.0.0.19", - "127.0.0.20", - "127.0.0.21", - "127.0.0.22", - "127.0.0.23", - "127.0.0.24", - "127.0.0.25", - "127.0.0.26", - "127.0.0.27", - "127.0.0.28", - "127.0.0.29", - "127.0.0.30", - "127.0.0.31", - "127.0.0.32", } diff --git a/grpc/taskrunner/run_test.go b/grpc/taskrunner/run_test.go index 7a1d792..9ef2289 100644 --- a/grpc/taskrunner/run_test.go +++ b/grpc/taskrunner/run_test.go @@ -3,14 +3,124 @@ package taskrunner import ( "context" "testing" + + "github.com/philippgille/gokv" + "github.com/philippgille/gokv/freecache" + "github.com/tinkerbell/pbnj/grpc/persistence" ) func TestStart(t *testing.T) { + t.Skip("skipping test") + ctx, cancel := context.WithCancel(context.Background()) defer cancel() - o := Start(ctx) + f := freecache.NewStore(freecache.DefaultOptions) + s := gokv.Store(f) + repo := &persistence.GoKV{Store: s, Ctx: ctx} + + r := NewRunner(repo) + r.Start(ctx) - o.manager.WaitAllDone() + r.orchestrator.manager.WaitAllDone() cancel() + t.Fatal() +} + +/* +func lenSyncMap(m *sync.Map) int { + var i int + m.Range(func(k, v interface{}) bool { + i++ + return true + }) + return i +} + +func (o *orchestrator) observe(ctx context.Context) { + writer := uilive.New() + //writer.Out = fileWriter() + + // start listening for updates and render + writer.Start() + defer writer.Stop() + workers := func() string { + s := []string{} + for _, h := range hostList { + v, ok := o.perIDQueue.Load(h) + if ok { + s = append(s, fmt.Sprintf("worker (%v) queue len: %v", h, v.(*IngestQueue).Size())) + } + } + return strings.Join(s, "\n") + "\n" + } + for { + select { + case <-ctx.Done(): + writer.Flush() + return + default: + one := fmt.Sprintf("running workers count: %v\n", o.manager.RunningCount()) + two := fmt.Sprintf("fifoQueue queue length: %v\n", o.fifoQueue.q.Size()) + three := fmt.Sprintf("goroutines: %v\n", 
runtime.NumGoroutine()) + four := fmt.Sprintf("worker map: %v\n", lenSyncMap(&o.workers)) + all := fmt.Sprintf("%v%v%v%v%v", one, two, three, four, workers()) + fmt.Fprint(writer, all) + <-time.After(time.Millisecond * 50) + } + } +} + +func (o *orchestrator) load() { + for i := 0; i < 1000; i++ { + i := i + go func() { + o.ingestionQueue.Enqueue(Task{ + ID: strconv.Itoa(i), + Host: hostList[rand.Intn(len(hostList))], + Description: "", + Action: func(chan string) (string, error) { + time.Sleep(time.Millisecond * 250) + return "", nil + }, + Log: logr.Discard(), + }) + }() + } +} + +var hostList = []string{ + "127.0.0.1", + "127.0.0.2", + "127.0.0.3", + "127.0.0.4", + "127.0.0.5", + "127.0.0.6", + "127.0.0.7", + "127.0.0.8", + "127.0.0.9", + "127.0.0.10", + "127.0.0.11", + "127.0.0.12", + "127.0.0.13", + "127.0.0.14", + "127.0.0.15", + "127.0.0.16", + "127.0.0.17", + "127.0.0.18", + "127.0.0.19", + "127.0.0.20", + "127.0.0.21", + "127.0.0.22", + "127.0.0.23", + "127.0.0.24", + "127.0.0.25", + "127.0.0.26", + "127.0.0.27", + "127.0.0.28", + "127.0.0.29", + "127.0.0.30", + "127.0.0.31", + "127.0.0.32", } +*/ diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index 8a61d91..805399e 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -21,34 +21,21 @@ type Runner struct { Ctx context.Context active atomic.Int32 total atomic.Int32 - Dispatcher *dispatcher orchestrator *orchestrator } -type dispatcher struct { - // IngestQueue is a queue of jobs that is process synchronously. - // It's the entry point for all jobs. - IngestQueue *IngestQueue - // perID hold a queue per ID. - // jobs across different IDs are processed concurrently. - // jobs with the same ID are processed synchronously. - perID sync.Map - maxWorkers int32 - TotalProcessed atomic.Int32 -} - func NewRunner(repo repository.Actions) *Runner { - return &Runner{ - Repository: repo, - Dispatcher: newDispatcher(), + o := &orchestrator{ + fifoQueue: newHostQueue(), + ingestionQueue: NewIngestQueue(), + // perIDQueue is a map of hostID to a channel of tasks. + perIDQueue: sync.Map{}, + manager: newManager(395), } -} -func newDispatcher() *dispatcher { - return &dispatcher{ - IngestQueue: NewIngestQueue(), - perID: sync.Map{}, - maxWorkers: 500, + return &Runner{ + Repository: repo, + orchestrator: o, } } @@ -63,41 +50,10 @@ func (r *Runner) TotalWorkers() int { } func (r *Runner) Start(ctx context.Context) { - o := &orchestrator{ - fifoQueue: NewHostQueue(), - ingestionQueue: NewIngestQueue(), - // perIDQueue is a map of hostID to a channel of tasks. - perIDQueue: sync.Map{}, - manager: New(395), - } - r.orchestrator = o // 1. start the ingestor // 2. start the orchestrator - go o.ingest(ctx) - go o.orchestrate(ctx) - // go o.observe(ctx) -} - -// channelWorker is a worker that listens on a channel for jobs. -// It will shutdown the worker after gc duration of no elements in the channel or the context is canceled. -// worker is in charge of its own lifecycle. -func (r *Runner) worker1(ctx context.Context, id string) { - for { - select { - case <-ctx.Done(): - return - default: - elem, ok := r.Dispatcher.perID.Load(id) - if ok { - t, err := elem.(*IngestQueue).Dequeue() - if err != nil { - return - } - r.process(ctx, t.Log, t.Description, t.ID, t.Action) - } - } - } - + go r.orchestrator.ingest(ctx) + go r.orchestrate(ctx) } // Execute a task, update repository with status. 
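
A quick reference for the action signature Execute takes, func(chan string) (string, error): anything the action sends on the channel is appended to the task record by updateMessages, so long-running BMC work can report progress. A sketch of an action written against that contract (the message text is made up):

action := func(messages chan string) (string, error) {
	messages <- "connecting to BMC"
	// ... dial and authenticate ...
	messages <- "setting boot device to PXE"
	// ... apply the change ...
	return "boot device set", nil
}

Passed as the last argument to Execute, each message lands in record.Messages via Repository.Update while the task runs.
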
@@ -111,7 +67,6 @@ func (r *Runner) Execute(_ context.Context, l logr.Logger, description, taskID, } r.orchestrator.ingestionQueue.Enqueue(i) - //r.Dispatcher.IngestQueue.Enqueue(i) } func (r *Runner) updateMessages(ctx context.Context, taskID string, ch chan string) { diff --git a/grpc/taskrunner/taskrunner_test.go b/grpc/taskrunner/taskrunner_test.go index 3c2919c..ccf666d 100644 --- a/grpc/taskrunner/taskrunner_test.go +++ b/grpc/taskrunner/taskrunner_test.go @@ -28,6 +28,7 @@ func TestRoundTrip(t *testing.T) { logger := logr.Discard() runner := NewRunner(repo) go runner.Start(ctx) + time.Sleep(time.Millisecond * 100) taskID := xid.New().String() if len(taskID) != 20 { From cff1d6c1d0255b14357b82cd9ceef6663fc79ede Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Tue, 5 Sep 2023 14:41:39 -0600 Subject: [PATCH 10/20] Extend time waiting for task service to be ready. Signed-off-by: Jacob Weinstock --- grpc/rpc/task_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grpc/rpc/task_test.go b/grpc/rpc/task_test.go index f0720e7..2744ba8 100644 --- a/grpc/rpc/task_test.go +++ b/grpc/rpc/task_test.go @@ -43,7 +43,7 @@ func TestTaskFound(t *testing.T) { TaskRunner: tr, } - time.Sleep(50 * time.Millisecond) + time.Sleep(150 * time.Millisecond) taskResp, err := taskSvc.Status(ctx, taskReq) if err != nil { t.Fatal(err) From fb4feade856c125ede2a11c0018b762041219039 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Tue, 5 Sep 2023 15:35:51 -0600 Subject: [PATCH 11/20] Fix tests Signed-off-by: Jacob Weinstock --- grpc/rpc/bmc_test.go | 27 +++++++---- grpc/rpc/machine_test.go | 37 +-------------- grpc/rpc/task_test.go | 73 +++++++++++------------------- grpc/server.go | 2 +- grpc/taskrunner/taskrunner.go | 6 +-- grpc/taskrunner/taskrunner_test.go | 2 +- 6 files changed, 49 insertions(+), 98 deletions(-) diff --git a/grpc/rpc/bmc_test.go b/grpc/rpc/bmc_test.go index c1df518..71b9053 100644 --- a/grpc/rpc/bmc_test.go +++ b/grpc/rpc/bmc_test.go @@ -19,9 +19,10 @@ import ( const tempIPMITool = "/tmp/ipmitool" var ( - ctx context.Context - tr *taskrunner.Runner - bmcService BmcService + tr *taskrunner.Runner + bmcService BmcService + taskService TaskService + machineService MachineService ) func TestMain(m *testing.M) { @@ -32,7 +33,7 @@ func TestMain(m *testing.M) { } func setup() { - ctx = context.Background() + ctx := context.Background() f := freecache.NewStore(freecache.DefaultOptions) s := gokv.Store(f) repo := &persistence.GoKV{ @@ -41,11 +42,17 @@ func setup() { } tr = taskrunner.NewRunner(repo) - go tr.Start(ctx) + tr.Start(ctx) bmcService = BmcService{ TaskRunner: tr, UnimplementedBMCServer: v1.UnimplementedBMCServer{}, } + taskService = TaskService{ + TaskRunner: tr, + } + machineService = MachineService{ + TaskRunner: tr, + } _, err := exec.LookPath("ipmitool") if err != nil { err := os.WriteFile(tempIPMITool, []byte{}, 0o777) @@ -96,7 +103,7 @@ func TestConfigNetworkSource(t *testing.T) { for _, tc := range testCases { testCase := tc t.Run(testCase.name, func(t *testing.T) { - response, err := bmcService.NetworkSource(ctx, testCase.req) + response, err := bmcService.NetworkSource(context.Background(), testCase.req) if response != nil { t.Fatalf("response should be nil, got: %v", response) } @@ -153,7 +160,7 @@ func TestReset(t *testing.T) { for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { - response, err := bmcService.Reset(ctx, tc.in) + response, err := bmcService.Reset(context.Background(), tc.in) if err != nil { diff := 
cmp.Diff(tc.expectedErr.Error(), err.Error()) if diff != "" { @@ -216,7 +223,7 @@ func TestCreateUser(t *testing.T) { for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { - response, err := bmcService.CreateUser(ctx, tc.in) + response, err := bmcService.CreateUser(context.Background(), tc.in) if err != nil { diff := cmp.Diff(tc.expectedErr.Error(), err.Error()) if diff != "" { @@ -279,7 +286,7 @@ func TestUpdateUser(t *testing.T) { for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { - response, err := bmcService.UpdateUser(ctx, tc.in) + response, err := bmcService.UpdateUser(context.Background(), tc.in) if err != nil { diff := cmp.Diff(tc.expectedErr.Error(), err.Error()) if diff != "" { @@ -338,7 +345,7 @@ func TestDeleteUser(t *testing.T) { for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { - response, err := bmcService.DeleteUser(ctx, tc.in) + response, err := bmcService.DeleteUser(context.Background(), tc.in) if err != nil { diff := cmp.Diff(tc.expectedErr.Error(), err.Error()) if diff != "" { diff --git a/grpc/rpc/machine_test.go b/grpc/rpc/machine_test.go index 0add5cf..3c4104d 100644 --- a/grpc/rpc/machine_test.go +++ b/grpc/rpc/machine_test.go @@ -4,15 +4,10 @@ import ( "context" "errors" "testing" - "time" "github.com/google/go-cmp/cmp" "github.com/onsi/gomega" - "github.com/philippgille/gokv" - "github.com/philippgille/gokv/freecache" v1 "github.com/tinkerbell/pbnj/api/v1" - "github.com/tinkerbell/pbnj/grpc/persistence" - "github.com/tinkerbell/pbnj/grpc/taskrunner" ) func TestDevice(t *testing.T) { @@ -59,21 +54,7 @@ func TestDevice(t *testing.T) { g := gomega.NewGomegaWithT(t) ctx := context.Background() - - f := freecache.NewStore(freecache.DefaultOptions) - s := gokv.Store(f) - repo := &persistence.GoKV{ - Store: s, - Ctx: ctx, - } - - tr := taskrunner.NewRunner(repo) - go tr.Start(ctx) - machineSvc := MachineService{ - TaskRunner: tr, - } - time.Sleep(time.Millisecond * 100) - response, err := machineSvc.BootDevice(ctx, testCase.req) + response, err := machineService.BootDevice(ctx, testCase.req) t.Log("Got : ", response) if err != nil { @@ -157,21 +138,7 @@ func TestPower(t *testing.T) { g := gomega.NewGomegaWithT(t) ctx := context.Background() - - f := freecache.NewStore(freecache.DefaultOptions) - s := gokv.Store(f) - repo := &persistence.GoKV{ - Store: s, - Ctx: ctx, - } - - tr := taskrunner.NewRunner(repo) - go tr.Start(ctx) - time.Sleep(time.Millisecond * 100) - machineSvc := MachineService{ - TaskRunner: tr, - } - response, err := machineSvc.Power(ctx, testCase.req) + response, err := machineService.Power(ctx, testCase.req) t.Log("Got response: ", response) t.Log("Got err: ", err) diff --git a/grpc/rpc/task_test.go b/grpc/rpc/task_test.go index 2744ba8..46f11ed 100644 --- a/grpc/rpc/task_test.go +++ b/grpc/rpc/task_test.go @@ -5,50 +5,39 @@ import ( "testing" "time" - "github.com/go-logr/logr" "github.com/onsi/gomega" - "github.com/philippgille/gokv" - "github.com/philippgille/gokv/freecache" - "github.com/rs/xid" v1 "github.com/tinkerbell/pbnj/api/v1" - "github.com/tinkerbell/pbnj/grpc/persistence" - "github.com/tinkerbell/pbnj/grpc/taskrunner" - "github.com/tinkerbell/pbnj/pkg/repository" ) func TestTaskFound(t *testing.T) { - // create a task - ctx := context.Background() - defaultError := &repository.Error{ - Code: 0, - Message: "", - Details: nil, - } - logger := logr.Discard() - f := freecache.NewStore(freecache.DefaultOptions) - s := gokv.Store(f) - repo := &persistence.GoKV{Store: s, Ctx: 
ctx} - - tr := taskrunner.NewRunner(repo) - go tr.Start(ctx) - time.Sleep(200 * time.Millisecond) - taskID := xid.New().String() - tr.Execute(ctx, logger, "test", taskID, "123", func(s chan string) (string, error) { - return "doing cool stuff", defaultError - }) - - taskReq := &v1.StatusRequest{TaskId: taskID} - - taskSvc := TaskService{ - TaskRunner: tr, + pr := &v1.PowerRequest{ + Authn: &v1.Authn{ + Authn: &v1.Authn_DirectAuthn{ + DirectAuthn: &v1.DirectAuthn{ + Host: &v1.Host{ + Host: "10.1.1.1", + }, + Username: "admin", + Password: "admin", + }, + }, + }, + Vendor: &v1.Vendor{ + Name: "", + }, + PowerAction: v1.PowerAction_POWER_ACTION_STATUS, + SoftTimeout: 0, + OffDuration: 0, } - - time.Sleep(150 * time.Millisecond) - taskResp, err := taskSvc.Status(ctx, taskReq) + resp, err := machineService.Power(context.Background(), pr) if err != nil { - t.Fatal(err) + t.Fatalf("expected no error, got: %v", err) } - if taskResp.Id != taskID { + + time.Sleep(time.Second) + taskReq := &v1.StatusRequest{TaskId: resp.TaskId} + taskResp, _ := taskService.Status(context.Background(), taskReq) + if taskResp.Id != resp.TaskId { t.Fatalf("got: %+v", taskResp) } } @@ -75,17 +64,7 @@ func TestRecordNotFound(t *testing.T) { g := gomega.NewGomegaWithT(t) ctx := context.Background() - - f := freecache.NewStore(freecache.DefaultOptions) - s := gokv.Store(f) - repo := &persistence.GoKV{Store: s, Ctx: ctx} - - tr := taskrunner.NewRunner(repo) - go tr.Start(ctx) - taskSvc := TaskService{ - TaskRunner: tr, - } - response, err := taskSvc.Status(ctx, testCase.req) + response, err := taskService.Status(ctx, testCase.req) t.Log("Got response: ", response) t.Log("Got err: ", err) diff --git a/grpc/server.go b/grpc/server.go index 35658e4..d48134c 100644 --- a/grpc/server.go +++ b/grpc/server.go @@ -79,7 +79,7 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po } tr := taskrunner.NewRunner(repo) - go tr.Start(ctx) + tr.Start(ctx) ms := rpc.MachineService{ TaskRunner: tr, diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index 805399e..bf909dc 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -17,8 +17,8 @@ import ( // Runner for executing a task. type Runner struct { - Repository repository.Actions - Ctx context.Context + Repository repository.Actions + //Ctx context.Context active atomic.Int32 total atomic.Int32 orchestrator *orchestrator @@ -50,8 +50,6 @@ func (r *Runner) TotalWorkers() int { } func (r *Runner) Start(ctx context.Context) { - // 1. start the ingestor - // 2. 
start the orchestrator go r.orchestrator.ingest(ctx) go r.orchestrate(ctx) } diff --git a/grpc/taskrunner/taskrunner_test.go b/grpc/taskrunner/taskrunner_test.go index ccf666d..066459e 100644 --- a/grpc/taskrunner/taskrunner_test.go +++ b/grpc/taskrunner/taskrunner_test.go @@ -27,7 +27,7 @@ func TestRoundTrip(t *testing.T) { repo := &persistence.GoKV{Store: s, Ctx: ctx} logger := logr.Discard() runner := NewRunner(repo) - go runner.Start(ctx) + runner.Start(ctx) time.Sleep(time.Millisecond * 100) taskID := xid.New().String() From 056be608d769e10f2021ff0748476bdf1320ffa9 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Tue, 5 Sep 2023 15:36:10 -0600 Subject: [PATCH 12/20] Remove unused code Signed-off-by: Jacob Weinstock --- grpc/taskrunner/taskrunner.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index bf909dc..babb0cf 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -17,8 +17,7 @@ import ( // Runner for executing a task. type Runner struct { - Repository repository.Actions - //Ctx context.Context + Repository repository.Actions active atomic.Int32 total atomic.Int32 orchestrator *orchestrator } From b90c13865aa664c0051334e1f9c6b94d3347488f Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Tue, 5 Sep 2023 16:36:01 -0600 Subject: [PATCH 13/20] Make maxWorkers and workerIdleTimeout configurable: Plumb these through to the CLI flags. Also fixed a channel close so that when workers are done the goroutine count drops back down to its original level. Signed-off-by: Jacob Weinstock --- cmd/server.go | 13 +++++++++++++ grpc/rpc/bmc_test.go | 3 ++- grpc/server.go | 24 +++++++++++++++++++++--- grpc/taskrunner/run.go | 13 +++++++------ grpc/taskrunner/run_test.go | 3 ++- grpc/taskrunner/taskrunner.go | 6 ++++-- grpc/taskrunner/taskrunner_test.go | 2 +- 7 files changed, 50 insertions(+), 14 deletions(-) diff --git a/cmd/server.go b/cmd/server.go index 3a3d006..9092973 100644 --- a/cmd/server.go +++ b/cmd/server.go @@ -48,6 +48,10 @@ var ( // // for more information see https://github.com/bmc-toolbox/bmclib#bmc-connections skipRedfishVersions string + // maxWorkers is the maximum number of concurrent workers that will be allowed to handle bmc tasks. + maxWorkers int + // workerIdleTimeout is the idle timeout for workers. If no tasks are received within the timeout, the worker will exit. + workerIdleTimeout time.Duration // serverCmd represents the server command. 
serverCmd = &cobra.Command{ Use: "server", @@ -98,6 +102,13 @@ var ( opts = append(opts, grpcsvr.WithSkipRedfishVersions(versions)) } + if maxWorkers > 0 { + opts = append(opts, grpcsvr.WithMaxWorkers(maxWorkers)) + } + if workerIdleTimeout > 0 { + opts = append(opts, grpcsvr.WithWorkerIdleTimeout(workerIdleTimeout)) + } + if err := grpcsvr.RunServer(ctx, logger, grpcServer, port, httpServer, opts...); err != nil { logger.Error(err, "error running server") os.Exit(1) @@ -114,6 +125,8 @@ func init() { serverCmd.PersistentFlags().StringVar(&rsPubKey, "rsPubKey", "", "RS public key") serverCmd.PersistentFlags().DurationVar(&bmcTimeout, "bmcTimeout", oob.DefaultBMCTimeout, "Timeout for BMC calls") serverCmd.PersistentFlags().StringVar(&skipRedfishVersions, "skipRedfishVersions", "", "Ignore the redfish endpoint on BMCs running the given version(s)") + serverCmd.PersistentFlags().IntVar(&maxWorkers, "maxWorkers", 1000, "Maximum number of concurrent workers that will be allowed to handle bmc tasks") + serverCmd.PersistentFlags().DurationVar(&workerIdleTimeout, "workerIdleTimeout", 30*time.Second, "Idle timeout for workers. If no tasks are received within the timeout, the worker will exit. New tasks will spawn a new worker if there isn't a worker running") rootCmd.AddCommand(serverCmd) } diff --git a/grpc/rpc/bmc_test.go b/grpc/rpc/bmc_test.go index 71b9053..b1c3af9 100644 --- a/grpc/rpc/bmc_test.go +++ b/grpc/rpc/bmc_test.go @@ -7,6 +7,7 @@ import ( "os" "os/exec" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/philippgille/gokv" @@ -41,7 +42,7 @@ func setup() { Ctx: ctx, } - tr = taskrunner.NewRunner(repo) + tr = taskrunner.NewRunner(repo, 100, time.Second) tr.Start(ctx) bmcService = BmcService{ TaskRunner: tr, diff --git a/grpc/server.go b/grpc/server.go index d48134c..7a15d5f 100644 --- a/grpc/server.go +++ b/grpc/server.go @@ -36,6 +36,10 @@ type Server struct { // // for more information see https://github.com/bmc-toolbox/bmclib#bmc-connections skipRedfishVersions []string + // maxWorkers is the maximum number of concurrent workers that will be allowed to handle bmc tasks. + maxWorkers int + // workerIdleTimeout is the idle timeout for workers. If no tasks are received within the timeout, the worker will exit. + workerIdleTimeout time.Duration } // ServerOption for setting optional values. @@ -56,6 +60,18 @@ func WithSkipRedfishVersions(versions []string) ServerOption { return func(args *Server) { args.skipRedfishVersions = versions } } +// WithMaxWorkers sets the max number of concurrent workers that handle bmc tasks. +func WithMaxWorkers(max int) ServerOption { + return func(args *Server) { args.maxWorkers = max } +} + +// WithWorkerIdleTimeout sets the idle timeout for workers. +// If no tasks are received within the timeout, the worker will exit. +// New tasks will spawn a new worker if there isn't a worker running. +func WithWorkerIdleTimeout(t time.Duration) ServerOption { + return func(args *Server) { args.workerIdleTimeout = t } +} + // RunServer registers all services and runs the server. 
func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, port string, httpServer *http.Server, opts ...ServerOption) error { ctx, cancel := context.WithCancel(ctx) @@ -70,15 +86,17 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po } defaultServer := &Server{ - Actions: repo, - bmcTimeout: oob.DefaultBMCTimeout, + Actions: repo, + bmcTimeout: oob.DefaultBMCTimeout, + maxWorkers: 1000, + workerIdleTimeout: time.Second * 30, } for _, opt := range opts { opt(defaultServer) } - tr := taskrunner.NewRunner(repo) + tr := taskrunner.NewRunner(repo, defaultServer.maxWorkers, defaultServer.workerIdleTimeout) tr.Start(ctx) ms := rpc.MachineService{ diff --git a/grpc/taskrunner/run.go b/grpc/taskrunner/run.go index 46d1c19..bdc55df 100644 --- a/grpc/taskrunner/run.go +++ b/grpc/taskrunner/run.go @@ -7,8 +7,9 @@ import ( ) type orchestrator struct { - workers sync.Map - manager *concurrencyManager + workers sync.Map + manager *concurrencyManager + workerIdleTimeout time.Duration fifoQueue *hostQueue ingestionQueue *IngestQueue @@ -119,9 +120,9 @@ func (r *Runner) worker(ctx context.Context, hostID string) { tChan := make(chan Task) cctx, stop := context.WithCancel(ctx) defer stop() - defer close(tChan) - go func() { + go func(ch chan Task) { + defer close(ch) for { select { case <-cctx.Done(): @@ -134,7 +135,7 @@ func (r *Runner) worker(ctx context.Context, hostID string) { tChan <- task } } - }() + }(tChan) for { select { @@ -143,7 +144,7 @@ func (r *Runner) worker(ctx context.Context, hostID string) { return case t := <-tChan: r.process(ctx, t.Log, t.Description, t.ID, t.Action) - case <-time.After(2 * time.Second): + case <-time.After(r.orchestrator.workerIdleTimeout): stop() return } diff --git a/grpc/taskrunner/run_test.go b/grpc/taskrunner/run_test.go index 9ef2289..06a6562 100644 --- a/grpc/taskrunner/run_test.go +++ b/grpc/taskrunner/run_test.go @@ -3,6 +3,7 @@ package taskrunner import ( "context" "testing" + "time" "github.com/philippgille/gokv" "github.com/philippgille/gokv/freecache" @@ -19,7 +20,7 @@ func TestStart(t *testing.T) { s := gokv.Store(f) repo := &persistence.GoKV{Store: s, Ctx: ctx} - r := NewRunner(repo) + r := NewRunner(repo, 100, time.Second) r.Start(ctx) r.orchestrator.manager.WaitAllDone() diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index babb0cf..2a3ed88 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -7,6 +7,7 @@ import ( "sync" "sync/atomic" "syscall" + "time" "github.com/go-logr/logr" "github.com/pkg/errors" @@ -23,13 +24,14 @@ type Runner struct { orchestrator *orchestrator } -func NewRunner(repo repository.Actions) *Runner { +func NewRunner(repo repository.Actions, maxWorkers int, workerIdleTimeout time.Duration) *Runner { o := &orchestrator{ fifoQueue: newHostQueue(), ingestionQueue: NewIngestQueue(), // perIDQueue is a map of hostID to a channel of tasks. 
perIDQueue: sync.Map{}, - manager: newManager(395), + manager: newManager(maxWorkers), + workerIdleTimeout: workerIdleTimeout, } return &Runner{ diff --git a/grpc/taskrunner/taskrunner_test.go b/grpc/taskrunner/taskrunner_test.go index 066459e..b7c9be6 100644 --- a/grpc/taskrunner/taskrunner_test.go +++ b/grpc/taskrunner/taskrunner_test.go @@ -26,7 +26,7 @@ func TestRoundTrip(t *testing.T) { defer s.Close() repo := &persistence.GoKV{Store: s, Ctx: ctx} logger := logr.Discard() - runner := NewRunner(repo) + runner := NewRunner(repo, 100, time.Second) runner.Start(ctx) time.Sleep(time.Millisecond * 100) From 1cb6b99522ead11f133461d2c39f920907da62b9 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Tue, 5 Sep 2023 16:37:48 -0600 Subject: [PATCH 14/20] Fix linting issue Signed-off-by: Jacob Weinstock --- grpc/taskrunner/taskrunner.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index 2a3ed88..d655799 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -29,8 +29,8 @@ func NewRunner(repo repository.Actions, maxWorkers int, workerIdleTimeout time.D fifoQueue: newHostQueue(), ingestionQueue: NewIngestQueue(), // perIDQueue is a map of hostID to a channel of tasks. - perIDQueue: sync.Map{}, - manager: newManager(maxWorkers), + perIDQueue: sync.Map{}, + manager: newManager(maxWorkers), workerIdleTimeout: workerIdleTimeout, } From 5bbd2890158ff727591e5e05ca2d99ed072a5cbe Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Wed, 6 Sep 2023 21:33:35 -0600 Subject: [PATCH 15/20] Add metrics, tune for better ingestion: The ingestor was single threaded. After adding metrics and watching the queues and the processing as it happened, running the ingestor with a cap on concurrent goroutines yielded better performance. The per ID queues filled up faster, which let BMC interactions start processing sooner. 
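For illustration, the capped-goroutine ingestion pattern can be sketched outside of PBnJ's types. This is only a sketch: task, ingest, and the semaphore below are stand-ins rather than this series' concurrencyManager, and the cap of 4 is arbitrary.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

type task struct{ host string }

// ingest drains tasks with at most max handlers in flight. A buffered
// channel acts as a counting semaphore: acquiring a slot blocks once
// max goroutines are running, so the goroutine count stays bounded
// while several hosts' queues can still fill concurrently.
func ingest(ctx context.Context, in <-chan task, handle func(task), max int) {
	sem := make(chan struct{}, max)
	var wg sync.WaitGroup
	defer wg.Wait()
	for {
		select {
		case <-ctx.Done():
			return
		case t := <-in:
			sem <- struct{}{} // acquire; blocks while max handlers are busy
			wg.Add(1)
			go func() {
				defer wg.Done()
				defer func() { <-sem }() // release the slot
				handle(t)
			}()
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	in := make(chan task)
	go ingest(ctx, in, func(t task) { fmt.Println("ingested", t.host) }, 4)
	for i := 1; i <= 8; i++ {
		in <- task{host: fmt.Sprintf("127.0.0.%d", i)}
	}
	<-ctx.Done()
}

The point of the cap is backpressure: enqueueing never spawns an unbounded number of goroutines, yet multiple per-host queues fill concurrently instead of one task at a time.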
Signed-off-by: Jacob Weinstock --- go.mod | 4 +-- go.sum | 11 ++------ grpc/rpc/machine.go | 4 +-- grpc/taskrunner/queue.go | 2 +- grpc/taskrunner/run.go | 51 +++++++++++++++++++++-------------- grpc/taskrunner/taskrunner.go | 2 ++ pkg/metrics/metrics.go | 10 +++++++ 7 files changed, 49 insertions(+), 35 deletions(-) diff --git a/go.mod b/go.mod index 9371e03..0e1d67e 100644 --- a/go.mod +++ b/go.mod @@ -15,11 +15,9 @@ require ( github.com/golang-jwt/jwt/v4 v4.5.0 github.com/golang/protobuf v1.5.3 github.com/google/go-cmp v0.5.9 - github.com/gosuri/uilive v0.0.4 github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 github.com/hashicorp/go-multierror v1.1.1 - github.com/lnquy/elastic-worker-pool v0.0.1 github.com/manifoldco/promptui v0.9.0 github.com/mwitkow/go-proto-validators v0.3.2 github.com/onsi/gomega v1.27.6 @@ -93,7 +91,7 @@ require ( golang.org/x/crypto v0.6.0 // indirect golang.org/x/exp v0.0.0-20230212135524-a684f29349b6 // indirect golang.org/x/net v0.8.0 // indirect - golang.org/x/sys v0.6.0 // indirect + golang.org/x/sys v0.7.0 // indirect golang.org/x/text v0.8.0 // indirect google.golang.org/genproto v0.0.0-20230209215440-0dfe4f8abfcc // indirect gopkg.in/go-playground/validator.v9 v9.31.0 // indirect diff --git a/go.sum b/go.sum index cc5ce5b..26c555c 100644 --- a/go.sum +++ b/go.sum @@ -215,8 +215,6 @@ github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+ github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= -github.com/gosuri/uilive v0.0.4 h1:hUEBpQDj8D8jXgtCdBu7sWsy5sbW/5GhuO8KBwJ2jyY= -github.com/gosuri/uilive v0.0.4/go.mod h1:V/epo5LjjlDE5RJUcqx8dbw+zc93y5Ya3yg8tfZ74VI= github.com/grpc-ecosystem/go-grpc-middleware v1.2.2/go.mod h1:EaizFBKfUKtMIF5iaDEhniwNedqGo9FuLFzppDr3uwI= github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 h1:UH//fgunKIs4JdUbpDl1VZCDaL56wXCB/5+wF6uHfaI= github.com/grpc-ecosystem/go-grpc-middleware v1.4.0/go.mod h1:g5qyo/la0ALbONm6Vbp88Yd8NsDy6rZz+RcrMPxvld8= @@ -261,8 +259,6 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/leodido/go-urn v1.2.1 h1:BqpAaACuzVSgi/VLzGZIobT2z4v53pjosyNd9Yv6n/w= github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY= -github.com/lnquy/elastic-worker-pool v0.0.1 h1:XmFImqsFAVkqPAb+BvAtdqn32lL/D445AlUn+r36MwI= -github.com/lnquy/elastic-worker-pool v0.0.1/go.mod h1:yw7YmwSJ1mlY7w8qLvhsXegi3gp4s19XmocsmlViyv4= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/manifoldco/promptui v0.9.0 h1:3V4HzJk1TtXW1MTZMP7mdlwbBpIinw3HztaIlYthEiA= @@ -327,7 +323,6 @@ github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= -github.com/sirupsen/logrus v1.4.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= 
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= @@ -402,7 +397,6 @@ go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.18.1/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= goa.design/goa v2.2.5+incompatible h1:mjAtiy7ZdZIkj974hpFxCR6bL69qprfV00Veu3Vybts= goa.design/goa v2.2.5+incompatible/go.mod h1:NnzBwdNktihbNek+pPiFMQP9PPFsUt8MMPPyo9opDSo= -golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -507,7 +501,6 @@ golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181122145206-62eef0e2fa9b/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -555,8 +548,8 @@ golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= -golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.6.0 h1:clScbb1cHjoCkyRbWwBEUZ5H/tIFu5TAXIqaZD0Gcjw= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/grpc/rpc/machine.go b/grpc/rpc/machine.go index 69c1afa..a799d81 100644 --- a/grpc/rpc/machine.go +++ b/grpc/rpc/machine.go @@ -54,7 +54,7 @@ func (m *MachineService) BootDevice(ctx context.Context, in *v1.DeviceRequest) ( defer cancel() return mbd.BootDeviceSet(taskCtx, in.BootDevice.String(), in.Persistent, in.EfiBoot) } - m.TaskRunner.Execute(ctx, l, "setting boot device", taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) + go m.TaskRunner.Execute(ctx, l, "setting boot device", taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) return &v1.DeviceResponse{TaskId: taskID}, nil } @@ -89,7 +89,7 @@ func (m *MachineService) Power(ctx 
context.Context, in *v1.PowerRequest) (*v1.Po defer cancel() return mp.PowerSet(taskCtx, in.PowerAction.String()) } - m.TaskRunner.Execute(ctx, l, "power action: "+in.GetPowerAction().String(), taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) + go m.TaskRunner.Execute(ctx, l, "power action: "+in.GetPowerAction().String(), taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) return &v1.PowerResponse{TaskId: taskID}, nil } diff --git a/grpc/taskrunner/queue.go b/grpc/taskrunner/queue.go index 69d4375..dca57ea 100644 --- a/grpc/taskrunner/queue.go +++ b/grpc/taskrunner/queue.go @@ -19,7 +19,7 @@ type Task struct { func NewIngestQueue() *IngestQueue { return &IngestQueue{ - q: queue.NewBlocking([]*Task{}, queue.WithCapacity(10000)), + q: queue.NewBlocking([]*Task{}), } } diff --git a/grpc/taskrunner/run.go b/grpc/taskrunner/run.go index bdc55df..9f79f4c 100644 --- a/grpc/taskrunner/run.go +++ b/grpc/taskrunner/run.go @@ -4,6 +4,8 @@ import ( "context" "sync" "time" + + "github.com/tinkerbell/pbnj/pkg/metrics" ) type orchestrator struct { @@ -11,6 +13,8 @@ type orchestrator struct { manager *concurrencyManager workerIdleTimeout time.Duration + ingestManager *concurrencyManager + fifoQueue *hostQueue ingestionQueue *IngestQueue // perIDQueue is a map of hostID to a channel of tasks. @@ -28,28 +32,34 @@ func (o *orchestrator) ingest(ctx context.Context) { case <-ctx.Done(): return default: - // 1. dequeue from ingestion queue - // 2. enqueue to perID queue - // 3. enqueue to fcfs queue - // --- - // 1. dequeue from ingestion queue - t, err := o.ingestionQueue.Dequeue() - if err != nil { - continue - } + o.ingestManager.Wait() + go func() { + defer o.ingestManager.Done() + // 1. dequeue from ingestion queue + // 2. enqueue to perID queue + // 3. enqueue to fcfs queue + // --- + // 1. dequeue from ingestion queue + t, err := o.ingestionQueue.Dequeue() + if err != nil { + return + } + metrics.IngestionQueue.Dec() - // 2. enqueue to perID queue - // hostCh := make(chan Task) - que := NewIngestQueue() - q, _ := o.perIDQueue.LoadOrStore(t.Host, que) - v, ok := q.(*IngestQueue) - if !ok { - continue - } - v.Enqueue(t) + // 2. enqueue to perID queue + que := NewIngestQueue() + q, _ := o.perIDQueue.LoadOrStore(t.Host, que) + v, ok := q.(*IngestQueue) + if !ok { + return + } + v.Enqueue(t) + metrics.PerIDQueue.WithLabelValues(t.Host).Inc() + + // 3. enqueue to fcfs queue + o.fifoQueue.Enqueue(host(t.Host)) + }() - // 3. enqueue to fcfs queue - o.fifoQueue.Enqueue(host(t.Host)) } } } @@ -133,6 +143,7 @@ func (r *Runner) worker(ctx context.Context, hostID string) { continue } tChan <- task + metrics.PerIDQueue.WithLabelValues(hostID).Dec() } } }(tChan) diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index d655799..81717c5 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -28,6 +28,7 @@ func NewRunner(repo repository.Actions, maxWorkers int, workerIdleTimeout time.D o := &orchestrator{ fifoQueue: newHostQueue(), ingestionQueue: NewIngestQueue(), + ingestManager: newManager(1000), // perIDQueue is a map of hostID to a channel of tasks. 
perIDQueue: sync.Map{}, manager: newManager(maxWorkers), @@ -66,6 +67,7 @@ func (r *Runner) Execute(_ context.Context, l logr.Logger, description, taskID, } r.orchestrator.ingestionQueue.Enqueue(i) + metrics.IngestionQueue.Inc() } func (r *Runner) updateMessages(ctx context.Context, taskID string, ch chan string) { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 8f403e1..64e6145 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -9,6 +9,8 @@ var ( ActionDuration prometheus.ObserverVec TasksTotal prometheus.Counter TasksActive prometheus.Gauge + PerIDQueue prometheus.GaugeVec + IngestionQueue prometheus.Gauge ) func init() { @@ -36,6 +38,14 @@ func init() { Name: "pbnj_tasks_active", Help: "Number of tasks currently active.", }) + PerIDQueue = *promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "pbnj_per_id_queue", + Help: "Number of tasks in perID queue.", + }, []string{"host"}) + IngestionQueue = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "pbnj_ingestion_queue", + Help: "Number of tasks in ingestion queue.", + }) } func initObserverLabels(m prometheus.ObserverVec, l []prometheus.Labels) { From c4c4e01133c80c5c26726f04e2449a697e077425 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Wed, 6 Sep 2023 21:37:11 -0600 Subject: [PATCH 16/20] Fix linting issue Signed-off-by: Jacob Weinstock --- grpc/taskrunner/run.go | 1 - 1 file changed, 1 deletion(-) diff --git a/grpc/taskrunner/run.go b/grpc/taskrunner/run.go index 9f79f4c..1e46f61 100644 --- a/grpc/taskrunner/run.go +++ b/grpc/taskrunner/run.go @@ -59,7 +59,6 @@ func (o *orchestrator) ingest(ctx context.Context) { // 3. enqueue to fcfs queue o.fifoQueue.Enqueue(host(t.Host)) }() - } } } From 5d91df18e270bcef8335a8d8e90c86dd92686603 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Thu, 7 Sep 2023 15:19:49 -0600 Subject: [PATCH 17/20] Add a max number of concurrent workers for ingestion: This keeps the goroutines from growing uncontrollably. Signed-off-by: Jacob Weinstock --- cmd/server.go | 4 + grpc/rpc/bmc_test.go | 2 +- grpc/server.go | 20 +++-- grpc/taskrunner/run_test.go | 127 ----------------------------- grpc/taskrunner/taskrunner.go | 12 ++- grpc/taskrunner/taskrunner_test.go | 2 +- 6 files changed, 31 insertions(+), 136 deletions(-) delete mode 100644 grpc/taskrunner/run_test.go diff --git a/cmd/server.go b/cmd/server.go index 9092973..71f42af 100644 --- a/cmd/server.go +++ b/cmd/server.go @@ -52,6 +52,9 @@ var ( maxWorkers int // workerIdleTimeout is the idle timeout for workers. If no tasks are received within the timeout, the worker will exit. workerIdleTimeout time.Duration + // maxIngestionWorkers is the maximum number of concurrent workers that will be allowed. + // These are the workers that handle ingesting tasks from RPC endpoints and writing them to the map of per Host ID queues. + maxIngestionWorkers int // serverCmd represents the server command. serverCmd = &cobra.Command{ Use: "server", @@ -127,6 +130,7 @@ func init() { serverCmd.PersistentFlags().StringVar(&skipRedfishVersions, "skipRedfishVersions", "", "Ignore the redfish endpoint on BMCs running the given version(s)") serverCmd.PersistentFlags().IntVar(&maxWorkers, "maxWorkers", 1000, "Maximum number of concurrent workers that will be allowed to handle bmc tasks") serverCmd.PersistentFlags().DurationVar(&workerIdleTimeout, "workerIdleTimeout", 30*time.Second, "Idle timeout for workers. If no tasks are received within the timeout, the worker will exit. 
New tasks will spawn a new worker if there isn't a worker running") + serverCmd.PersistentFlags().IntVar(&maxIngestionWorkers, "maxIngestionWorkers", 1000, "Maximum number of concurrent workers that will be allowed. These are the workers that handle ingesting tasks from RPC endpoints and writing them to the map of per Host ID queues") rootCmd.AddCommand(serverCmd) } diff --git a/grpc/rpc/bmc_test.go b/grpc/rpc/bmc_test.go index b1c3af9..82f03a2 100644 --- a/grpc/rpc/bmc_test.go +++ b/grpc/rpc/bmc_test.go @@ -42,7 +42,7 @@ func setup() { Ctx: ctx, } - tr = taskrunner.NewRunner(repo, 100, time.Second) + tr = taskrunner.NewRunner(repo, 100, 100, time.Second) tr.Start(ctx) bmcService = BmcService{ TaskRunner: tr, diff --git a/grpc/server.go b/grpc/server.go index 7a15d5f..9e984da 100644 --- a/grpc/server.go +++ b/grpc/server.go @@ -40,6 +40,9 @@ type Server struct { maxWorkers int // workerIdleTimeout is the idle timeout for workers. If no tasks are received within the timeout, the worker will exit. workerIdleTimeout time.Duration + // maxIngestionWorkers is the maximum number of concurrent workers that will be allowed. + // These are the workers that handle ingesting tasks from RPC endpoints and writing them to the map of per Host ID queues. + maxIngestionWorkers int } // ServerOption for setting optional values. @@ -72,6 +75,12 @@ func WithWorkerIdleTimeout(t time.Duration) ServerOption { return func(args *Server) { args.workerIdleTimeout = t } } +// WithMaxIngestionWorkers sets the max number of concurrent workers that will be allowed. +// These are the workers that handle ingesting tasks from RPC endpoints and writing them to the map of per Host ID queues. +func WithMaxIngestionWorkers(max int) ServerOption { + return func(args *Server) { args.maxIngestionWorkers = max } +} + // RunServer registers all services and runs the server. 
func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, port string, httpServer *http.Server, opts ...ServerOption) error { ctx, cancel := context.WithCancel(ctx) @@ -86,17 +95,18 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po } defaultServer := &Server{ - Actions: repo, - bmcTimeout: oob.DefaultBMCTimeout, - maxWorkers: 1000, - workerIdleTimeout: time.Second * 30, + Actions: repo, + bmcTimeout: oob.DefaultBMCTimeout, + maxWorkers: 1000, + workerIdleTimeout: time.Second * 30, + maxIngestionWorkers: 1000, } for _, opt := range opts { opt(defaultServer) } - tr := taskrunner.NewRunner(repo, defaultServer.maxWorkers, defaultServer.workerIdleTimeout) + tr := taskrunner.NewRunner(repo, defaultServer.maxIngestionWorkers, defaultServer.maxWorkers, defaultServer.workerIdleTimeout) tr.Start(ctx) ms := rpc.MachineService{ diff --git a/grpc/taskrunner/run_test.go b/grpc/taskrunner/run_test.go deleted file mode 100644 index 06a6562..0000000 --- a/grpc/taskrunner/run_test.go +++ /dev/null @@ -1,127 +0,0 @@ -package taskrunner - -import ( - "context" - "testing" - "time" - - "github.com/philippgille/gokv" - "github.com/philippgille/gokv/freecache" - "github.com/tinkerbell/pbnj/grpc/persistence" -) - -func TestStart(t *testing.T) { - t.Skip("skipping test") - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - f := freecache.NewStore(freecache.DefaultOptions) - s := gokv.Store(f) - repo := &persistence.GoKV{Store: s, Ctx: ctx} - - r := NewRunner(repo, 100, time.Second) - r.Start(ctx) - - r.orchestrator.manager.WaitAllDone() - cancel() - t.Fatal() -} - -/* -func lenSyncMap(m *sync.Map) int { - var i int - m.Range(func(k, v interface{}) bool { - i++ - return true - }) - return i -} - -func (o *orchestrator) observe(ctx context.Context) { - writer := uilive.New() - //writer.Out = fileWriter() - - // start listening for updates and render - writer.Start() - defer writer.Stop() - workers := func() string { - s := []string{} - for _, h := range hostList { - v, ok := o.perIDQueue.Load(h) - if ok { - s = append(s, fmt.Sprintf("worker (%v) queue len: %v", h, v.(*IngestQueue).Size())) - } - } - return strings.Join(s, "\n") + "\n" - } - for { - select { - case <-ctx.Done(): - writer.Flush() - return - default: - one := fmt.Sprintf("running workers count: %v\n", o.manager.RunningCount()) - two := fmt.Sprintf("fifoQueue queue length: %v\n", o.fifoQueue.q.Size()) - three := fmt.Sprintf("goroutines: %v\n", runtime.NumGoroutine()) - four := fmt.Sprintf("worker map: %v\n", lenSyncMap(&o.workers)) - all := fmt.Sprintf("%v%v%v%v%v", one, two, three, four, workers()) - fmt.Fprint(writer, all) - <-time.After(time.Millisecond * 50) - } - } -} - -func (o *orchestrator) load() { - for i := 0; i < 1000; i++ { - i := i - go func() { - o.ingestionQueue.Enqueue(Task{ - ID: strconv.Itoa(i), - Host: hostList[rand.Intn(len(hostList))], - Description: "", - Action: func(chan string) (string, error) { - time.Sleep(time.Millisecond * 250) - return "", nil - }, - Log: logr.Discard(), - }) - }() - } -} - -var hostList = []string{ - "127.0.0.1", - "127.0.0.2", - "127.0.0.3", - "127.0.0.4", - "127.0.0.5", - "127.0.0.6", - "127.0.0.7", - "127.0.0.8", - "127.0.0.9", - "127.0.0.10", - "127.0.0.11", - "127.0.0.12", - "127.0.0.13", - "127.0.0.14", - "127.0.0.15", - "127.0.0.16", - "127.0.0.17", - "127.0.0.18", - "127.0.0.19", - "127.0.0.20", - "127.0.0.21", - "127.0.0.22", - "127.0.0.23", - "127.0.0.24", - "127.0.0.25", - "127.0.0.26", - "127.0.0.27", - 
"127.0.0.28", - "127.0.0.29", - "127.0.0.30", - "127.0.0.31", - "127.0.0.32", -} -*/ diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index 81717c5..19f3d8f 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -24,11 +24,19 @@ type Runner struct { orchestrator *orchestrator } -func NewRunner(repo repository.Actions, maxWorkers int, workerIdleTimeout time.Duration) *Runner { +// NewRunner returns a task runner that manages tasks, workers, queues, and persistence. +// +// maxIngestionWorkers is the maximum number of concurrent workers that will be allowed. +// These are the workers that handle ingesting tasks from RPC endpoints and writing them to the map of per Host ID queues. +// +// maxWorkers is the maximum number of concurrent workers that will be allowed to handle bmc tasks. +// +// workerIdleTimeout is the idle timeout for workers. If no tasks are received within the timeout, the worker will exit. +func NewRunner(repo repository.Actions, maxIngestionWorkers, maxWorkers int, workerIdleTimeout time.Duration) *Runner { o := &orchestrator{ fifoQueue: newHostQueue(), ingestionQueue: NewIngestQueue(), - ingestManager: newManager(1000), + ingestManager: newManager(maxIngestionWorkers), // perIDQueue is a map of hostID to a channel of tasks. perIDQueue: sync.Map{}, manager: newManager(maxWorkers), diff --git a/grpc/taskrunner/taskrunner_test.go b/grpc/taskrunner/taskrunner_test.go index b7c9be6..eab9352 100644 --- a/grpc/taskrunner/taskrunner_test.go +++ b/grpc/taskrunner/taskrunner_test.go @@ -26,7 +26,7 @@ func TestRoundTrip(t *testing.T) { defer s.Close() repo := &persistence.GoKV{Store: s, Ctx: ctx} logger := logr.Discard() - runner := NewRunner(repo, 100, time.Second) + runner := NewRunner(repo, 100, 100, time.Second) runner.Start(ctx) time.Sleep(time.Millisecond * 100) From 4877220b61e9067bf76e30ca234ab5bef81a03a6 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Fri, 8 Sep 2023 23:43:30 -0600 Subject: [PATCH 18/20] WIP refactor to use channels instead of the queue: This made processing significantly faster. 
Signed-off-by: Jacob Weinstock --- cmd/server.go | 14 ++-- grpc/rpc/machine.go | 4 +- grpc/server.go | 15 +++- grpc/taskrunner/queue.go | 64 +++++++++++++-- grpc/taskrunner/run.go | 144 +++++++++++++--------------------- grpc/taskrunner/taskrunner.go | 43 +++++++--- pkg/http/http.go | 14 +++- pkg/metrics/metrics.go | 42 ++++++++-- 8 files changed, 216 insertions(+), 124 deletions(-) diff --git a/cmd/server.go b/cmd/server.go index 71f42af..30b05c1 100644 --- a/cmd/server.go +++ b/cmd/server.go @@ -3,6 +3,7 @@ package cmd import ( "context" "errors" + "fmt" "os" "strings" "time" @@ -98,19 +99,18 @@ var ( httpServer := http.NewServer(metricsAddr) httpServer.WithLogger(logger) - opts := []grpcsvr.ServerOption{grpcsvr.WithBmcTimeout(bmcTimeout)} + opts := []grpcsvr.ServerOption{ + grpcsvr.WithBmcTimeout(bmcTimeout), + grpcsvr.WithMaxWorkers(maxWorkers), + grpcsvr.WithWorkerIdleTimeout(workerIdleTimeout), + } if skipRedfishVersions != "" { versions := strings.Split(skipRedfishVersions, ",") opts = append(opts, grpcsvr.WithSkipRedfishVersions(versions)) } - if maxWorkers > 0 { - opts = append(opts, grpcsvr.WithMaxWorkers(maxWorkers)) - } - if workerIdleTimeout > 0 { - opts = append(opts, grpcsvr.WithWorkerIdleTimeout(workerIdleTimeout)) - } + fmt.Println("maxWorkers", maxWorkers) if err := grpcsvr.RunServer(ctx, logger, grpcServer, port, httpServer, opts...); err != nil { logger.Error(err, "error running server") diff --git a/grpc/rpc/machine.go b/grpc/rpc/machine.go index a799d81..d09837b 100644 --- a/grpc/rpc/machine.go +++ b/grpc/rpc/machine.go @@ -64,14 +64,14 @@ func (m *MachineService) Power(ctx context.Context, in *v1.PowerRequest) (*v1.Po l := logging.ExtractLogr(ctx) taskID := xid.New().String() l = l.WithValues("taskID", taskID, "bmcIP", in.GetAuthn().GetDirectAuthn().GetHost().GetHost()) - l.Info( + /*l.Info( "start Power request", "username", in.GetAuthn().GetDirectAuthn().GetUsername(), "vendor", in.Vendor.GetName(), "powerAction", in.GetPowerAction().String(), "softTimeout", in.SoftTimeout, "OffDuration", in.OffDuration, - ) + )*/ execFunc := func(s chan string) (string, error) { mp, err := machine.NewPowerSetter( diff --git a/grpc/server.go b/grpc/server.go index 9e984da..c6d7ac7 100644 --- a/grpc/server.go +++ b/grpc/server.go @@ -2,9 +2,11 @@ package grpc import ( "context" + "fmt" "net" "os" "os/signal" + "syscall" "time" "github.com/go-logr/logr" @@ -105,6 +107,7 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po for _, opt := range opts { opt(defaultServer) } + fmt.Printf("maxWorkers: %d\n", defaultServer.maxWorkers) tr := taskrunner.NewRunner(repo, defaultServer.maxIngestionWorkers, defaultServer.maxWorkers, defaultServer.workerIdleTimeout) tr.Start(ctx) @@ -144,7 +147,7 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po reflection.Register(grpcServer) go func() { - err := httpServer.Run() + err := httpServer.Run(ctx) if err != nil { log.Error(err, "failed to serve http") os.Exit(1) //nolint:revive // removing deep-exit requires a significant refactor @@ -162,6 +165,16 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po } }() + msgChan := make(chan os.Signal) + signal.Notify(msgChan, syscall.SIGUSR1) + go func() { + for range msgChan { + fmt.Println("======") + tr.Print() + fmt.Println("======") + } + }() + go func() { <-ctx.Done() log.Info("ctx cancelled, shutting down PBnJ") diff --git a/grpc/taskrunner/queue.go b/grpc/taskrunner/queue.go index dca57ea..0f490bc 100644 --- 
a/grpc/taskrunner/queue.go +++ b/grpc/taskrunner/queue.go @@ -1,6 +1,8 @@ package taskrunner import ( + "context" + "github.com/adrianbrad/queue" "github.com/go-logr/logr" ) @@ -10,11 +12,38 @@ type IngestQueue struct { } type Task struct { - ID string `json:"id"` - Host string `json:"host"` - Description string `json:"description"` - Action func(chan string) (string, error) `json:"-"` - Log logr.Logger `json:"-"` + ID string `json:"id"` + Host string `json:"host"` + Description string `json:"description"` + Action func(chan string) (string, error) `json:"-"` + Log logr.Logger `json:"-"` +} + +func NewFIFOChannelQueue() *IngestQueue { + return &IngestQueue{ + q: queue.NewBlocking([]*Task{}), + } +} + +// Enqueue inserts the item into the queue. +func (i *IngestQueue) Enqueue2(item Task) { + i.q.OfferWait(&item) +} + +// Dequeue removes the oldest element from the queue. FIFO. +func (i *IngestQueue) Dequeue2(ctx context.Context, tChan chan Task) { + for { + select { + case <-ctx.Done(): + return + default: + item, err := i.q.Get() + if err != nil { + continue + } + tChan <- *item + } + } } func NewIngestQueue() *IngestQueue { @@ -44,7 +73,8 @@ func (i *IngestQueue) Size() int { func newHostQueue() *hostQueue { return &hostQueue{ - q: queue.NewBlocking[host]([]host{}, queue.WithCapacity(10000)), + q: queue.NewBlocking[host]([]host{}), + ch: make(chan host), } } @@ -55,7 +85,8 @@ func (h host) String() string { } type hostQueue struct { - q *queue.Blocking[host] + q *queue.Blocking[host] + ch chan host } // Enqueue inserts the item into the queue. @@ -73,6 +104,25 @@ func (i *hostQueue) Dequeue() (host, error) { return item, nil } +// Dequeue removes the oldest element from the queue. FIFO. +func (i *hostQueue) Dequeue2(ctx context.Context) <-chan host { + go func() { + for { + select { + case <-ctx.Done(): + return + default: + item, err := i.q.Get() + if err != nil { + continue + } + i.ch <- item + } + } + }() + return i.ch +} + func (i *hostQueue) Size() int { return i.q.Size() } diff --git a/grpc/taskrunner/run.go b/grpc/taskrunner/run.go index 1e46f61..c3585eb 100644 --- a/grpc/taskrunner/run.go +++ b/grpc/taskrunner/run.go @@ -2,6 +2,7 @@ package taskrunner import ( "context" + "fmt" "sync" "time" @@ -16,14 +17,32 @@ type orchestrator struct { ingestManager *concurrencyManager fifoQueue *hostQueue + fifoChan chan host ingestionQueue *IngestQueue // perIDQueue is a map of hostID to a channel of tasks. perIDQueue sync.Map + + //testing new stuff + ingestChan chan Task +} + +func (r *Runner) Print() { + one := r.orchestrator.ingestionQueue.Size() + two := r.orchestrator.fifoQueue.Size() + var three int + r.orchestrator.perIDQueue.Range(func(key, value interface{}) bool { + three++ + return true + }) + fmt.Printf("ingestion queue size: %d\n", one) + fmt.Printf("fcfs queue size: %d\n", two) + fmt.Printf("perID queue size: %d\n", three) } // ingest take a task off the ingestion queue and puts it on the perID queue // and adds the host ID to the fcfs queue. -func (o *orchestrator) ingest(ctx context.Context) { +func (r *Runner) ingest(ctx context.Context) { + //func (o *orchestrator) ingest(ctx context.Context) { // dequeue from ingestion queue // enqueue to perID queue // enqueue to fcfs queue @@ -31,34 +50,24 @@ func (o *orchestrator) ingest(ctx context.Context) { select { case <-ctx.Done(): return - default: - o.ingestManager.Wait() - go func() { - defer o.ingestManager.Done() - // 1. dequeue from ingestion queue - // 2. enqueue to perID queue - // 3. enqueue to fcfs queue - // --- - // 1. 
dequeue from ingestion queue - t, err := o.ingestionQueue.Dequeue() - if err != nil { - return - } - metrics.IngestionQueue.Dec() - - // 2. enqueue to perID queue - que := NewIngestQueue() - q, _ := o.perIDQueue.LoadOrStore(t.Host, que) - v, ok := q.(*IngestQueue) - if !ok { - return - } - v.Enqueue(t) - metrics.PerIDQueue.WithLabelValues(t.Host).Inc() - - // 3. enqueue to fcfs queue - o.fifoQueue.Enqueue(host(t.Host)) - }() + case t := <-r.orchestrator.ingestChan: + + // 2. enqueue to perID queue + ch := make(chan Task, 5000) + q, exists := r.orchestrator.perIDQueue.LoadOrStore(t.Host, ch) + v, ok := q.(chan Task) + if !ok { + fmt.Println("bad type: IngestQueue") + return + } + if exists { + close(ch) + } + v <- t + metrics.PerIDQueue.WithLabelValues(t.Host).Inc() + metrics.IngestionQueue.Dec() + metrics.NumPerIDEnqueued.Inc() + r.orchestrator.workers.Store(t.Host, false) } } } @@ -71,91 +80,46 @@ func (r *Runner) orchestrate(ctx context.Context) { // 1. dequeue from fcfs queue // 2. start workers for { - select { - case <-ctx.Done(): - return - default: - // 1. dequeue from fcfs queue - // 2. if worker id exists in o.workers, then move on - // 2a. if len(o.workers) < o.maxWorkers then create worker and move on, else recursion. - h, err := r.orchestrator.fifoQueue.Dequeue() - if err != nil { - continue - } - - // check queue length for perID queue, if 0, then continue - if elem, ok := r.orchestrator.perIDQueue.Load(h.String()); ok { - v, ok := elem.(*IngestQueue) - if !ok { - continue - } - if v.Size() == 0 { - continue - } - } - + time.Sleep(time.Second * 2) + // r.orchestrator.perIDQueue.Range(func(key, value interface{}) bool { + r.orchestrator.workers.Range(func(key, value interface{}) bool { // if worker id exists in o.workers, then move on because the worker is already running. 
- _, ok := r.orchestrator.workers.Load(h) - if ok { - continue + if value.(bool) { + return true } // wait for a worker to become available r.orchestrator.manager.Wait() - r.orchestrator.workers.Store(h, true) - go r.worker(ctx, h.String()) - } + r.orchestrator.workers.Store(key.(string), true) + v, _ := r.orchestrator.perIDQueue.Load(key.(string)) + go r.worker(ctx, key.(string), v.(chan Task)) + return true + }) } } -func (r *Runner) worker(ctx context.Context, hostID string) { +func (r *Runner) worker(ctx context.Context, hostID string, q chan Task) { defer r.orchestrator.manager.Done() defer func() { r.orchestrator.workers.Range(func(key, value interface{}) bool { - if key.(host) == host(hostID) { //nolint:forcetypeassert // good - r.orchestrator.workers.Delete(key) + if key.(string) == hostID { //nolint:forcetypeassert // good + r.orchestrator.workers.Delete(key.(string)) return true //nolint:revive // this is needed to satisfy the func parameter } return true //nolint:revive // this is needed to satisfy the func parameter }) - }() - elem, ok := r.orchestrator.perIDQueue.Load(hostID) - if !ok { - return - } - - tChan := make(chan Task) - cctx, stop := context.WithCancel(ctx) - defer stop() - - go func(ch chan Task) { - defer close(ch) - for { - select { - case <-cctx.Done(): - return - default: - task, err := elem.(*IngestQueue).Dequeue() - if err != nil { - continue - } - tChan <- task - metrics.PerIDQueue.WithLabelValues(hostID).Dec() - } - } - }(tChan) + }() for { select { case <-ctx.Done(): - stop() return - case t := <-tChan: + case t := <-q: r.process(ctx, t.Log, t.Description, t.ID, t.Action) + metrics.PerIDQueue.WithLabelValues(hostID).Dec() case <-time.After(r.orchestrator.workerIdleTimeout): - stop() return } } diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index 19f3d8f..bc88a4d 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -2,6 +2,7 @@ package taskrunner import ( "context" + "fmt" "net" "net/url" "sync" @@ -33,14 +34,18 @@ type Runner struct { // // workerIdleTimeout is the idle timeout for workers. If no tasks are received within the timeout, the worker will exit. func NewRunner(repo repository.Actions, maxIngestionWorkers, maxWorkers int, workerIdleTimeout time.Duration) *Runner { + fmt.Println("NewRunner", maxIngestionWorkers, maxWorkers, workerIdleTimeout) o := &orchestrator{ + workers: sync.Map{}, fifoQueue: newHostQueue(), + fifoChan: make(chan host, 5000), ingestionQueue: NewIngestQueue(), ingestManager: newManager(maxIngestionWorkers), // perIDQueue is a map of hostID to a channel of tasks. perIDQueue: sync.Map{}, manager: newManager(maxWorkers), workerIdleTimeout: workerIdleTimeout, + ingestChan: make(chan Task, 5000), } return &Runner{ @@ -60,22 +65,41 @@ func (r *Runner) TotalWorkers() int { } func (r *Runner) Start(ctx context.Context) { - go r.orchestrator.ingest(ctx) + go func() { + ticker := time.NewTicker(3 * time.Second) + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + metrics.NumWorkers.Set(float64(r.orchestrator.manager.RunningCount())) + var size int + r.orchestrator.workers.Range(func(key, value interface{}) bool { + size++ + return true + }) + metrics.WorkerMap.Set(float64(size)) + } + } + }() + go r.ingest(ctx) go r.orchestrate(ctx) } // Execute a task, update repository with status. 
-func (r *Runner) Execute(_ context.Context, l logr.Logger, description, taskID, host string, action func(chan string) (string, error)) { +func (r *Runner) Execute(_ context.Context, l logr.Logger, description, taskID, hostID string, action func(chan string) (string, error)) { i := Task{ ID: taskID, - Host: host, + Host: hostID, Description: description, Action: action, Log: l, } - r.orchestrator.ingestionQueue.Enqueue(i) + //r.orchestrator.ingestionQueue.Enqueue(i) + r.orchestrator.ingestChan <- i metrics.IngestionQueue.Inc() + metrics.Ingested.Inc() } func (r *Runner) updateMessages(ctx context.Context, taskID string, ch chan string) { @@ -106,7 +130,8 @@ func (r *Runner) process(ctx context.Context, logger logr.Logger, description, t r.active.Add(-1) }() - metrics.TasksTotal.Inc() + defer metrics.TasksTotal.Inc() + defer metrics.TotalGauge.Inc() metrics.TasksActive.Inc() defer metrics.TasksActive.Dec() @@ -130,7 +155,7 @@ func (r *Runner) process(ctx context.Context, logger logr.Logger, description, t cctx, done := context.WithCancel(ctx) defer done() go r.updateMessages(cctx, taskID, messagesChan) - logger.Info("worker start") + //logger.Info("worker start") resultRecord := repository.Record{ State: "complete", @@ -154,7 +179,7 @@ func (r *Runner) process(ctx context.Context, logger logr.Logger, description, t if errors.As(err, &foundErr) { resultRecord.Error = foundErr.StructuredError() } - logger.Error(err, "task completed with an error") + //logger.Error(err, "task completed with an error") } record, err := r.Repository.Get(taskID) if err != nil { @@ -167,10 +192,10 @@ func (r *Runner) process(ctx context.Context, logger logr.Logger, description, t record.Error = resultRecord.Error if err := r.Repository.Update(taskID, record); err != nil { - logger.Error(err, "failed to update record") + //logger.Error(err, "failed to update record") } - logger.Info("worker complete", "complete", true) + //logger.Info("worker complete", "complete", true) } // Status returns the status record of a task. 
diff --git a/pkg/http/http.go b/pkg/http/http.go index 54007cf..6ce9167 100644 --- a/pkg/http/http.go +++ b/pkg/http/http.go @@ -1,6 +1,7 @@ package http import ( + "context" "net/http" "github.com/go-logr/logr" @@ -33,8 +34,17 @@ func (h *Server) init() { h.mux.HandleFunc("/_/live", h.handleLive) } -func (h *Server) Run() error { - return http.ListenAndServe(h.address, h.mux) //nolint:gosec // TODO: add handle timeouts +func (h *Server) Run(ctx context.Context) error { + svr := &http.Server{Addr: h.address, Handler: h.mux} + svr.ListenAndServe() + + go func() { + <-ctx.Done() + svr.Shutdown(ctx) + }() + + return svr.ListenAndServe() + // return http.ListenAndServe(h.address, h.mux) //nolint:gosec // TODO: add handle timeouts } func NewServer(addr string) *Server { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 64e6145..76aa4e8 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -6,11 +6,17 @@ import ( ) var ( - ActionDuration prometheus.ObserverVec - TasksTotal prometheus.Counter - TasksActive prometheus.Gauge - PerIDQueue prometheus.GaugeVec - IngestionQueue prometheus.Gauge + ActionDuration prometheus.ObserverVec + TasksTotal prometheus.Counter + TotalGauge prometheus.Gauge + TasksActive prometheus.Gauge + PerIDQueue prometheus.GaugeVec + IngestionQueue prometheus.Gauge + Ingested prometheus.Gauge + FIFOQueue prometheus.Gauge + NumWorkers prometheus.Gauge + NumPerIDEnqueued prometheus.Gauge + WorkerMap prometheus.Gauge ) func init() { @@ -31,7 +37,7 @@ func init() { initObserverLabels(ActionDuration, labelValues) TasksTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "pbnj_tasks_total", + Name: "pbnj_tasks_processes", Help: "Total number of tasks executed.", }) TasksActive = promauto.NewGauge(prometheus.GaugeOpts{ @@ -46,6 +52,30 @@ func init() { Name: "pbnj_ingestion_queue", Help: "Number of tasks in ingestion queue.", }) + Ingested = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "pbnj_ingested", + Help: "Number of tasks ingested.", + }) + FIFOQueue = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "pbnj_fifo_queue", + Help: "Number of tasks in FIFO queue.", + }) + TotalGauge = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "pbnj_total", + Help: "Total number of tasks.", + }) + NumWorkers = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "pbnj_num_workers", + Help: "Number of workers.", + }) + NumPerIDEnqueued = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "pbnj_num_per_id_enqueued", + Help: "Number of perID enqueued.", + }) + WorkerMap = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "pbnj_worker_map_size", + Help: "Worker map size.", + }) } func initObserverLabels(m prometheus.ObserverVec, l []prometheus.Labels) { From e9d0f2fd171403c35817b93f63faa038724dfd64 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Sat, 9 Sep 2023 19:27:32 -0600 Subject: [PATCH 19/20] Clean up unused code: Update the metric server to handle slow loris attacks. 
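For context: a slow loris client holds a connection open by trickling request bytes, and the package-level http.ListenAndServe sets no timeouts at all, so such connections accumulate. The standard mitigation is an explicit http.Server with bounded timeouts, roughly as below; the address, handler, and timeout values are placeholders, not the exact ones used by PBnJ's metrics server.

package main

import (
	"net/http"
	"time"
)

func main() {
	mux := http.NewServeMux()
	// stand-in handler; the real server mounts /metrics, healthcheck, etc.
	mux.HandleFunc("/_/live", func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
	})

	srv := &http.Server{
		Addr:    ":8080", // placeholder address
		Handler: mux,
		// Bounding how long a client may take to send its headers (and
		// the request overall) is what defeats slow loris connections.
		ReadHeaderTimeout: 5 * time.Second,
		ReadTimeout:       10 * time.Second,
		WriteTimeout:      10 * time.Second,
		IdleTimeout:       60 * time.Second,
	}
	_ = srv.ListenAndServe()
}

ReadHeaderTimeout is the key field for slow loris specifically; the other fields bound the rest of the request/response lifecycle.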
Signed-off-by: Jacob Weinstock --- cmd/server.go | 3 - go.mod | 1 - go.sum | 2 - grpc/rpc/bmc_test.go | 2 +- grpc/rpc/machine.go | 8 +- grpc/rpc/task_test.go | 3 +- grpc/server.go | 33 ++------ grpc/taskrunner/queue.go | 128 ----------------------------- grpc/taskrunner/run.go | 44 +++------- grpc/taskrunner/taskrunner.go | 28 +++---- grpc/taskrunner/taskrunner_test.go | 4 +- pkg/http/http.go | 14 +++- 12 files changed, 47 insertions(+), 223 deletions(-) delete mode 100644 grpc/taskrunner/queue.go diff --git a/cmd/server.go b/cmd/server.go index 30b05c1..fb0f2f4 100644 --- a/cmd/server.go +++ b/cmd/server.go @@ -3,7 +3,6 @@ package cmd import ( "context" "errors" - "fmt" "os" "strings" "time" @@ -110,8 +109,6 @@ var ( opts = append(opts, grpcsvr.WithSkipRedfishVersions(versions)) } - fmt.Println("maxWorkers", maxWorkers) - if err := grpcsvr.RunServer(ctx, logger, grpcServer, port, httpServer, opts...); err != nil { logger.Error(err, "error running server") os.Exit(1) diff --git a/go.mod b/go.mod index 0e1d67e..c947741 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,6 @@ module github.com/tinkerbell/pbnj go 1.20 require ( - github.com/adrianbrad/queue v1.2.1 github.com/bmc-toolbox/bmclib v0.5.7 github.com/bmc-toolbox/bmclib/v2 v2.0.1-0.20230515164712-2714c7479477 github.com/cristalhq/jwt/v3 v3.1.0 diff --git a/go.sum b/go.sum index 26c555c..a39eb31 100644 --- a/go.sum +++ b/go.sum @@ -46,8 +46,6 @@ github.com/VictorLowther/simplexml v0.0.0-20180716164440-0bff93621230 h1:t95Grn2 github.com/VictorLowther/simplexml v0.0.0-20180716164440-0bff93621230/go.mod h1:t2EzW1qybnPDQ3LR/GgeF0GOzHUXT5IVMLP2gkW1cmc= github.com/VictorLowther/soap v0.0.0-20150314151524-8e36fca84b22 h1:a0MBqYm44o0NcthLKCljZHe1mxlN6oahCQHHThnSwB4= github.com/VictorLowther/soap v0.0.0-20150314151524-8e36fca84b22/go.mod h1:/B7V22rcz4860iDqstGvia/2+IYWXf3/JdQCVd/1D2A= -github.com/adrianbrad/queue v1.2.1 h1:CEVsjFQyuR0s5Hty/HJGWBZHsJ3KMmii0kEgLeam/mk= -github.com/adrianbrad/queue v1.2.1/go.mod h1:wYiPC/3MPbyT45QHLrPR4zcqJWPePubM1oEP/xTwhUs= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= diff --git a/grpc/rpc/bmc_test.go b/grpc/rpc/bmc_test.go index 82f03a2..b1c3af9 100644 --- a/grpc/rpc/bmc_test.go +++ b/grpc/rpc/bmc_test.go @@ -42,7 +42,7 @@ func setup() { Ctx: ctx, } - tr = taskrunner.NewRunner(repo, 100, 100, time.Second) + tr = taskrunner.NewRunner(repo, 100, time.Second) tr.Start(ctx) bmcService = BmcService{ TaskRunner: tr, diff --git a/grpc/rpc/machine.go b/grpc/rpc/machine.go index d09837b..69c1afa 100644 --- a/grpc/rpc/machine.go +++ b/grpc/rpc/machine.go @@ -54,7 +54,7 @@ func (m *MachineService) BootDevice(ctx context.Context, in *v1.DeviceRequest) ( defer cancel() return mbd.BootDeviceSet(taskCtx, in.BootDevice.String(), in.Persistent, in.EfiBoot) } - go m.TaskRunner.Execute(ctx, l, "setting boot device", taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) + m.TaskRunner.Execute(ctx, l, "setting boot device", taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) return &v1.DeviceResponse{TaskId: taskID}, nil } @@ -64,14 +64,14 @@ func (m *MachineService) Power(ctx context.Context, in *v1.PowerRequest) (*v1.Po l := logging.ExtractLogr(ctx) taskID := xid.New().String() l = l.WithValues("taskID", taskID, "bmcIP", in.GetAuthn().GetDirectAuthn().GetHost().GetHost()) - /*l.Info( + 
l.Info( "start Power request", "username", in.GetAuthn().GetDirectAuthn().GetUsername(), "vendor", in.Vendor.GetName(), "powerAction", in.GetPowerAction().String(), "softTimeout", in.SoftTimeout, "OffDuration", in.OffDuration, - )*/ + ) execFunc := func(s chan string) (string, error) { mp, err := machine.NewPowerSetter( @@ -89,7 +89,7 @@ func (m *MachineService) Power(ctx context.Context, in *v1.PowerRequest) (*v1.Po defer cancel() return mp.PowerSet(taskCtx, in.PowerAction.String()) } - go m.TaskRunner.Execute(ctx, l, "power action: "+in.GetPowerAction().String(), taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) + m.TaskRunner.Execute(ctx, l, "power action: "+in.GetPowerAction().String(), taskID, in.GetAuthn().GetDirectAuthn().GetHost().GetHost(), execFunc) return &v1.PowerResponse{TaskId: taskID}, nil } diff --git a/grpc/rpc/task_test.go b/grpc/rpc/task_test.go index 46f11ed..f9b4faa 100644 --- a/grpc/rpc/task_test.go +++ b/grpc/rpc/task_test.go @@ -34,9 +34,10 @@ func TestTaskFound(t *testing.T) { t.Fatalf("expected no error, got: %v", err) } - time.Sleep(time.Second) + time.Sleep(time.Second * 3) taskReq := &v1.StatusRequest{TaskId: resp.TaskId} taskResp, _ := taskService.Status(context.Background(), taskReq) + t.Logf("Got response: %+v", taskResp) if taskResp.Id != resp.TaskId { t.Fatalf("got: %+v", taskResp) } diff --git a/grpc/server.go b/grpc/server.go index c6d7ac7..82fc72d 100644 --- a/grpc/server.go +++ b/grpc/server.go @@ -2,11 +2,9 @@ package grpc import ( "context" - "fmt" "net" "os" "os/signal" - "syscall" "time" "github.com/go-logr/logr" @@ -42,9 +40,6 @@ type Server struct { maxWorkers int // workerIdleTimeout is the idle timeout for workers. If no tasks are received within the timeout, the worker will exit. workerIdleTimeout time.Duration - // maxIngestionWorkers is the maximum number of concurrent workers that will be allowed. - // These are the workers that handle ingesting tasks from RPC endpoints and writing them to the map of per Host ID queues. - maxIngestionWorkers int } // ServerOption for setting optional values. @@ -77,12 +72,6 @@ func WithWorkerIdleTimeout(t time.Duration) ServerOption { return func(args *Server) { args.workerIdleTimeout = t } } -// WithMaxIngestionWorkers sets the max number of concurrent workers that will be allowed. -// These are the workers that handle ingesting tasks from RPC endpoints and writing them to the map of per Host ID queues. -func WithMaxIngestionWorkers(max int) ServerOption { - return func(args *Server) { args.maxIngestionWorkers = max } -} - // RunServer registers all services and runs the server. 
func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, port string, httpServer *http.Server, opts ...ServerOption) error { ctx, cancel := context.WithCancel(ctx) @@ -97,19 +86,17 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po } defaultServer := &Server{ - Actions: repo, - bmcTimeout: oob.DefaultBMCTimeout, - maxWorkers: 1000, - workerIdleTimeout: time.Second * 30, - maxIngestionWorkers: 1000, + Actions: repo, + bmcTimeout: oob.DefaultBMCTimeout, + maxWorkers: 1000, + workerIdleTimeout: time.Second * 30, } for _, opt := range opts { opt(defaultServer) } - fmt.Printf("maxWorkers: %d\n", defaultServer.maxWorkers) - tr := taskrunner.NewRunner(repo, defaultServer.maxIngestionWorkers, defaultServer.maxWorkers, defaultServer.workerIdleTimeout) + tr := taskrunner.NewRunner(repo, defaultServer.maxWorkers, defaultServer.workerIdleTimeout) tr.Start(ctx) ms := rpc.MachineService{ @@ -165,16 +152,6 @@ func RunServer(ctx context.Context, log logr.Logger, grpcServer *grpc.Server, po } }() - msgChan := make(chan os.Signal) - signal.Notify(msgChan, syscall.SIGUSR1) - go func() { - for range msgChan { - fmt.Println("======") - tr.Print() - fmt.Println("======") - } - }() - go func() { <-ctx.Done() log.Info("ctx cancelled, shutting down PBnJ") diff --git a/grpc/taskrunner/queue.go b/grpc/taskrunner/queue.go deleted file mode 100644 index 0f490bc..0000000 --- a/grpc/taskrunner/queue.go +++ /dev/null @@ -1,128 +0,0 @@ -package taskrunner - -import ( - "context" - - "github.com/adrianbrad/queue" - "github.com/go-logr/logr" -) - -type IngestQueue struct { - q *queue.Blocking[*Task] -} - -type Task struct { - ID string `json:"id"` - Host string `json:"host"` - Description string `json:"description"` - Action func(chan string) (string, error) `json:"-"` - Log logr.Logger `json:"-"` -} - -func NewFIFOChannelQueue() *IngestQueue { - return &IngestQueue{ - q: queue.NewBlocking([]*Task{}), - } -} - -// Enqueue inserts the item into the queue. -func (i *IngestQueue) Enqueue2(item Task) { - i.q.OfferWait(&item) -} - -// Dequeue removes the oldest element from the queue. FIFO. -func (i *IngestQueue) Dequeue2(ctx context.Context, tChan chan Task) { - for { - select { - case <-ctx.Done(): - return - default: - item, err := i.q.Get() - if err != nil { - continue - } - tChan <- *item - } - } -} - -func NewIngestQueue() *IngestQueue { - return &IngestQueue{ - q: queue.NewBlocking([]*Task{}), - } -} - -// Enqueue inserts the item into the queue. -func (i *IngestQueue) Enqueue(item Task) { - i.q.OfferWait(&item) -} - -// Dequeue removes the oldest element from the queue. FIFO. -func (i *IngestQueue) Dequeue() (Task, error) { - item, err := i.q.Get() - if err != nil { - return Task{}, err - } - - return *item, nil -} - -func (i *IngestQueue) Size() int { - return i.q.Size() -} - -func newHostQueue() *hostQueue { - return &hostQueue{ - q: queue.NewBlocking[host]([]host{}), - ch: make(chan host), - } -} - -type host string - -func (h host) String() string { - return string(h) -} - -type hostQueue struct { - q *queue.Blocking[host] - ch chan host -} - -// Enqueue inserts the item into the queue. -func (i *hostQueue) Enqueue(item host) { - i.q.OfferWait(item) -} - -// Dequeue removes the oldest element from the queue. FIFO. -func (i *hostQueue) Dequeue() (host, error) { - item, err := i.q.Get() - if err != nil { - return "", err - } - - return item, nil -} - -// Dequeue removes the oldest element from the queue. FIFO. 
-func (i *hostQueue) Dequeue2(ctx context.Context) <-chan host { - go func() { - for { - select { - case <-ctx.Done(): - return - default: - item, err := i.q.Get() - if err != nil { - continue - } - i.ch <- item - } - } - }() - return i.ch -} - -func (i *hostQueue) Size() int { - return i.q.Size() -} diff --git a/grpc/taskrunner/run.go b/grpc/taskrunner/run.go index c3585eb..2dc8d7e 100644 --- a/grpc/taskrunner/run.go +++ b/grpc/taskrunner/run.go @@ -13,36 +13,15 @@ type orchestrator struct { workers sync.Map manager *concurrencyManager workerIdleTimeout time.Duration - - ingestManager *concurrencyManager - - fifoQueue *hostQueue - fifoChan chan host - ingestionQueue *IngestQueue + fifoChan chan string // perIDQueue is a map of hostID to a channel of tasks. perIDQueue sync.Map - - //testing new stuff ingestChan chan Task } -func (r *Runner) Print() { - one := r.orchestrator.ingestionQueue.Size() - two := r.orchestrator.fifoQueue.Size() - var three int - r.orchestrator.perIDQueue.Range(func(key, value interface{}) bool { - three++ - return true - }) - fmt.Printf("ingestion queue size: %d\n", one) - fmt.Printf("fcfs queue size: %d\n", two) - fmt.Printf("perID queue size: %d\n", three) -} - // ingest takes a task off the ingestion queue and puts it on the perID queue // and adds the host ID to the fcfs queue. func (r *Runner) ingest(ctx context.Context) { - //func (o *orchestrator) ingest(ctx context.Context) { // dequeue from ingestion queue // enqueue to perID queue // enqueue to fcfs queue @@ -81,7 +60,6 @@ func (r *Runner) orchestrate(ctx context.Context) { // 2. start workers for { time.Sleep(time.Second * 2) - // r.orchestrator.perIDQueue.Range(func(key, value interface{}) bool { r.orchestrator.workers.Range(func(key, value interface{}) bool { // if worker id exists in o.workers, then move on because the worker is already running. if value.(bool) { @@ -92,7 +70,10 @@ func (r *Runner) orchestrate(ctx context.Context) { return true } // wait for a worker to become available r.orchestrator.manager.Wait() r.orchestrator.workers.Store(key.(string), true) - v, _ := r.orchestrator.perIDQueue.Load(key.(string)) + v, found := r.orchestrator.perIDQueue.Load(key.(string)) + if !found { + return false + } go r.worker(ctx, key.(string), v.(chan Task)) return true }) } } func (r *Runner) worker(ctx context.Context, hostID string, q chan Task) { defer r.orchestrator.manager.Done() - defer func() { - r.orchestrator.workers.Range(func(key, value interface{}) bool { - if key.(string) == hostID { //nolint:forcetypeassert // good - r.orchestrator.workers.Delete(key.(string)) - return true //nolint:revive // this is needed to satisfy the func parameter - } - return true //nolint:revive // this is needed to satisfy the func parameter - }) - - }() + defer r.orchestrator.workers.Delete(hostID) for { select { case <-ctx.Done(): + // TODO: check queue length before returning maybe? + // For 175000 tasks, I found there would occasionally be 1 or 2 that didn't get processed. + // They still seemed to be in the queue/chan. return case t := <-q: r.process(ctx, t.Log, t.Description, t.ID, t.Action) metrics.PerIDQueue.WithLabelValues(hostID).Dec() case <-time.After(r.orchestrator.workerIdleTimeout): + // TODO: check queue length before returning maybe? return } }
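The two TODOs in the worker above point at a real window: a task can land on the per-host channel after the idle timer fires but before the worker returns, and it then sits unprocessed until a new worker is started for that host. One way to close the window is to drain the channel with a non-blocking loop before exiting; a self-contained sketch, not the committed fix:

    package main

    import (
    	"fmt"
    	"time"
    )

    // drainThenExit mirrors the worker loop's shape: process until idle, then
    // empty the channel with a non-blocking default case before returning.
    func drainThenExit(q chan string, idle time.Duration) {
    	for {
    		select {
    		case item := <-q:
    			fmt.Println("processing", item)
    		case <-time.After(idle):
    			for {
    				select {
    				case item := <-q:
    					fmt.Println("draining", item)
    				default:
    					return
    				}
    			}
    		}
    	}
    }

    func main() {
    	q := make(chan string, 10)
    	q <- "task-1"
    	q <- "task-2"
    	drainThenExit(q, 100*time.Millisecond)
    }

The same drain could run in the ctx.Done() case, at the cost of delaying shutdown until the channel is empty.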
diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index bc88a4d..c15269e 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -2,7 +2,6 @@ package taskrunner import ( "context" - "fmt" "net" "net/url" "sync" @@ -25,27 +24,28 @@ type Runner struct { orchestrator *orchestrator } +type Task struct { + ID string `json:"id"` + Host string `json:"host"` + Description string `json:"description"` + Action func(chan string) (string, error) `json:"-"` + Log logr.Logger `json:"-"` +} + // NewRunner returns a task runner that manages tasks, workers, queues, and persistence. // -// maxIngestionWorkers is the maximum number of concurrent workers that will be allowed. -// These are the workers that handle ingesting tasks from RPC endpoints and writing them to the map of per Host ID queues. -// // maxWorkers is the maximum number of concurrent workers that will be allowed to handle bmc tasks. // // workerIdleTimeout is the idle timeout for workers. If no tasks are received within the timeout, the worker will exit. -func NewRunner(repo repository.Actions, maxIngestionWorkers, maxWorkers int, workerIdleTimeout time.Duration) *Runner { - fmt.Println("NewRunner", maxIngestionWorkers, maxWorkers, workerIdleTimeout) +func NewRunner(repo repository.Actions, maxWorkers int, workerIdleTimeout time.Duration) *Runner { o := &orchestrator{ - workers: sync.Map{}, - fifoQueue: newHostQueue(), - fifoChan: make(chan host, 5000), - ingestionQueue: NewIngestQueue(), - ingestManager: newManager(maxIngestionWorkers), + workers: sync.Map{}, + fifoChan: make(chan string, 10000), // perIDQueue is a map of hostID to a channel of tasks. perIDQueue: sync.Map{}, manager: newManager(maxWorkers), workerIdleTimeout: workerIdleTimeout, - ingestChan: make(chan Task, 5000), + ingestChan: make(chan Task, 10000), } return &Runner{ @@ -96,7 +96,6 @@ func (r *Runner) Execute(_ context.Context, l logr.Logger, description, taskID, Log: l, } - //r.orchestrator.ingestionQueue.Enqueue(i) r.orchestrator.ingestChan <- i metrics.IngestionQueue.Inc() metrics.Ingested.Inc() @@ -155,7 +154,6 @@ func (r *Runner) process(ctx context.Context, logger logr.Logger, description, t cctx, done := context.WithCancel(ctx) defer done() go r.updateMessages(cctx, taskID, messagesChan) - //logger.Info("worker start") resultRecord := repository.Record{ State: "complete", @@ -179,7 +177,6 @@ func (r *Runner) process(ctx context.Context, logger logr.Logger, description, t if errors.As(err, &foundErr) { resultRecord.Error = foundErr.StructuredError() } - //logger.Error(err, "task completed with an error") } record, err := r.Repository.Get(taskID) if err != nil { @@ -192,10 +189,8 @@ func (r *Runner) process(ctx context.Context, logger logr.Logger, description, t record.Error = resultRecord.Error if err := r.Repository.Update(taskID, record); err != nil { - //logger.Error(err, "failed to update record") + logger.Error(err, "failed to update record") } - - //logger.Info("worker complete", "complete", true) } // Status returns the status record of a task.
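Execute is now fire-and-forget: it enqueues onto ingestChan and returns, so completion is visible only through the repository record whose State process() sets to "complete". A polling sketch of that contract, assuming the Status shape used by the tests below (the package name, import paths, and 200ms interval are illustrative; real clients poll the gRPC Task service rather than the runner):

    package client

    import (
    	"context"
    	"time"

    	"github.com/tinkerbell/pbnj/grpc/taskrunner"
    	"github.com/tinkerbell/pbnj/pkg/repository"
    )

    // waitForTask polls Status until the record reaches the "complete" state
    // written by process, or the context ends.
    func waitForTask(ctx context.Context, r *taskrunner.Runner, taskID string) (repository.Record, error) {
    	for {
    		record, err := r.Status(ctx, taskID)
    		if err != nil {
    			return repository.Record{}, err
    		}
    		if record.State == "complete" {
    			return record, nil
    		}
    		select {
    		case <-ctx.Done():
    			return record, ctx.Err()
    		case <-time.After(200 * time.Millisecond):
    		}
    	}
    }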
diff --git a/grpc/taskrunner/taskrunner_test.go b/grpc/taskrunner/taskrunner_test.go index eab9352..e299db8 100644 --- a/grpc/taskrunner/taskrunner_test.go +++ b/grpc/taskrunner/taskrunner_test.go @@ -26,7 +26,7 @@ func TestRoundTrip(t *testing.T) { defer s.Close() repo := &persistence.GoKV{Store: s, Ctx: ctx} logger := logr.Discard() - runner := NewRunner(repo, 100, 100, time.Second) + runner := NewRunner(repo, 100, time.Second) runner.Start(ctx) time.Sleep(time.Millisecond * 100) @@ -39,7 +39,7 @@ func TestRoundTrip(t *testing.T) { }) // must be min of 3 because we sleep 2 seconds in worker function to allow final status messages to be written - time.Sleep(500 * time.Millisecond) + time.Sleep(time.Second * 2) record, err := runner.Status(ctx, taskID) if err != nil { t.Fatal(err) diff --git a/pkg/http/http.go b/pkg/http/http.go index 6ce9167..b05158e 100644 --- a/pkg/http/http.go +++ b/pkg/http/http.go @@ -3,6 +3,7 @@ package http import ( "context" "net/http" + "time" "github.com/go-logr/logr" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -35,15 +36,21 @@ func (h *Server) init() { } func (h *Server) Run(ctx context.Context) error { - svr := &http.Server{Addr: h.address, Handler: h.mux} + svr := &http.Server{ + Addr: h.address, + Handler: h.mux, + // Mitigate Slowloris attacks. 20 seconds is based on Apache's recommended 20-40 + // second range. PBnJ doesn't really have many headers so 20s should be plenty of time. + // https://en.wikipedia.org/wiki/Slowloris_(computer_security) + ReadHeaderTimeout: 20 * time.Second, + } go func() { <-ctx.Done() - svr.Shutdown(ctx) + _ = svr.Shutdown(ctx) }() return svr.ListenAndServe() - // return http.ListenAndServe(h.address, h.mux) //nolint:gosec // TODO: add handle timeouts } func NewServer(addr string) *Server { From 73de1b207bebcde457331ce6770f13f2c8541c7c Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Sat, 9 Sep 2023 22:55:34 -0600 Subject: [PATCH 20/20] Make task channel smaller: It was too big and made memory consumption way too high. Also, the typical use case is a high number of hosts with a low number of tasks per host requested close together. Signed-off-by: Jacob Weinstock --- grpc/taskrunner/manager.go | 2 ++ grpc/taskrunner/run.go | 20 ++++++++++---------- grpc/taskrunner/taskrunner.go | 4 ++-- pkg/http/http.go | 6 +++++- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/grpc/taskrunner/manager.go b/grpc/taskrunner/manager.go index 77e725c..ba6df9b 100644 --- a/grpc/taskrunner/manager.go +++ b/grpc/taskrunner/manager.go @@ -1,5 +1,7 @@ package taskrunner +// copied and modified from https://github.com/zenthangplus/goccm + import "sync/atomic" type concurrencyManager struct {
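For intuition, the goccm-derived concurrencyManager referenced above behaves like a counting semaphore: Wait blocks while maxWorkers goroutines hold slots, and Done releases one. A channel-based equivalent (a sketch only; the vendored code tracks slots with sync/atomic):

    package main

    import "fmt"

    // limiter has the same Wait/Done shape as concurrencyManager: Wait blocks
    // until one of max slots is free, Done returns a slot.
    type limiter struct{ slots chan struct{} }

    func newLimiter(max int) *limiter { return &limiter{slots: make(chan struct{}, max)} }

    func (l *limiter) Wait() { l.slots <- struct{}{} }
    func (l *limiter) Done() { <-l.slots }

    func main() {
    	l := newLimiter(2) // at most 2 workers run at once
    	for i := 0; i < 5; i++ {
    		l.Wait()
    		go func(n int) {
    			defer l.Done()
    			fmt.Println("worker", n)
    		}(i)
    	}
    	// Reacquire every slot so main only exits after all workers call Done.
    	for i := 0; i < 2; i++ {
    		l.Wait()
    	}
    }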
diff --git a/grpc/taskrunner/run.go b/grpc/taskrunner/run.go index 2dc8d7e..38abded 100644 --- a/grpc/taskrunner/run.go +++ b/grpc/taskrunner/run.go @@ -13,9 +13,9 @@ type orchestrator struct { workers sync.Map manager *concurrencyManager workerIdleTimeout time.Duration fifoChan chan string - // perIDQueue is a map of hostID to a channel of tasks. - perIDQueue sync.Map - ingestChan chan Task + // perHostChan is a map of hostID to a channel of tasks. + perHostChan sync.Map + ingestChan chan Task } // ingest takes a task off the ingestion queue and puts it on the perID queue // and adds the host ID to the fcfs queue. func (r *Runner) ingest(ctx context.Context) { // dequeue from ingestion queue // enqueue to perID queue // enqueue to fcfs queue @@ -32,8 +32,8 @@ func (r *Runner) ingest(ctx context.Context) { case t := <-r.orchestrator.ingestChan: // 2. enqueue to perID queue - ch := make(chan Task, 5000) + ch := make(chan Task, 10) - q, exists := r.orchestrator.perIDQueue.LoadOrStore(t.Host, ch) + q, exists := r.orchestrator.perHostChan.LoadOrStore(t.Host, ch) v, ok := q.(chan Task) if !ok { fmt.Println("bad type: IngestQueue") @@ -59,22 +59,22 @@ func (r *Runner) orchestrate(ctx context.Context) { // 1. dequeue from fcfs queue // 2. start workers for { - time.Sleep(time.Second * 2) + // time.Sleep(time.Second * 3) - this potentially helps with ingestion r.orchestrator.workers.Range(func(key, value interface{}) bool { // if worker id exists in o.workers, then move on because the worker is already running. - if value.(bool) { + if value.(bool) { //nolint: forcetypeassert // values are always certain. return true } // wait for a worker to become available r.orchestrator.manager.Wait() - r.orchestrator.workers.Store(key.(string), true) + r.orchestrator.workers.Store(key.(string), true) //nolint: forcetypeassert // values are always certain. - v, found := r.orchestrator.perIDQueue.Load(key.(string)) + v, found := r.orchestrator.perHostChan.Load(key.(string)) if !found { return false } - go r.worker(ctx, key.(string), v.(chan Task)) + go r.worker(ctx, key.(string), v.(chan Task)) //nolint: forcetypeassert // values are always certain. return true }) } } diff --git a/grpc/taskrunner/taskrunner.go b/grpc/taskrunner/taskrunner.go index c15269e..fce4975 100644 --- a/grpc/taskrunner/taskrunner.go +++ b/grpc/taskrunner/taskrunner.go @@ -41,8 +41,8 @@ func NewRunner(repo repository.Actions, maxWorkers int, workerIdleTimeout time.D o := &orchestrator{ workers: sync.Map{}, fifoChan: make(chan string, 10000), - // perIDQueue is a map of hostID to a channel of tasks. - perIDQueue: sync.Map{}, + // perHostChan is a map of hostID to a channel of tasks. + perHostChan: sync.Map{}, manager: newManager(maxWorkers), workerIdleTimeout: workerIdleTimeout, ingestChan: make(chan Task, 10000), } diff --git a/pkg/http/http.go b/pkg/http/http.go index b05158e..c2f70e4 100644 --- a/pkg/http/http.go +++ b/pkg/http/http.go @@ -50,7 +50,11 @@ func (h *Server) Run(ctx context.Context) error { _ = svr.Shutdown(ctx) }() - return svr.ListenAndServe() + if err := svr.ListenAndServe(); err != nil && err != http.ErrServerClosed { + return err + } + + return nil } func NewServer(addr string) *Server {
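With http.ErrServerClosed filtered out, cancelling the context now shuts the metrics endpoint down cleanly and Run returns nil. An end-to-end usage sketch (the signal wiring and port are illustrative, not part of this series):

    package main

    import (
    	"context"
    	"log"
    	"os/signal"
    	"syscall"

    	pbnjhttp "github.com/tinkerbell/pbnj/pkg/http"
    )

    func main() {
    	// SIGINT/SIGTERM cancels ctx; Run then calls Shutdown, ListenAndServe
    	// returns http.ErrServerClosed, and Run swallows it and returns nil.
    	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
    	defer stop()

    	if err := pbnjhttp.NewServer(":9090").Run(ctx); err != nil {
    		log.Fatal(err)
    	}
    }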