From bdc7ec461f518a872d070c287231023b13f2f5b7 Mon Sep 17 00:00:00 2001 From: Ivan Milchev Date: Thu, 16 May 2024 13:57:40 +0300 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20remove=20dead=20providers=20from?= =?UTF-8?q?=20coordinator=20(#4010)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🐛 remove dead providers from coordinator Signed-off-by: Ivan Milchev * log heartbeat errors Signed-off-by: Ivan Milchev --------- Signed-off-by: Ivan Milchev --- providers/coordinator.go | 8 ++++++ providers/coordinator_test.go | 54 +++++++++++++++++++++++++++++++++++ providers/running_provider.go | 2 ++ providers/runtime.go | 5 ++++ 4 files changed, 69 insertions(+) diff --git a/providers/coordinator.go b/providers/coordinator.go index 9c9fb475a9..e29e939826 100644 --- a/providers/coordinator.go +++ b/providers/coordinator.go @@ -211,6 +211,14 @@ func (c *coordinator) RemoveRuntime(runtime *Runtime) { log.Warn().Err(err).Str("provider", p.Name).Msg("failed to shut down provider") } } + } else { + // Check for killed/crashed providers and remove them from the list of running providers + for _, p := range c.runningByID { + if p.isCloseOrShutdown() { + log.Warn().Str("provider", p.Name).Msg("removing closed provider") + delete(c.runningByID, p.ID) + } + } } // If all providers have been killed, reset the connection IDs back to 0 diff --git a/providers/coordinator_test.go b/providers/coordinator_test.go index 70b0770034..51692a8d0a 100644 --- a/providers/coordinator_test.go +++ b/providers/coordinator_test.go @@ -149,6 +149,60 @@ func TestRemoveRuntime_StopUnusedProvider(t *testing.T) { assert.Empty(t, c.runningByID) } +func TestRemoveRuntime_RemoveDeadProvider(t *testing.T) { + ctrl := gomock.NewController(t) + + // Setup 1 closed provider with 1 runtime + mockPlugin1 := NewMockProviderPlugin(ctrl) + p1 := &RunningProvider{ + ID: "provider1", + Plugin: mockPlugin1, + isClosed: true, + } + r1 := &Runtime{ + providers: map[string]*ConnectedProvider{ + "provider1": {Instance: p1}, + }, + Provider: &ConnectedProvider{ + Instance: p1, + Connection: &pp.ConnectRes{ + Asset: &inventory.Asset{PlatformIds: []string{"platformId1"}}, + }, + }, + } + + // Setup another provider with another runtime + r2 := &Runtime{ + providers: map[string]*ConnectedProvider{ + "provider2": {Instance: p1}, + }, + Provider: &ConnectedProvider{ + Instance: p1, + Connection: &pp.ConnectRes{ + Asset: &inventory.Asset{PlatformIds: []string{"platformId2"}}, + }, + }, + } + + c := &coordinator{ + runningByID: map[string]*RunningProvider{ + "provider1": p1, + }, + runtimes: map[string]*Runtime{ + "platformId1": r1, + "platformId2": r2, + }, + } + + // Remove 1 runtime + c.RemoveRuntime(r1) + + // Verify that the provider has been removed because it crashed + assert.Empty(t, c.runningByID) + + c.RemoveRuntime(r2) +} + func TestRemoveRuntime_UsedProvider(t *testing.T) { ctrl := gomock.NewController(t) diff --git a/providers/running_provider.go b/providers/running_provider.go index be5088d276..94c26577ed 100644 --- a/providers/running_provider.go +++ b/providers/running_provider.go @@ -38,6 +38,7 @@ type RunningProvider struct { // initialize the heartbeat with the provider func (p *RunningProvider) heartbeat() error { if err := p.doOneHeartbeat(p.interval + p.gracePeriod); err != nil { + log.Error().Err(err).Str("plugin", p.Name).Msg("error in plugin heartbeat") if err := p.Shutdown(); err != nil { log.Error().Err(err).Str("plugin", p.Name).Msg("error in plugin shutdown") } @@ -47,6 +48,7 @@ func (p *RunningProvider) heartbeat() error { go func() { for !p.isCloseOrShutdown() { if err := p.doOneHeartbeat(p.interval + p.gracePeriod); err != nil { + log.Error().Err(err).Str("plugin", p.Name).Msg("error in plugin heartbeat") if err := p.Shutdown(); err != nil { log.Error().Err(err).Str("plugin", p.Name).Msg("error in plugin shutdown") } diff --git a/providers/runtime.go b/providers/runtime.go index b56c77c522..e8a2b712b5 100644 --- a/providers/runtime.go +++ b/providers/runtime.go @@ -204,6 +204,11 @@ func (r *Runtime) Connect(req *plugin.ConnectReq) error { runtime: r, } + // TODO: we probably want to check here if the provider is dead and restart it + // if r.Provider.Instance.isCloseOrShutdown() { + + // } + r.Provider.Connection, r.Provider.ConnectionError = r.Provider.Instance.Plugin.Connect(req, &callbacks) if r.Provider.ConnectionError != nil { return r.Provider.ConnectionError