From d77fae7d2e42416279ec7db72b02bfeb2998703c Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Fri, 25 Oct 2024 16:54:12 +0530 Subject: [PATCH] feat: add tests for processing of health checks and also fix a bug Signed-off-by: Manan Gupta --- go/vt/discovery/keyspace_events.go | 20 ++- go/vt/discovery/keyspace_events_test.go | 228 ++++++++++++++++++++++++ 2 files changed, 239 insertions(+), 9 deletions(-) diff --git a/go/vt/discovery/keyspace_events.go b/go/vt/discovery/keyspace_events.go index 8332e99679b..4a82b173e9b 100644 --- a/go/vt/discovery/keyspace_events.go +++ b/go/vt/discovery/keyspace_events.go @@ -381,17 +381,19 @@ func (kss *keyspaceState) onHealthCheck(th *TabletHealth) { sstate.serving = true case !th.Serving: sstate.serving = false - // Once we have seen a non-serving primary healthcheck, there is no need for us to explicitly wait - // for a reparent to happen. We use waitForReparent to ensure that we don't prematurely stop - // buffering when we receive a serving healthcheck from the primary that is being demoted. - // However, if we receive a non-serving check, then we know that we won't receive any more serving - // healthchecks anymore until reparent finishes. Specifically, this helps us when PRS fails, but - // stops gracefully because the new candidate couldn't get caught up in time. In this case, we promote - // the previous primary back. Without turning off waitForReparent here, we wouldn't be able to stop - // buffering for that case. - sstate.waitForReparent = false } } + if !th.Serving { + // Once we have seen a non-serving primary healthcheck, there is no need for us to explicitly wait + // for a reparent to happen. We use waitForReparent to ensure that we don't prematurely stop + // buffering when we receive a serving healthcheck from the primary that is being demoted. + // However, if we receive a non-serving check, then we know that we won't receive any more serving + // healthchecks anymore until reparent finishes. Specifically, this helps us when PRS fails, but + // stops gracefully because the new candidate couldn't get caught up in time. In this case, we promote + // the previous primary back. Without turning off waitForReparent here, we wouldn't be able to stop + // buffering for that case. + sstate.waitForReparent = false + } // if the primary for this shard has been externally reparented, we're undergoing a failover, // which is considered an availability event. update this shard to point it to the new tablet diff --git a/go/vt/discovery/keyspace_events_test.go b/go/vt/discovery/keyspace_events_test.go index 1a4c473e7cb..87ecac3355d 100644 --- a/go/vt/discovery/keyspace_events_test.go +++ b/go/vt/discovery/keyspace_events_test.go @@ -421,6 +421,234 @@ func TestWaitForConsistentKeyspaces(t *testing.T) { } } +func TestOnHealthCheck(t *testing.T) { + testcases := []struct { + name string + ss *shardState + th *TabletHealth + wantServing bool + wantWaitForReparent bool + wantExternallyReparented int64 + wantUID uint32 + }{ + { + name: "Non primary tablet health ignored", + ss: &shardState{ + serving: false, + waitForReparent: false, + externallyReparented: 10, + currentPrimary: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 1, + }, + }, + th: &TabletHealth{ + Target: &querypb.Target{ + TabletType: topodatapb.TabletType_REPLICA, + }, + Serving: true, + }, + wantServing: false, + wantWaitForReparent: false, + wantExternallyReparented: 10, + wantUID: 1, + }, { + name: "Serving primary seen in non-serving shard", + ss: &shardState{ + serving: false, + waitForReparent: false, + externallyReparented: 10, + currentPrimary: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 1, + }, + }, + th: &TabletHealth{ + Target: &querypb.Target{ + TabletType: topodatapb.TabletType_PRIMARY, + }, + Serving: true, + PrimaryTermStartTime: 20, + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 2, + }, + }, + }, + wantServing: true, + wantWaitForReparent: false, + wantExternallyReparented: 20, + wantUID: 2, + }, { + name: "New serving primary seen while waiting for reparent", + ss: &shardState{ + serving: false, + waitForReparent: true, + externallyReparented: 10, + currentPrimary: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 1, + }, + }, + th: &TabletHealth{ + Target: &querypb.Target{ + TabletType: topodatapb.TabletType_PRIMARY, + }, + Serving: true, + PrimaryTermStartTime: 20, + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 2, + }, + }, + }, + wantServing: true, + wantWaitForReparent: false, + wantExternallyReparented: 20, + wantUID: 2, + }, { + name: "Old serving primary seen while waiting for reparent", + ss: &shardState{ + serving: false, + waitForReparent: true, + externallyReparented: 10, + currentPrimary: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 1, + }, + }, + th: &TabletHealth{ + Target: &querypb.Target{ + TabletType: topodatapb.TabletType_PRIMARY, + }, + Serving: true, + PrimaryTermStartTime: 10, + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 1, + }, + }, + }, + wantServing: false, + wantWaitForReparent: true, + wantExternallyReparented: 10, + wantUID: 1, + }, { + name: "Old non-serving primary seen while waiting for reparent", + ss: &shardState{ + serving: false, + waitForReparent: true, + externallyReparented: 10, + currentPrimary: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 1, + }, + }, + th: &TabletHealth{ + Target: &querypb.Target{ + TabletType: topodatapb.TabletType_PRIMARY, + }, + Serving: false, + PrimaryTermStartTime: 10, + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 1, + }, + }, + }, + wantServing: false, + wantWaitForReparent: false, + wantExternallyReparented: 10, + wantUID: 1, + }, { + name: "New serving primary while already serving", + ss: &shardState{ + serving: true, + waitForReparent: false, + externallyReparented: 10, + currentPrimary: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 1, + }, + }, + th: &TabletHealth{ + Target: &querypb.Target{ + TabletType: topodatapb.TabletType_PRIMARY, + }, + Serving: true, + PrimaryTermStartTime: 20, + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 2, + }, + }, + }, + wantServing: true, + wantWaitForReparent: false, + wantExternallyReparented: 20, + wantUID: 2, + }, { + name: "Primary goes non serving", + ss: &shardState{ + serving: true, + waitForReparent: false, + externallyReparented: 10, + currentPrimary: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 1, + }, + }, + th: &TabletHealth{ + Target: &querypb.Target{ + TabletType: topodatapb.TabletType_PRIMARY, + }, + Serving: false, + PrimaryTermStartTime: 10, + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: testCell, + Uid: 1, + }, + }, + }, + wantServing: false, + wantWaitForReparent: false, + wantExternallyReparented: 10, + wantUID: 1, + }, + } + + ksName := "ks" + shard := "-80" + kss := &keyspaceState{ + mu: sync.Mutex{}, + keyspace: ksName, + shards: make(map[string]*shardState), + } + // Adding this so that we don't run any topo calls from ensureConsistentLocked. + kss.moveTablesState = &MoveTablesState{ + Typ: MoveTablesRegular, + State: MoveTablesSwitching, + } + for _, tt := range testcases { + t.Run(tt.name, func(t *testing.T) { + kss.shards[shard] = tt.ss + tt.th.Target.Keyspace = ksName + tt.th.Target.Shard = shard + kss.onHealthCheck(tt.th) + require.Equal(t, tt.wantServing, tt.ss.serving) + require.Equal(t, tt.wantWaitForReparent, tt.ss.waitForReparent) + require.Equal(t, tt.wantExternallyReparented, tt.ss.externallyReparented) + require.Equal(t, tt.wantUID, tt.ss.currentPrimary.Uid) + }) + } +} + type fakeTopoServer struct { }