From 2135554a5d49f5069d80dd2d8d9cfe1d875e8276 Mon Sep 17 00:00:00 2001 From: Raphael 'kena' Poss Date: Wed, 2 Aug 2023 17:06:36 +0200 Subject: [PATCH] rpc: fix various test bugs TLDR: Prior to this patch, multiple unit tests in the `rpc` package were misconfiguring their heartbeat interval and timeout. This patch fixes this. The issue was that there were two separate locations for these tunable parameters: - `RPCHeartbeat{Interval,Timeout}` fields in `ContextOptions` (previously via `*base.Config`, recently as direct fields in `ContextOptions`) - also, two private fields `heartbeat{Interval,Timeout}` The former was copied into the latter during `NewContext()`. Only the latter (the private fields) matter for the actual behavior. Prior to this commit, the tests would call `NewContext()` and then afterwards would override the fields in `ContextOptions`. These overrides were thus ineffective because the value was already copied by that time. In addition to this design bug, there were a couple of additional bugs lurking: the overrides defined by the tests were problematic in some cases. If these overrides had been effective, the tests would break. The problem was discovered after 8d2d2bf47cb3da1c27baa140a5afb5d28a5ed8f6 merged, because in that commit some of the overrides from broken tests were taken on board and became effective. This started causing flakes. The fix is to remove the ambiguity in configuration and remove the problematic overrides from offending tests. Release note: None --- pkg/rpc/context.go | 15 +++++---------- pkg/rpc/context_test.go | 8 +------- pkg/rpc/datadriven_test.go | 1 - pkg/rpc/peer.go | 4 ++-- 4 files changed, 8 insertions(+), 20 deletions(-) diff --git a/pkg/rpc/context.go b/pkg/rpc/context.go index df0986cd54e2..e18cf789bacb 100644 --- a/pkg/rpc/context.go +++ b/pkg/rpc/context.go @@ -213,9 +213,6 @@ type Context struct { RemoteClocks *RemoteClockMonitor MasterCtx context.Context // cancel on stopper quiesce - heartbeatInterval time.Duration - heartbeatTimeout time.Duration - rpcCompression bool localInternalClient RestrictedInternalClient @@ -531,13 +528,11 @@ func NewContext(ctx context.Context, opts ContextOptions) *Context { secCtx.useNodeAuth = opts.UseNodeAuth rpcCtx := &Context{ - ContextOptions: opts, - SecurityContext: secCtx, - rpcCompression: enableRPCCompression, - MasterCtx: masterCtx, - metrics: makeMetrics(), - heartbeatInterval: opts.RPCHeartbeatInterval, - heartbeatTimeout: opts.RPCHeartbeatTimeout, + ContextOptions: opts, + SecurityContext: secCtx, + rpcCompression: enableRPCCompression, + MasterCtx: masterCtx, + metrics: makeMetrics(), } rpcCtx.dialbackMu.Lock() diff --git a/pkg/rpc/context_test.go b/pkg/rpc/context_test.go index 6bce3dbb7bd8..ee69efc21c6f 100644 --- a/pkg/rpc/context_test.go +++ b/pkg/rpc/context_test.go @@ -876,9 +876,6 @@ func TestOffsetMeasurement(t *testing.T) { clientClock := &AdvancingClock{time: timeutil.Unix(0, 10)} clientMaxOffset := time.Duration(0) clientCtx := newTestContext(clusterID, clientClock, clientMaxOffset, stopper) - // Make the interval shorter to speed up the test. - clientCtx.RPCHeartbeatInterval = 1 * time.Millisecond - clientCtx.RPCHeartbeatTimeout = 1 * time.Millisecond clientCtx.RemoteClocks.offsetTTL = 5 * clientClock.getAdvancementInterval() if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(ctx); err != nil { t.Fatal(err) @@ -952,7 +949,7 @@ func TestFailedOffsetMeasurement(t *testing.T) { clientCtx := newTestContext(clusterID, clock, maxOffset, stopper) // Remove the timeout so that failure arises from exceeding the maximum // clock reading delay, not the timeout. - clientCtx.heartbeatTimeout = 0 + clientCtx.RPCHeartbeatTimeout = 0 // Allow two heartbeat for initialization. The first ping doesn't report an offset, // the second one thus doesn't have an offset to work with, so it's only on the // third one that's fully configured. @@ -1022,9 +1019,6 @@ func TestLatencyInfoCleanupOnClosedConnection(t *testing.T) { clientClock := &AdvancingClock{time: timeutil.Unix(0, 10)} clientMaxOffset := time.Duration(0) clientCtx := newTestContext(clusterID, clientClock, clientMaxOffset, stopper) - // Make the interval shorter to speed up the test. - clientCtx.RPCHeartbeatInterval = 1 * time.Millisecond - clientCtx.RPCHeartbeatTimeout = 1 * time.Millisecond var hbDecommission atomic.Value hbDecommission.Store(false) diff --git a/pkg/rpc/datadriven_test.go b/pkg/rpc/datadriven_test.go index 4ebfbdc201da..70d5c01cc343 100644 --- a/pkg/rpc/datadriven_test.go +++ b/pkg/rpc/datadriven_test.go @@ -148,7 +148,6 @@ func setupEnv(t *testing.T) *ddEnv { maxOffset := time.Duration(250) clientCtx := newTestContext(clusterID, clock, maxOffset, stopper) - clientCtx.heartbeatInterval = 10 * time.Millisecond env := &ddEnv{ clusterID: clusterID, diff --git a/pkg/rpc/peer.go b/pkg/rpc/peer.go index 5ff1372af218..e137d8ba61c4 100644 --- a/pkg/rpc/peer.go +++ b/pkg/rpc/peer.go @@ -184,8 +184,8 @@ func (rpcCtx *Context) newPeer(k peerKey) *peer { dial: func(ctx context.Context, target string, class ConnectionClass) (*grpc.ClientConn, error) { return rpcCtx.grpcDialRaw(ctx, target, class, rpcCtx.testingDialOpts...) }, - heartbeatInterval: rpcCtx.heartbeatInterval, - heartbeatTimeout: rpcCtx.heartbeatTimeout, + heartbeatInterval: rpcCtx.RPCHeartbeatInterval, + heartbeatTimeout: rpcCtx.RPCHeartbeatTimeout, } var b *circuit.Breaker