From 2d1f9f6534d11342e58a6858ace2c20d4885d4d1 Mon Sep 17 00:00:00 2001 From: emranemran Date: Mon, 10 Jun 2024 12:45:06 -0700 Subject: [PATCH] uploader: update exponential backoff timeouts uploader: update exponential backoff timeouts In the current architecture, catalyst-uploader instances are launched to upload each segment. At any given time, we can have multiple pids running where each instance attempts to write to S3 storage. If there's an outage on the storage provider, the exponential backoff retry logic kicks in and attempts to retry uploads. When multiple instances of catalyst-uploader are running, the retries tend to happen at roughly the same time in short bursts, leading us to quickly hit the kernel pthread_create limits. When this happens, the pods eventually become CPU/mem bound and may stop responding. To reduce the impact of this, the following changes are being made: * reduce # of retries from 7 to 4 * set initial interval to 30s to space out the retry attempts * set max interval to 2min to space out even further Note that this reduces the probability of running into the same issue and is not a true fix. A proper fix would require a rearchitecture of how catalyst-uploader works in conjunction with Mist. 
--- core/uploader.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/uploader.go b/core/uploader.go index 48dfc56..8af34ec 100644 --- a/core/uploader.go +++ b/core/uploader.go @@ -29,15 +29,15 @@ func (bc *ByteCounter) Write(p []byte) (n int, err error) { func newExponentialBackOffExecutor() *backoff.ExponentialBackOff { backOff := backoff.NewExponentialBackOff() - backOff.InitialInterval = 10 * time.Second - backOff.MaxInterval = 1 * time.Minute + backOff.InitialInterval = 30 * time.Second + backOff.MaxInterval = 2 * time.Minute backOff.MaxElapsedTime = 0 // don't impose a timeout as part of the retries return backOff } func UploadRetryBackoff() backoff.BackOff { - return backoff.WithMaxRetries(newExponentialBackOffExecutor(), 7) + return backoff.WithMaxRetries(newExponentialBackOffExecutor(), 4) } const segmentWriteTimeout = 5 * time.Minute