refactor use common retry alg between scheduler and rust requests

matrix-org · Feb 23, 2024 · 4dc469b · 4dc469b
1 parent fd0b03a
commit 4dc469b
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 102 deletions.
diff --git a/spec/unit/rust-crypto/OutgoingRequestProcessor.spec.ts b/spec/unit/rust-crypto/OutgoingRequestProcessor.spec.ts
@@ -326,9 +326,9 @@ describe("OutgoingRequestProcessor", () => {
 
                     await expect(requestPromise).rejects.toThrow();
 
-                    // Should have ultimately made 4 requests (1 initial + 3 retries)
+                    // Should have ultimately made 5 requests (1 initial + 4 retries)
                     const calls = fetchMock.calls(expectedPath);
-                    expect(calls).toHaveLength(4);
+                    expect(calls).toHaveLength(5);
 
                     // The promise should have been rejected
                     await expect(requestPromise).rejects.toThrow();

diff --git a/src/request-retry-utils.ts b/src/request-retry-utils.ts
@@ -0,0 +1,63 @@
+/*
+Copyright 2024 The Matrix.org Foundation C.I.C.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+import { ConnectionError } from "./http-api";
+
+/**
+ * Retries events up to 4 times (so 5 including initial call) using exponential backoff.
+ * This produces wait times of 2, 4, 8, and 16 seconds (30s total) after which we give up. If the
+ * failure was due to a rate limited request, the time specified in the error is returned.
+ *
+ * Returns -1 if the error is not retryable, or if we reach the maximum number of attempts.
+ *
+ * @param err - The error thrown by the http call
+ * @param attempts - The current number of attempts
+ * @param retryConnectionError - Whether to retry on {@link ConnectionError} (CORS, connection is down, etc.)
+ */
+export function calculateRetryBackoff(err: any, attempts: number, retryConnectionError: boolean = false): number {
+    if (attempts > 4) {
+        return -1; // give up
+    }
+
+    if (err instanceof ConnectionError && !retryConnectionError) {
+        return -1;
+    }
+
+    if (err.httpStatus && (err.httpStatus === 400 || err.httpStatus === 403 || err.httpStatus === 401)) {
+        // client error; no amount of retrying with save you now.
+        return -1;
+    }
+
+    if (err.name === "AbortError") {
+        // this is a client timeout, that is already very high 60s/80s
+        // we don't want to retry, as it could do it for very long
+        return -1;
+    }
+
+    // if event that we are trying to send is too large in any way then retrying won't help
+    if (err.name === "M_TOO_LARGE") {
+        return -1;
+    }
+
+    if (err.name === "M_LIMIT_EXCEEDED") {
+        const waitTime = err.data.retry_after_ms;
+        if (waitTime > 0) {
+            return waitTime;
+        }
+    }
+
+    return 1000 * Math.pow(2, attempts);
+}
diff --git a/src/rust-crypto/OutgoingRequestProcessor.ts b/src/rust-crypto/OutgoingRequestProcessor.ts
@@ -27,11 +27,12 @@ import {
 } from "@matrix-org/matrix-sdk-crypto-wasm";
 
 import { logger } from "../logger";
-import { ConnectionError, IHttpOpts, MatrixError, MatrixHttpApi, Method } from "../http-api";
+import { IHttpOpts, MatrixHttpApi, Method } from "../http-api";
 import { logDuration, QueryDict, sleep } from "../utils";
 import { IAuthDict, UIAuthCallback } from "../interactive-auth";
 import { UIAResponse } from "../@types/uia";
 import { ToDeviceMessageId } from "../@types/event";
+import { calculateRetryBackoff } from "../request-retry-utils";
 
 /**
  * Common interface for all the request types returned by `OlmMachine.outgoingRequests`.
@@ -43,12 +44,6 @@ export interface OutgoingRequest {
     readonly type: number;
 }
 
-// The default delay to wait before retrying a request.
-const DEFAULT_RETRY_DELAY_MS = 1000;
-
-// The http request will be retried at most 4 times if the error is retryable.
-const MAX_REQUEST_RETRY_COUNT = 3;
-
 /**
  * OutgoingRequestManager: turns `OutgoingRequest`s from the rust sdk into HTTP requests
  *
@@ -203,20 +198,14 @@ export class OutgoingRequestProcessor {
             try {
                 return await this.rawJsonRequest(method, path, queryParams, body);
             } catch (e) {
-                if (currentRetryCount >= MAX_REQUEST_RETRY_COUNT) {
-                    // Max number of retries reached, rethrow the error
-                    throw e;
-                }
-
-                const maybeRetryAfter = this.shouldWaitBeforeRetryingMillis(e);
-                if (maybeRetryAfter === undefined) {
-                    // this error is not retryable
+                currentRetryCount++;
+                const backoff = calculateRetryBackoff(e, currentRetryCount, true);
+                if (backoff < 0) {
+                    // Max number of retries reached, or error is not retryable. rethrow the error
                     throw e;
                 }
-
-                currentRetryCount++;
                 // wait for the specified time and then retry the request
-                await sleep(maybeRetryAfter);
+                await sleep(backoff);
                 // continue the loop and retry the request
             }
         }
@@ -239,58 +228,4 @@ export class OutgoingRequestProcessor {
 
         return await this.http.authedRequest<string>(method, path, queryParams, body, opts);
     }
-
-    /**
-     * Determine if a given error should be retried, and if so, how long to wait before retrying.
-     * If the error should not be retried, returns undefined.
-     *
-     * @param e - the error returned by the http stack
-     */
-    private shouldWaitBeforeRetryingMillis(e: any): number | undefined {
-        if (e instanceof MatrixError) {
-            // On rate limited errors, we should retry after the rate limit has expired.
-            if (e.errcode === "M_LIMIT_EXCEEDED") {
-                return e.data.retry_after_ms ?? DEFAULT_RETRY_DELAY_MS;
-            }
-
-            if (e.errcode === "M_TOO_LARGE") {
-                // The request was too large, we should not retry.
-                // Could be a 502 or 413 status code as per documentation.
-                return undefined;
-            }
-        }
-
-        if (e.httpStatus && this.canRetry(e.httpStatus)) {
-            return DEFAULT_RETRY_DELAY_MS;
-        }
-
-        // Notice that client timeout errors are not ConnectionErrors, they would be `AbortError`.
-        // Client timeout (AbortError) errors are not retried, the default timout is already
-        // very high (using browser defaults e.g. 300 or 90 seconds).
-        if (e instanceof ConnectionError) {
-            return DEFAULT_RETRY_DELAY_MS;
-        }
-
-        // don't retry
-        return;
-    }
-
-    /**
-     * Returns true if the request should be retried, false otherwise.
-     *
-     * Retrying the request after a delay might succeed when the server issue
-     * is resolved or when the rate limit is reset.
-     * @param httpStatus - the HTTP status code of the response
-     */
-    private canRetry(httpStatus: number): boolean {
-        // Too Many Requests
-        if (httpStatus === 429) return true;
-
-        // 5xx Errors (Bad Gateway, Service Unavailable, Internal Server Error ...)
-        // This includes gateway timeout (504) and it's ok because all the requests made here are idempotent.
-        //  * All key/signature uploads are idempotent.
-        //  * Room message and to-device send requests are idempotent because of txn_id.
-        //  * Keys claim in worst case will claim several keys but won't cause harm.
-        return httpStatus >= 500 && httpStatus < 600;
-    }
 }
diff --git a/src/scheduler.ts b/src/scheduler.ts
@@ -22,8 +22,9 @@ import { logger } from "./logger";
 import { MatrixEvent } from "./models/event";
 import { EventType } from "./@types/event";
 import { defer, IDeferred, removeElement } from "./utils";
-import { ConnectionError, MatrixError } from "./http-api";
+import { MatrixError } from "./http-api";
 import { ISendEventResponse } from "./@types/requests";
+import { calculateRetryBackoff } from "./request-retry-utils";
 
 const DEBUG = false; // set true to enable console logging.
 
@@ -43,38 +44,13 @@ type ProcessFunction<T> = (event: MatrixEvent) => Promise<T>;
 // eslint-disable-next-line camelcase
 export class MatrixScheduler<T = ISendEventResponse> {
     /**
-     * Retries events up to 4 times using exponential backoff. This produces wait
-     * times of 2, 4, 8, and 16 seconds (30s total) after which we give up. If the
-     * failure was due to a rate limited request, the time specified in the error is
-     * waited before being retried.
+     * Default retry algorithm for the matrix scheduler. Retries events up to 4 times with exponential backoff.
      * @param attempts - Number of attempts that have been made, including the one that just failed (ie. starting at 1)
      * @see retryAlgorithm
      */
     // eslint-disable-next-line @typescript-eslint/naming-convention
     public static RETRY_BACKOFF_RATELIMIT(event: MatrixEvent | null, attempts: number, err: MatrixError): number {
-        if (err.httpStatus === 400 || err.httpStatus === 403 || err.httpStatus === 401) {
-            // client error; no amount of retrying with save you now.
-            return -1;
-        }
-        if (err instanceof ConnectionError) {
-            return -1;
-        }
-
-        // if event that we are trying to send is too large in any way then retrying won't help
-        if (err.name === "M_TOO_LARGE") {
-            return -1;
-        }
-
-        if (err.name === "M_LIMIT_EXCEEDED") {
-            const waitTime = err.data.retry_after_ms;
-            if (waitTime > 0) {
-                return waitTime;
-            }
-        }
-        if (attempts > 4) {
-            return -1; // give up
-        }
-        return 1000 * Math.pow(2, attempts);
+        return calculateRetryBackoff(err, attempts, false);
     }
 
     /**