diff --git a/spec/unit/rust-crypto/OutgoingRequestProcessor.spec.ts b/spec/unit/rust-crypto/OutgoingRequestProcessor.spec.ts index 557be8070e1..e63243d2913 100644 --- a/spec/unit/rust-crypto/OutgoingRequestProcessor.spec.ts +++ b/spec/unit/rust-crypto/OutgoingRequestProcessor.spec.ts @@ -326,9 +326,9 @@ describe("OutgoingRequestProcessor", () => { await expect(requestPromise).rejects.toThrow(); - // Should have ultimately made 4 requests (1 initial + 3 retries) + // Should have ultimately made 5 requests (1 initial + 4 retries) const calls = fetchMock.calls(expectedPath); - expect(calls).toHaveLength(4); + expect(calls).toHaveLength(5); // The promise should have been rejected await expect(requestPromise).rejects.toThrow(); diff --git a/src/request-retry-utils.ts b/src/request-retry-utils.ts new file mode 100644 index 00000000000..1ec2940dede --- /dev/null +++ b/src/request-retry-utils.ts @@ -0,0 +1,63 @@ +/* +Copyright 2024 The Matrix.org Foundation C.I.C. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +import { ConnectionError } from "./http-api"; + +/** + * Retries events up to 4 times (so 5 including initial call) using exponential backoff. + * This produces wait times of 2, 4, 8, and 16 seconds (30s total) after which we give up. If the + * failure was due to a rate limited request, the time specified in the error is returned. + * + * Returns -1 if the error is not retryable, or if we reach the maximum number of attempts. + * + * @param err - The error thrown by the http call + * @param attempts - The current number of attempts + * @param retryConnectionError - Whether to retry on {@link ConnectionError} (CORS, connection is down, etc.) + */ +export function calculateRetryBackoff(err: any, attempts: number, retryConnectionError: boolean = false): number { + if (attempts > 4) { + return -1; // give up + } + + if (err instanceof ConnectionError && !retryConnectionError) { + return -1; + } + + if (err.httpStatus && (err.httpStatus === 400 || err.httpStatus === 403 || err.httpStatus === 401)) { + // client error; no amount of retrying with save you now. + return -1; + } + + if (err.name === "AbortError") { + // this is a client timeout, that is already very high 60s/80s + // we don't want to retry, as it could do it for very long + return -1; + } + + // if event that we are trying to send is too large in any way then retrying won't help + if (err.name === "M_TOO_LARGE") { + return -1; + } + + if (err.name === "M_LIMIT_EXCEEDED") { + const waitTime = err.data.retry_after_ms; + if (waitTime > 0) { + return waitTime; + } + } + + return 1000 * Math.pow(2, attempts); +} diff --git a/src/rust-crypto/OutgoingRequestProcessor.ts b/src/rust-crypto/OutgoingRequestProcessor.ts index c93b53de9f0..26c6124006f 100644 --- a/src/rust-crypto/OutgoingRequestProcessor.ts +++ b/src/rust-crypto/OutgoingRequestProcessor.ts @@ -27,11 +27,12 @@ import { } from "@matrix-org/matrix-sdk-crypto-wasm"; import { logger } from "../logger"; -import { ConnectionError, IHttpOpts, MatrixError, MatrixHttpApi, Method } from "../http-api"; +import { IHttpOpts, MatrixHttpApi, Method } from "../http-api"; import { logDuration, QueryDict, sleep } from "../utils"; import { IAuthDict, UIAuthCallback } from "../interactive-auth"; import { UIAResponse } from "../@types/uia"; import { ToDeviceMessageId } from "../@types/event"; +import { calculateRetryBackoff } from "../request-retry-utils"; /** * Common interface for all the request types returned by `OlmMachine.outgoingRequests`. @@ -43,12 +44,6 @@ export interface OutgoingRequest { readonly type: number; } -// The default delay to wait before retrying a request. -const DEFAULT_RETRY_DELAY_MS = 1000; - -// The http request will be retried at most 4 times if the error is retryable. -const MAX_REQUEST_RETRY_COUNT = 3; - /** * OutgoingRequestManager: turns `OutgoingRequest`s from the rust sdk into HTTP requests * @@ -203,20 +198,14 @@ export class OutgoingRequestProcessor { try { return await this.rawJsonRequest(method, path, queryParams, body); } catch (e) { - if (currentRetryCount >= MAX_REQUEST_RETRY_COUNT) { - // Max number of retries reached, rethrow the error - throw e; - } - - const maybeRetryAfter = this.shouldWaitBeforeRetryingMillis(e); - if (maybeRetryAfter === undefined) { - // this error is not retryable + currentRetryCount++; + const backoff = calculateRetryBackoff(e, currentRetryCount, true); + if (backoff < 0) { + // Max number of retries reached, or error is not retryable. rethrow the error throw e; } - - currentRetryCount++; // wait for the specified time and then retry the request - await sleep(maybeRetryAfter); + await sleep(backoff); // continue the loop and retry the request } } @@ -239,58 +228,4 @@ export class OutgoingRequestProcessor { return await this.http.authedRequest(method, path, queryParams, body, opts); } - - /** - * Determine if a given error should be retried, and if so, how long to wait before retrying. - * If the error should not be retried, returns undefined. - * - * @param e - the error returned by the http stack - */ - private shouldWaitBeforeRetryingMillis(e: any): number | undefined { - if (e instanceof MatrixError) { - // On rate limited errors, we should retry after the rate limit has expired. - if (e.errcode === "M_LIMIT_EXCEEDED") { - return e.data.retry_after_ms ?? DEFAULT_RETRY_DELAY_MS; - } - - if (e.errcode === "M_TOO_LARGE") { - // The request was too large, we should not retry. - // Could be a 502 or 413 status code as per documentation. - return undefined; - } - } - - if (e.httpStatus && this.canRetry(e.httpStatus)) { - return DEFAULT_RETRY_DELAY_MS; - } - - // Notice that client timeout errors are not ConnectionErrors, they would be `AbortError`. - // Client timeout (AbortError) errors are not retried, the default timout is already - // very high (using browser defaults e.g. 300 or 90 seconds). - if (e instanceof ConnectionError) { - return DEFAULT_RETRY_DELAY_MS; - } - - // don't retry - return; - } - - /** - * Returns true if the request should be retried, false otherwise. - * - * Retrying the request after a delay might succeed when the server issue - * is resolved or when the rate limit is reset. - * @param httpStatus - the HTTP status code of the response - */ - private canRetry(httpStatus: number): boolean { - // Too Many Requests - if (httpStatus === 429) return true; - - // 5xx Errors (Bad Gateway, Service Unavailable, Internal Server Error ...) - // This includes gateway timeout (504) and it's ok because all the requests made here are idempotent. - // * All key/signature uploads are idempotent. - // * Room message and to-device send requests are idempotent because of txn_id. - // * Keys claim in worst case will claim several keys but won't cause harm. - return httpStatus >= 500 && httpStatus < 600; - } } diff --git a/src/scheduler.ts b/src/scheduler.ts index 41612f1c902..557abe84a45 100644 --- a/src/scheduler.ts +++ b/src/scheduler.ts @@ -22,8 +22,9 @@ import { logger } from "./logger"; import { MatrixEvent } from "./models/event"; import { EventType } from "./@types/event"; import { defer, IDeferred, removeElement } from "./utils"; -import { ConnectionError, MatrixError } from "./http-api"; +import { MatrixError } from "./http-api"; import { ISendEventResponse } from "./@types/requests"; +import { calculateRetryBackoff } from "./request-retry-utils"; const DEBUG = false; // set true to enable console logging. @@ -43,38 +44,13 @@ type ProcessFunction = (event: MatrixEvent) => Promise; // eslint-disable-next-line camelcase export class MatrixScheduler { /** - * Retries events up to 4 times using exponential backoff. This produces wait - * times of 2, 4, 8, and 16 seconds (30s total) after which we give up. If the - * failure was due to a rate limited request, the time specified in the error is - * waited before being retried. + * Default retry algorithm for the matrix scheduler. Retries events up to 4 times with exponential backoff. * @param attempts - Number of attempts that have been made, including the one that just failed (ie. starting at 1) * @see retryAlgorithm */ // eslint-disable-next-line @typescript-eslint/naming-convention public static RETRY_BACKOFF_RATELIMIT(event: MatrixEvent | null, attempts: number, err: MatrixError): number { - if (err.httpStatus === 400 || err.httpStatus === 403 || err.httpStatus === 401) { - // client error; no amount of retrying with save you now. - return -1; - } - if (err instanceof ConnectionError) { - return -1; - } - - // if event that we are trying to send is too large in any way then retrying won't help - if (err.name === "M_TOO_LARGE") { - return -1; - } - - if (err.name === "M_LIMIT_EXCEEDED") { - const waitTime = err.data.retry_after_ms; - if (waitTime > 0) { - return waitTime; - } - } - if (attempts > 4) { - return -1; // give up - } - return 1000 * Math.pow(2, attempts); + return calculateRetryBackoff(err, attempts, false); } /**