Skip to content

Commit

Permalink
refactor use common retry alg between scheduler and rust requests
Browse files Browse the repository at this point in the history
  • Loading branch information
BillCarsonFr committed Feb 23, 2024
1 parent fd0b03a commit 4dc469b
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 102 deletions.
4 changes: 2 additions & 2 deletions spec/unit/rust-crypto/OutgoingRequestProcessor.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -326,9 +326,9 @@ describe("OutgoingRequestProcessor", () => {

await expect(requestPromise).rejects.toThrow();

// Should have ultimately made 4 requests (1 initial + 3 retries)
// Should have ultimately made 5 requests (1 initial + 4 retries)
const calls = fetchMock.calls(expectedPath);
expect(calls).toHaveLength(4);
expect(calls).toHaveLength(5);

// The promise should have been rejected
await expect(requestPromise).rejects.toThrow();
Expand Down
63 changes: 63 additions & 0 deletions src/request-retry-utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
Copyright 2024 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

import { ConnectionError } from "./http-api";

/**
 * Decides whether a failed HTTP request should be retried and, if so, how long to wait first.
 *
 * Retries up to 4 times (so 5 requests including the initial call) using exponential backoff.
 * This produces wait times of 2, 4, 8, and 16 seconds (30s total) after which we give up. If the
 * failure was due to a rate limited request (`M_LIMIT_EXCEEDED`) and the error carries a positive
 * numeric `retry_after_ms`, that value is returned instead of the exponential backoff.
 *
 * This function is called from `catch` handlers with whatever value was thrown, so it tolerates
 * `null`/`undefined` errors and errors without a `data` payload rather than throwing itself
 * (which would mask the original failure).
 *
 * @param err - The error thrown by the http call
 * @param attempts - The number of attempts made so far, including the one that just failed (ie. starting at 1)
 * @param retryConnectionError - Whether to retry on {@link ConnectionError} (CORS, connection is down, etc.)
 * @returns The number of milliseconds to wait before retrying, or -1 if the error is not retryable
 *          or the maximum number of attempts has been reached.
 */
export function calculateRetryBackoff(err: any, attempts: number, retryConnectionError: boolean = false): number {
    if (attempts > 4) {
        return -1; // give up
    }

    if (!retryConnectionError && err instanceof ConnectionError) {
        return -1;
    }

    const httpStatus = err?.httpStatus;
    if (httpStatus === 400 || httpStatus === 401 || httpStatus === 403) {
        // client error; no amount of retrying will save you now.
        return -1;
    }

    const errorName = err?.name;

    if (errorName === "AbortError") {
        // this is a client timeout, that is already very high 60s/80s
        // we don't want to retry, as it could do it for very long
        return -1;
    }

    // if event that we are trying to send is too large in any way then retrying won't help
    if (errorName === "M_TOO_LARGE") {
        return -1;
    }

    if (errorName === "M_LIMIT_EXCEEDED") {
        // `data` may be absent on errors that are not fully-formed, and `retry_after_ms` is
        // server-provided: only trust it when it is a positive number, otherwise fall back
        // to the exponential backoff below.
        const waitTime: unknown = err.data?.retry_after_ms;
        if (typeof waitTime === "number" && waitTime > 0) {
            return waitTime;
        }
    }

    return 1000 * Math.pow(2, attempts);
}
79 changes: 7 additions & 72 deletions src/rust-crypto/OutgoingRequestProcessor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ import {
} from "@matrix-org/matrix-sdk-crypto-wasm";

import { logger } from "../logger";
import { ConnectionError, IHttpOpts, MatrixError, MatrixHttpApi, Method } from "../http-api";
import { IHttpOpts, MatrixHttpApi, Method } from "../http-api";
import { logDuration, QueryDict, sleep } from "../utils";
import { IAuthDict, UIAuthCallback } from "../interactive-auth";
import { UIAResponse } from "../@types/uia";
import { ToDeviceMessageId } from "../@types/event";
import { calculateRetryBackoff } from "../request-retry-utils";

/**
* Common interface for all the request types returned by `OlmMachine.outgoingRequests`.
Expand All @@ -43,12 +44,6 @@ export interface OutgoingRequest {
readonly type: number;
}

// The default delay to wait before retrying a request.
const DEFAULT_RETRY_DELAY_MS = 1000;

// The http request will be retried at most 4 times if the error is retryable.
const MAX_REQUEST_RETRY_COUNT = 3;

/**
* OutgoingRequestManager: turns `OutgoingRequest`s from the rust sdk into HTTP requests
*
Expand Down Expand Up @@ -203,20 +198,14 @@ export class OutgoingRequestProcessor {
try {
return await this.rawJsonRequest(method, path, queryParams, body);
} catch (e) {
if (currentRetryCount >= MAX_REQUEST_RETRY_COUNT) {
// Max number of retries reached, rethrow the error
throw e;
}

const maybeRetryAfter = this.shouldWaitBeforeRetryingMillis(e);
if (maybeRetryAfter === undefined) {
// this error is not retryable
currentRetryCount++;
const backoff = calculateRetryBackoff(e, currentRetryCount, true);
if (backoff < 0) {
// Max number of retries reached, or error is not retryable. rethrow the error
throw e;
}

currentRetryCount++;
// wait for the specified time and then retry the request
await sleep(maybeRetryAfter);
await sleep(backoff);
// continue the loop and retry the request
}
}
Expand All @@ -239,58 +228,4 @@ export class OutgoingRequestProcessor {

return await this.http.authedRequest<string>(method, path, queryParams, body, opts);
}

/**
 * Work out whether a failed request is worth retrying and how long to pause first.
 *
 * @param e - the error returned by the http stack
 * @returns the delay in milliseconds to wait before retrying, or `undefined` if the
 *          error should not be retried.
 */
private shouldWaitBeforeRetryingMillis(e: any): number | undefined {
    if (e instanceof MatrixError) {
        switch (e.errcode) {
            case "M_LIMIT_EXCEEDED":
                // Rate limited: retry once the rate limit has expired, using the
                // delay carried by the error when there is one.
                return e.data.retry_after_ms ?? DEFAULT_RETRY_DELAY_MS;
            case "M_TOO_LARGE":
                // The request was too large, retrying cannot help.
                // Could be a 502 or 413 status code as per documentation.
                return undefined;
        }
    }

    const hasRetryableStatus = e.httpStatus && this.canRetry(e.httpStatus);

    // Notice that client timeout errors are not ConnectionErrors, they would be `AbortError`.
    // Client timeout (AbortError) errors are not retried, the default timeout is already
    // very high (using browser defaults e.g. 300 or 90 seconds).
    if (hasRetryableStatus || e instanceof ConnectionError) {
        return DEFAULT_RETRY_DELAY_MS;
    }

    // anything else: don't retry
    return undefined;
}

/**
 * Tells whether a response with the given HTTP status is worth retrying.
 *
 * A retry after a delay might succeed once the server issue is resolved
 * or once the rate limit has been reset.
 *
 * @param httpStatus - the HTTP status code of the response
 * @returns true for 429 and for any 5xx status, false otherwise.
 */
private canRetry(httpStatus: number): boolean {
    // Too Many Requests
    const isRateLimited = httpStatus === 429;

    // 5xx Errors (Bad Gateway, Service Unavailable, Internal Server Error ...)
    // This includes gateway timeout (504) and it's ok because all the requests made here are idempotent.
    // * All key/signature uploads are idempotent.
    // * Room message and to-device send requests are idempotent because of txn_id.
    // * Keys claim in worst case will claim several keys but won't cause harm.
    const isServerError = httpStatus >= 500 && httpStatus < 600;

    return isRateLimited || isServerError;
}
}
32 changes: 4 additions & 28 deletions src/scheduler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@ import { logger } from "./logger";
import { MatrixEvent } from "./models/event";
import { EventType } from "./@types/event";
import { defer, IDeferred, removeElement } from "./utils";
import { ConnectionError, MatrixError } from "./http-api";
import { MatrixError } from "./http-api";
import { ISendEventResponse } from "./@types/requests";
import { calculateRetryBackoff } from "./request-retry-utils";

const DEBUG = false; // set true to enable console logging.

Expand All @@ -43,38 +44,13 @@ type ProcessFunction<T> = (event: MatrixEvent) => Promise<T>;
// eslint-disable-next-line camelcase
export class MatrixScheduler<T = ISendEventResponse> {
/**
* Retries events up to 4 times using exponential backoff. This produces wait
* times of 2, 4, 8, and 16 seconds (30s total) after which we give up. If the
* failure was due to a rate limited request, the time specified in the error is
* waited before being retried.
* Default retry algorithm for the matrix scheduler. Retries events up to 4 times with exponential backoff.
* @param attempts - Number of attempts that have been made, including the one that just failed (ie. starting at 1)
* @see retryAlgorithm
*/
// eslint-disable-next-line @typescript-eslint/naming-convention
public static RETRY_BACKOFF_RATELIMIT(event: MatrixEvent | null, attempts: number, err: MatrixError): number {
if (err.httpStatus === 400 || err.httpStatus === 403 || err.httpStatus === 401) {
// client error; no amount of retrying with save you now.
return -1;
}
if (err instanceof ConnectionError) {
return -1;
}

// if event that we are trying to send is too large in any way then retrying won't help
if (err.name === "M_TOO_LARGE") {
return -1;
}

if (err.name === "M_LIMIT_EXCEEDED") {
const waitTime = err.data.retry_after_ms;
if (waitTime > 0) {
return waitTime;
}
}
if (attempts > 4) {
return -1; // give up
}
return 1000 * Math.pow(2, attempts);
return calculateRetryBackoff(err, attempts, false);
}

/**
Expand Down

0 comments on commit 4dc469b

Please sign in to comment.