Commit e5d03e1
[WorkspaceStarter] Fail instance on all errors (#18745)
* [server] Tone down EntitlementService logs: error to warn

* [server] Make sure to fail a WorkspaceInstance on any error

* [WorkspaceStarter] Don't fail on temporary gRPC errors
geropl authored Sep 20, 2023
1 parent e916bf2 commit e5d03e1
Showing 5 changed files with 46 additions and 42 deletions.
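Taken together, the three changes implement a single policy: any error during a workspace start must leave the WorkspaceInstance in a failed (stopped) state, unless the error is a transient gRPC condition that a later reconciliation pass will resolve on its own. A minimal sketch of that decision logic, assembled from the catch block added in workspace-starter.ts below; the `failInstanceStart` stub stands in for the real private method, and `StartInstanceError` is simplified:

```ts
import * as grpc from "@grpc/grpc-js";
import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";

// Simplified stand-ins for the real helpers in workspace-starter.ts.
class StartInstanceError extends Error {
    constructor(public readonly reason: string, public readonly cause: unknown) {
        super(`${reason}: ${cause}`);
    }
}
declare function failInstanceStart(err: unknown): Promise<void>;

// The error policy this commit introduces, extracted as a standalone function.
async function classifyStartError(err: any): Promise<unknown> {
    if (isGrpcError(err) && (err.code === grpc.status.UNAVAILABLE || err.code === grpc.status.ALREADY_EXISTS)) {
        // Transient: ws-manager is briefly unreachable, or the reconciliation
        // loop already created the workspace. Wait for future updates instead
        // of failing the instance.
        return err;
    }
    if (!(err instanceof StartInstanceError)) {
        // Every other unhandled error must mark the instance as stopped/failed
        // so it does not linger in "pending" forever.
        await failInstanceStart(err);
        return new StartInstanceError("other", err);
    }
    return err;
}
```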
components/gitpod-protocol/src/util/grpc.ts (4 additions, 0 deletions)

```diff
@@ -105,3 +105,7 @@ export function createClientCallMetricsInterceptor(metrics: IClientCallMetrics):
         return new grpc.InterceptingCall(nextCall(options), requester);
     };
 }
+
+export function isGrpcError(err: any): err is grpc.StatusObject {
+    return err.code && err.details;
+}
```
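The new helper is a TypeScript type guard: inside the true branch the compiler narrows `err` to `grpc.StatusObject`, so `code` and `details` are typed without casts. One caveat worth noting: `grpc.status.OK` is 0, so the truthiness check `err.code && err.details` treats an OK status object as a non-gRPC error, which is harmless on the error paths where the guard is used. A small usage sketch (`describeFailure` is hypothetical):

```ts
import * as grpc from "@grpc/grpc-js";
import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";

function describeFailure(err: unknown): string {
    if (isGrpcError(err)) {
        // err is narrowed to grpc.StatusObject here: code and details are typed.
        return `gRPC error ${grpc.status[err.code]}: ${err.details}`;
    }
    return String(err);
}
```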
components/server/src/billing/entitlement-service.ts (6 additions, 6 deletions)

```diff
@@ -122,7 +122,7 @@ export class EntitlementServiceImpl implements EntitlementService {
                     throw new Error("Unsupported billing mode: " + (billingMode as any).mode); // safety net
             }
         } catch (err) {
-            log.error({ userId: user.id }, "EntitlementService error: mayStartWorkspace", err);
+            log.warn({ userId: user.id }, "EntitlementService error: mayStartWorkspace", err);
             return {}; // When there is an EntitlementService error, we never want to break workspace starts
         }
     }
@@ -139,7 +139,7 @@ export class EntitlementServiceImpl implements EntitlementService {
                 return this.ubp.maySetTimeout(userId, organizationId);
             }
         } catch (err) {
-            log.error({ userId }, "EntitlementService error: maySetTimeout", err);
+            log.warn({ userId }, "EntitlementService error: maySetTimeout", err);
             return true;
         }
     }
@@ -154,7 +154,7 @@ export class EntitlementServiceImpl implements EntitlementService {
                 return this.ubp.getDefaultWorkspaceTimeout(userId, organizationId);
             }
         } catch (err) {
-            log.error({ userId }, "EntitlementService error: getDefaultWorkspaceTimeout", err);
+            log.warn({ userId }, "EntitlementService error: getDefaultWorkspaceTimeout", err);
             return WORKSPACE_TIMEOUT_DEFAULT_LONG;
         }
     }
@@ -169,7 +169,7 @@ export class EntitlementServiceImpl implements EntitlementService {
                 return this.ubp.getDefaultWorkspaceLifetime(userId, organizationId);
             }
         } catch (err) {
-            log.error({ userId }, "EntitlementService error: getDefaultWorkspaceLifetime", err);
+            log.warn({ userId }, "EntitlementService error: getDefaultWorkspaceLifetime", err);
             return WORKSPACE_LIFETIME_LONG;
         }
     }
@@ -188,7 +188,7 @@ export class EntitlementServiceImpl implements EntitlementService {
                 return this.ubp.limitNetworkConnections(userId, organizationId);
             }
         } catch (err) {
-            log.error({ userId }, "EntitlementService error: limitNetworkConnections", err);
+            log.warn({ userId }, "EntitlementService error: limitNetworkConnections", err);
             return false;
         }
     }
@@ -208,7 +208,7 @@ export class EntitlementServiceImpl implements EntitlementService {
                 return billingMode.paid ? "paid" : "free";
             }
         } catch (err) {
-            log.error({ userId }, "EntitlementService error: getBillingTier", err);
+            log.warn({ userId }, "EntitlementService error: getBillingTier", err);
             return "paid";
         }
     }
```
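All six call sites follow the same fail-open pattern: an EntitlementService failure is logged but never blocks a workspace start, because each method swallows the error and returns a safe default (an empty verdict, the most permissive timeout, and so on). Since the fallback is working as designed, warn rather than error is the appropriate log level. A condensed sketch of the pattern, with a hypothetical backend call standing in for the UBP delegate:

```ts
declare const log: { warn(ctx: object, msg: string, err: unknown): void };
declare function checkWithBillingBackend(userId: string, organizationId: string): Promise<boolean>;

// Fail-open: on any EntitlementService error, log a warning and return a
// default that keeps workspace starts working.
async function maySetTimeout(userId: string, organizationId: string): Promise<boolean> {
    try {
        return await checkWithBillingBackend(userId, organizationId);
    } catch (err) {
        log.warn({ userId }, "EntitlementService error: maySetTimeout", err);
        return true; // permissive default, matching the diff above
    }
}
```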
components/server/src/prometheus-metrics.ts (1 addition, 0 deletions)

```diff
@@ -200,6 +200,7 @@ export type FailedInstanceStartReason =
     | "clusterSelectionFailed"
     | "startOnClusterFailed"
     | "imageBuildFailed"
+    | "imageBuildFailedUser"
    | "resourceExhausted"
    | "workspaceClusterMaintenance"
    | "other";
```
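The new "imageBuildFailedUser" variant lets user-caused image build failures be counted separately from platform failures, so they can be excluded from alerting (see the "Don't report this as 'failed' to our metrics" comment in workspace-starter.ts below). A minimal sketch of how such a reason could label a Prometheus counter, assuming prom-client; the metric and helper names here are hypothetical:

```ts
import { Counter } from "prom-client";

type FailedInstanceStartReason = "imageBuildFailed" | "imageBuildFailedUser" | "other"; // abbreviated

const failedInstanceStarts = new Counter({
    name: "server_failed_instance_starts_total", // hypothetical metric name
    help: "Failed workspace instance starts, labeled by reason",
    labelNames: ["reason"],
});

function reportFailedInstanceStart(reason: FailedInstanceStartReason): void {
    failedInstanceStarts.inc({ reason });
}
```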
components/server/src/workspace/workspace-service.ts (1 addition, 4 deletions)

```diff
@@ -60,6 +60,7 @@ import { goDurationToHumanReadable } from "@gitpod/gitpod-protocol/lib/util/time
 import { HeadlessLogEndpoint, HeadlessLogService } from "./headless-log-service";
 import { Deferred } from "@gitpod/gitpod-protocol/lib/util/deferred";
 import { OrganizationService } from "../orgs/organization-service";
+import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";
 
 export interface StartWorkspaceOptions extends StarterStartWorkspaceOptions {
     /**
@@ -901,10 +902,6 @@ export class WorkspaceService {
 
 // TODO(gpl) Make private after FGA rollout
 export function mapGrpcError(err: Error): Error {
-    function isGrpcError(err: any): err is grpc.StatusObject {
-        return err.code && err.details;
-    }
-
     if (!isGrpcError(err)) {
         return err;
     }
```
components/server/src/workspace/workspace-starter.ts (34 additions, 32 deletions)

```diff
@@ -132,6 +132,7 @@ import { EnvVarService, ResolvedEnvVars } from "../user/env-var-service";
 import { RedlockAbortSignal } from "redlock";
 import { getExperimentsClientForBackend } from "@gitpod/gitpod-protocol/lib/experiments/configcat-server";
 import { ConfigProvider } from "./config-provider";
+import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";
 
 export interface StartWorkspaceOptions extends GitpodServer.StartWorkspaceOptions {
     excludeFeatureFlags?: NamedWorkspaceFeatureFlag[];
@@ -559,6 +560,7 @@ export class WorkspaceStarter {
                 additionalAuth,
                 forceRebuild,
                 forceRebuild,
+                abortSignal,
                 region,
             );
 
@@ -579,23 +581,23 @@ export class WorkspaceStarter {
             startRequest.setSpec(spec);
             startRequest.setServicePrefix(workspace.id);
 
-            if (instance.status.phase === "pending") {
-                // due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
-                const workspaceAlreadyExists = await this.existsWithWsManager(ctx, instance);
-                if (workspaceAlreadyExists) {
-                    log.debug(
-                        { instanceId: instance.id, workspaceId: instance.workspaceId },
-                        "workspace already exists, not starting again",
-                        { phase: instance.status.phase },
-                    );
-                    return;
-                }
-            }
-
             // choose a cluster and start the instance
             let resp: StartWorkspaceResponse.AsObject | undefined = undefined;
             let retries = 0;
             try {
+                if (instance.status.phase === "pending") {
+                    // due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
+                    const workspaceAlreadyExists = await this.existsWithWsManager(ctx, instance);
+                    if (workspaceAlreadyExists) {
+                        log.debug(
+                            { instanceId: instance.id, workspaceId: instance.workspaceId },
+                            "workspace already exists, not starting again",
+                            { phase: instance.status.phase },
+                        );
+                        return;
+                    }
+                }
+
                 for (; retries < MAX_INSTANCE_START_RETRIES; retries++) {
                     if (abortSignal.aborted) {
                         return;
@@ -659,6 +661,14 @@ export class WorkspaceStarter {
                 });
             }
         } catch (err) {
+            if (isGrpcError(err) && (err.code === grpc.status.UNAVAILABLE || err.code === grpc.status.ALREADY_EXISTS)) {
+                // fall-through: we don't want to fail but retry/wait for future updates to resolve this
+            } else if (!(err instanceof StartInstanceError)) {
+                // fallback in case we did not already handle this error
+                await this.failInstanceStart({ span }, err, workspace, instance, abortSignal);
+                err = new StartInstanceError("other", err); // don't throw because there's nobody catching it. We just want to log/trace it.
+            }
+
             this.logAndTraceStartWorkspaceError({ span }, logCtx, err);
         } finally {
             if (abortSignal.aborted) {
@@ -811,8 +821,9 @@ export class WorkspaceStarter {
         // We may have never actually started the workspace which means that ws-manager-bridge never set a workspace status.
         // We have to set that status ourselves.
         instance.status.phase = "stopped";
-        instance.stoppingTime = new Date().toISOString();
-        instance.stoppedTime = new Date().toISOString();
+        const now = new Date().toISOString();
+        instance.stoppingTime = now;
+        instance.stoppedTime = now;
 
         instance.status.conditions.failed = err.toString();
         instance.status.message = `Workspace cannot be started: ${err}`;
@@ -1201,6 +1212,7 @@ export class WorkspaceStarter {
         additionalAuth: Map<string, string>,
         ignoreBaseImageresolvedAndRebuildBase: boolean = false,
         forceRebuild: boolean = false,
+        abortSignal: RedlockAbortSignal,
         region?: WorkspaceRegion,
     ): Promise<WorkspaceInstance> {
         const span = TraceContext.startSpan("buildWorkspaceImage", ctx);
@@ -1302,6 +1314,7 @@ export class WorkspaceStarter {
                     additionalAuth,
                     true,
                     forceRebuild,
+                    abortSignal,
                     region,
                 );
             } else {
@@ -1338,24 +1351,8 @@ export class WorkspaceStarter {
             }
 
             // This instance's image build "failed" as well, so mark it as such.
-            const now = new Date().toISOString();
-            instance = await this.workspaceDb.trace({ span }).updateInstancePartial(instance.id, {
-                status: { ...instance.status, phase: "stopped", conditions: { failed: message }, message },
-                stoppedTime: now,
-                stoppingTime: now,
-            });
-
-            // Mark the PrebuildWorkspace as failed
-            await this.failPrebuildWorkspace({ span }, err, workspace);
+            await this.failInstanceStart({ span }, err, workspace, instance, abortSignal);
 
-            // Publish updated workspace instance
-            await this.publisher.publishInstanceUpdate({
-                workspaceID: workspace.ownerId,
-                instanceID: instance.id,
-                ownerID: workspace.ownerId,
-            });
-
-            TraceContext.setError({ span }, err);
             const looksLikeUserError = (msg: string): boolean => {
                 return msg.startsWith("build failed:") || msg.includes("headless task failed:");
             };
@@ -1365,6 +1362,8 @@ export class WorkspaceStarter {
                     `workspace image build failed: ${message}`,
                     { looksLikeUserError: true },
                 );
+                err = new StartInstanceError("imageBuildFailedUser", err);
+                // Don't report this as "failed" to our metrics as it would trigger an alert
             } else {
                 log.error(
                     { instanceId: instance.id, userId: user.id, workspaceId: workspace.id },
@@ -1963,6 +1962,9 @@ export class WorkspaceStarter {
             await client.describeWorkspace(ctx, req);
             return true;
         } catch (err) {
+            if (isClusterMaintenanceError(err)) {
+                throw err;
+            }
             return false;
         }
     }
```
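The change to existsWithWsManager (last hunk) distinguishes "ws-manager does not know this workspace" from "the cluster is in maintenance and cannot answer": the former still returns false so the start proceeds, while a maintenance error now propagates to the caller, which can surface it with the dedicated workspaceClusterMaintenance reason instead of treating the workspace as non-existent. A sketch of the resulting method, with stand-ins for the ws-manager client and the error guard:

```ts
declare function describeWorkspace(instanceId: string): Promise<void>;
declare function isClusterMaintenanceError(err: unknown): boolean;

async function existsWithWsManager(instanceId: string): Promise<boolean> {
    try {
        // Succeeds only if ws-manager already knows the workspace.
        await describeWorkspace(instanceId);
        return true;
    } catch (err) {
        if (isClusterMaintenanceError(err)) {
            // Can't tell either way: surface the maintenance error so the
            // caller aborts the start instead of proceeding blindly.
            throw err;
        }
        return false; // e.g. NOT_FOUND: safe to start
    }
}
```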
