Skip to content

Commit

Permalink
work in progress to support GCP H100's
Browse files Browse the repository at this point in the history
  • Loading branch information
williamstein committed Jan 28, 2025
1 parent b289626 commit 20a75e5
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 39 deletions.
4 changes: 2 additions & 2 deletions src/packages/frontend/compute/action.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { Alert, Button, Modal, Popconfirm, Popover, Spin } from "antd";
import { useEffect, useState } from "react";

import { redux, useStore } from "@cocalc/frontend/app-framework";
import { A, CopyToClipBoard, Icon } from "@cocalc/frontend/components";
import ShowError from "@cocalc/frontend/components/error";
Expand All @@ -13,6 +12,7 @@ import {
ACTION_INFO,
STATE_INFO,
getTargetState,
getArchitecture,
} from "@cocalc/util/db-schema/compute-servers";
import { computeServerAction, getApiKey } from "./api";
import costPerHour from "./cost";
Expand Down Expand Up @@ -62,7 +62,7 @@ export default function getActions({
if (configuration.cloud != "google-cloud") {
continue;
}
if (configuration.machineType.startsWith("t2a-")) {
if (getArchitecture(configuration) == "arm64") {
// TODO: suspend/resume breaks the clock badly on ARM64, and I haven't
// figured out a workaround, so don't support it for now. I guess this
// is a GCP bug.
Expand Down
90 changes: 56 additions & 34 deletions src/packages/frontend/compute/google-cloud-config.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ import type {
} from "@cocalc/util/db-schema/compute-servers";
import { reloadImages, useImages, useGoogleImages } from "./images-hook";
import { GOOGLE_CLOUD_DEFAULTS } from "@cocalc/util/db-schema/compute-servers";
import { getMinDiskSizeGb } from "@cocalc/util/db-schema/compute-servers";
import {
getMinDiskSizeGb,
getArchitecture,
} from "@cocalc/util/db-schema/compute-servers";
import {
Alert,
Button,
Expand Down Expand Up @@ -301,9 +304,7 @@ export default function GoogleCloudConfiguration({
!!(configuration.acceleratorType && configuration.acceleratorCount)
}
googleImages={googleImages}
arch={
configuration.machineType?.startsWith("t2a-") ? "arm64" : "x86_64"
}
arch={getArchitecture(configuration)}
/>
),
},
Expand Down Expand Up @@ -937,25 +938,23 @@ function Zone({ priceData, setConfig, configuration, disabled }) {

function MachineType({ priceData, setConfig, configuration, disabled, state }) {
const [archType, setArchType] = useState<"x86_64" | "arm64">(
configuration.machineType?.startsWith("t2a-") ? "arm64" : "x86_64",
getArchitecture(configuration),
);
const [sortByPrice, setSortByPrice] = useState<boolean>(true);
const [newMachineType, setNewMachineType] = useState<string>(
configuration.machineType ?? "",
);
useEffect(() => {
setNewMachineType(configuration.machineType);
setArchType(
configuration.machineType?.startsWith("t2a-") ? "arm64" : "x86_64",
);
setArchType(getArchitecture(configuration));
}, [configuration.machineType]);
useEffect(() => {
if (archType == "arm64" && !configuration.machineType.startsWith("t2a-")) {
if (archType == "arm64" && getArchitecture(configuration) != "arm64") {
setNewMachineType("t2a-standard-4");
setConfig({ machineType: "t2a-standard-4" });
return;
}
if (archType == "x86_64" && configuration.machineType.startsWith("t2a-")) {
if (archType == "x86_64" && getArchitecture(configuration) == "arm64") {
setNewMachineType("t2d-standard-4");
setConfig({ machineType: "t2d-standard-4" });
return;
Expand All @@ -967,13 +966,13 @@ function MachineType({ priceData, setConfig, configuration, disabled, state }) {
.filter((machineType) => {
const { acceleratorType } = configuration;
if (!acceleratorType) {
if (machineType.startsWith("g2-") || machineType.startsWith("a2-")) {
if (machineType.startsWith("g") || machineType.startsWith("a")) {
return false;
}
if (archType == "arm64" && !machineType.startsWith("t2a-")) {
if (archType == "arm64" && getArchitecture(configuration) != "arm64") {
return false;
}
if (archType == "x86_64" && machineType.startsWith("t2a-")) {
if (archType == "x86_64" && getArchitecture(configuration) == "arm64") {
return false;
}
} else {
Expand Down Expand Up @@ -1052,8 +1051,8 @@ function MachineType({ priceData, setConfig, configuration, disabled, state }) {
(state ?? "deprovisioned") != "deprovisioned"
? "Can only be changed when machine is deprovisioned"
: archType == "x86_64"
? "Intel or AMD X86_64 architecture machines"
: "ARM64 architecture machines"
? "Intel or AMD X86_64 architecture machines"
: "ARM64 architecture machines"
}
>
<Radio.Group
Expand Down Expand Up @@ -1161,6 +1160,9 @@ const ACCELERATOR_TYPES = [
"nvidia-l4",
"nvidia-tesla-a100",
"nvidia-a100-80gb",
"nvidia-h100-80gb",
"nvidia-h200-141gb",
// these are too hard to properly keep software image for:
// "nvidia-tesla-v100",
//"nvidia-tesla-p100",
//"nvidia-tesla-p4",
Expand Down Expand Up @@ -1188,12 +1190,16 @@ function GPU({
<div style={{ color: "#666", marginBottom: "5px" }}>
<b>
<Icon style={{ float: "right", fontSize: "50px" }} name="gpu" />
<Icon name="cube" /> NVIDIA GPU:{" "}
<A href="https://www.nvidia.com/en-us/data-center/a100/">A100</A>,{" "}
<A href="https://www.nvidia.com/en-us/data-center/l4/">L4</A>,{" "}
<A href="https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/solutions/resources/documents1/Datasheet_NVIDIA_T4_Virtualization.pdf">
T4
</A>
<Icon name="cube" /> NVIDIA GPU{" "}
<div style={{ float: "right" }}>
<A href="https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/solutions/resources/documents1/Datasheet_NVIDIA_T4_Virtualization.pdf">
T4
</A>
, <A href="https://www.nvidia.com/en-us/data-center/l4/">L4</A>,{" "}
<A href="https://www.nvidia.com/en-us/data-center/a100/">A100</A>,{" "}
<A href="https://www.nvidia.com/en-us/data-center/h100/">H100</A>,{" "}
<A href="https://www.nvidia.com/en-us/data-center/h200/">H200</A>
</div>
</b>
</div>
);
Expand Down Expand Up @@ -1377,23 +1383,16 @@ function ensureConsistentConfiguration(
) {
const newConfiguration = { ...configuration, ...changes };
const newChanges = { ...changes };

ensureConsistentImage(newConfiguration, newChanges, IMAGES);

ensureConsistentAccelerator(priceData, newConfiguration, newChanges);

ensureConsistentNvidiaL4andA100(priceData, newConfiguration, newChanges);

ensureConsistentZoneWithRegion(priceData, newConfiguration, newChanges);
ensureConsistentRegionAndZoneWithMachineType(
priceData,
newConfiguration,
newChanges,
);

ensureConsistentZoneWithRegion(priceData, newConfiguration, newChanges);

ensureSufficientDiskSize(newConfiguration, newChanges, IMAGES);

ensureConsistentDiskType(priceData, newConfiguration, newChanges);

return newChanges;
Expand Down Expand Up @@ -1453,13 +1452,15 @@ function ensureConsistentZoneWithRegion(priceData, configuration, changes) {
// currently changing region, so set a zone that matches the region
for (const zone in priceData.zones) {
if (zone.startsWith(configuration.region)) {
changes["zone"] = zone;
configuration["zone"] = changes["zone"] = zone;
break;
}
}
} else {
// probably changing the zone, so set the region from the zone
changes["region"] = zoneToRegion(configuration.zone);
configuration["region"] = changes["region"] = zoneToRegion(
configuration.zone,
);
}
}

Expand Down Expand Up @@ -1562,13 +1563,17 @@ function ensureZoneIsConsistentWithGPU(priceData, configuration, changes) {
}
}

// The Nvidia L4 and A100 are a little different
// The Nvidia L4 and A100 are a little different, etc.
function ensureConsistentNvidiaL4andA100(priceData, configuration, changes) {
const { machineType, acceleratorType } = configuration;

// L4 or A100 GPU machine type, but switching to no GPU, so we have
// to change the machine type
if (machineType.startsWith("g2-") || machineType.startsWith("a2-")) {
if (
machineType.startsWith("g2-") ||
machineType.startsWith("a2-") ||
machineType.startsWith("a3-")
) {
if (!acceleratorType) {
// Easy case -- the user is explicitly changing the GPU from being set
// to NOT be set, and the GPU is L4 or A100. In this case,
Expand All @@ -1579,6 +1584,8 @@ function ensureConsistentNvidiaL4andA100(priceData, configuration, changes) {
}
}
if (
acceleratorType != "nvidia-h200-141gb" &&
acceleratorType != "nvidia-h100-80gb" &&
acceleratorType != "nvidia-tesla-a100" &&
acceleratorType != "nvidia-a100-80gb" &&
acceleratorType != "nvidia-l4"
Expand All @@ -1602,6 +1609,21 @@ function ensureConsistentNvidiaL4andA100(priceData, configuration, changes) {
priceData.accelerators[acceleratorType]?.machineType[
configuration.acceleratorCount
];

if (machineTypes == null) {
// maybe 1 gpu isn't allowed, e.g., with H200
const machineType = priceData.accelerators[acceleratorType]?.machineType;
if (machineType != null) {
for (const count in machineType) {
configuration.acceleratorCount = changes.acceleratorCount =
parseInt(count);
machineTypes =
priceData.accelerators[acceleratorType]?.machineType[
configuration.acceleratorCount
];
}
}
}
}
if (machineTypes == null) {
throw Error("bug -- this can't happen");
Expand All @@ -1626,7 +1648,7 @@ function ensureConsistentRegionAndZoneWithMachineType(
const machineType = configuration["machineType"];
if (priceData.machineTypes[machineType] == null) {
console.warn(
`BUG -- This should never happen: unknonwn machineType = '${machineType}'`,
`BUG -- This should never happen: unknown machineType = '${machineType}'`,
);
// invalid machineType
if (configuration.acceleratorType) {
Expand Down
3 changes: 2 additions & 1 deletion src/packages/server/compute/cloud/google-cloud/images.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import type {
GoogleCloudImage,
GoogleCloudImages,
} from "@cocalc/util/db-schema/compute-servers";
import { getArchitecture as getArchitecture0 } from "@cocalc/util/db-schema/compute-servers";
import { makeValidGoogleName } from "@cocalc/util/db-schema/compute-servers";
import { cmp } from "@cocalc/util/misc";
import { getGoogleCloudImagePrefix } from "./index";
Expand Down Expand Up @@ -158,7 +159,7 @@ export async function deleteImage(name: string) {
}

export function getArchitecture(machineType: string): Architecture {
return machineType.startsWith("t2a-") ? "arm64" : "x86_64";
return getArchitecture0({ machineType, cloud: "google-cloud" } as any);
}

export async function getSourceImage({
Expand Down
2 changes: 1 addition & 1 deletion src/packages/util/compute/cloud/clouds.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ export const GOOGLE_CLOUD_DEFAULTS = {
machineType: "n2d-highmem-2",
spot: GCLOUD_SPOT_DEFAULT,
diskSizeGb: 10,
diskType: "pd-balanced",
diskType: "pd-standard",
externalIp: true,
excludeFromSync: DEFAULT_EXCLUDE_FROM_SYNC,
},
Expand Down
25 changes: 24 additions & 1 deletion src/packages/util/db-schema/compute-servers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -251,11 +251,30 @@ for (const state of ORDERED_STATES) {
n += 1;
}

/**
 * Determine the CPU architecture of a compute server from its configuration.
 *
 * - "onprem" servers report their own `arch`, defaulting to "x86_64" when unset.
 * - Any cloud other than "google-cloud" is treated as "x86_64".
 * - For "google-cloud" the architecture is derived from the machine family,
 *   i.e., the part of `machineType` before the first "-".
 *
 * @param configuration - the compute server configuration to inspect
 * @returns "arm64" for ARM machines, otherwise "x86_64"
 */
export function getArchitecture(configuration: Configuration): Architecture {
  if (configuration.cloud == "onprem") {
    return configuration.arch ?? "x86_64";
  }
  if (configuration.cloud != "google-cloud") {
    // no ARM outside of GCP right now
    return "x86_64";
  }
  // Tolerate a missing machineType (e.g., a partially filled-in configuration)
  // instead of throwing: with no machine family to inspect we fall through to
  // the x86_64 default below.
  const family = configuration.machineType?.split("-")[0] ?? "";
  if (family.endsWith("a")) {
    // The known ARM machine families are: t2a-, c4a-
    // Everything else ends with a number or d.
    // Hopefully this pattern persists.
    return "arm64";
  }
  return "x86_64";
}

function supportsSuspend(configuration: Configuration) {
if (configuration.cloud != "google-cloud") {
return false;
}
if (configuration.machineType.startsWith("t2a-")) {
if (getArchitecture(configuration) != "x86_64") {
// TODO: suspend/resume breaks the clock badly on ARM64, and I haven't
// figured out a workaround, so don't support it for now. I guess this
// is a GCP bug.
Expand Down Expand Up @@ -699,6 +718,8 @@ interface FluidStackConfiguration extends BaseConfiguration {
os: string;
}
export type GoogleCloudAcceleratorType =
| "nvidia-h200-141gb"
| "nvidia-h100-80gb"
| "nvidia-a100-80gb"
| "nvidia-tesla-a100"
| "nvidia-l4"
Expand All @@ -708,6 +729,8 @@ export type GoogleCloudAcceleratorType =
| "nvidia-tesla-p100";

export const GOOGLE_CLOUD_ACCELERATOR_TYPES: GoogleCloudAcceleratorType[] = [
"nvidia-h200-141gb",
"nvidia-h100-80gb",
"nvidia-a100-80gb",
"nvidia-tesla-a100",
"nvidia-l4",
Expand Down

0 comments on commit 20a75e5

Please sign in to comment.