From b3f0a95ad244d4728f92a5b816c350a7a244292e Mon Sep 17 00:00:00 2001 From: 0xnirmal Date: Mon, 23 Dec 2024 11:22:52 -0800 Subject: [PATCH] add queries + support GPU utilization in /graph-data ---
Review notes (commentary only; placed between the "---" separator and the diffstat, so outside the commit message and outside every hunk):
* helpers.ts: adds two cache keys, getGpuUtilization and getGpuBreakdown; the new service passes them to cacheResponse together with 60 * 5 (commented there as 5 minutes).
* gpuBreakdownService.ts, getGpuUtilization: aggregates provider snapshot columns per "day" row (GROUP BY d."date") and returns { currentValue, compareValue, snapshots }. Each snapshot is { date, value: gpuUtilization }; currentValue and compareValue are the last and second-to-last snapshot values, or 0 when missing.
* NOTE(review): type GpuUtilizationData declares "count", but the query aliases that column as provider_count, and no use of the type is visible in this text. Confirm the type argument passed to chainDb.query and align the field name.
* NOTE(review): in getGpuUtilization the inner SELECT applies DISTINCT ON("hostUri", DATE("checkDate")) after joining providerSnapshotNode and providerSnapshotNodeGPU, so a single joined row is kept per hostUri and day. COUNT(DISTINCT "nodeId") can therefore see at most one node per hostUri per day, and the ORDER BY does not determine which node row is kept. The LEFT JOIN on providerSnapshotNodeGPU contributes no selected column. Confirm node_count is meant to be computed this way.
* NOTE(review): the INNER JOIN requires n."gpuAllocatable" > 0, so "cpuUtilization" and "cpu" only cover snapshots that have at least one such node. Confirm that is intended.
* NOTE(review): ps."isOnline" is selected in both sub-selects but is never read by the outer queries.
* gpuBreakdownService.ts, getGpuBreakdownByVendorAndModel: groups by day, gpu."vendor" and gpu."name" (NULLs are reported as 'Unknown'). leased_gpus sums n."gpuAllocated" divided by the node's GPU row count, once per joined GPU row. Rows are renamed to camelCase fields before being returned.
* NOTE(review): the first query casts with ::float / ::integer, while this one returns COUNT(...) values and a numeric(10,2) for fields typed as number. Confirm the database driver delivers these as JavaScript numbers rather than strings.
* gpuBreakdownService.ts, getLatestGpuBreakdown: keeps only the rows whose date equals the maximum date found in the full breakdown. NOTE(review): it calls date.getTime(), so it assumes values returned through cacheResponse are still Date instances. Confirm against the cache implementation.
* statsService.ts: adds "gpuUtilization" to the authorized graph data names, returns getGpuUtilization() directly for that name from getGraphData, and removes the console.log call.
apps/api/src/caching/helpers.ts | 4 +- .../src/services/db/gpuBreakdownService.ts | 163 ++++++++++++++++++ apps/api/src/services/db/statsService.ts | 11 +- 3 files changed, 173 insertions(+), 5 deletions(-) create mode 100644 apps/api/src/services/db/gpuBreakdownService.ts diff --git a/apps/api/src/caching/helpers.ts b/apps/api/src/caching/helpers.ts index ce8ad0ead..cdd103cf3 100644 --- a/apps/api/src/caching/helpers.ts +++ b/apps/api/src/caching/helpers.ts @@ -85,5 +85,7 @@ export const cacheKeys = { getTestnetVersion: "getTestnetVersion", getSandboxVersion: "getSandboxVersion", getGpuModels: "getGpuModels", - getTrialProviders: "getTrialProviders" + getTrialProviders: "getTrialProviders", + getGpuUtilization: "getGpuUtilization", + getGpuBreakdown: "getGpuBreakdown" }; diff --git a/apps/api/src/services/db/gpuBreakdownService.ts b/apps/api/src/services/db/gpuBreakdownService.ts new file mode 100644 index 000000000..b88b8a7b4 --- /dev/null +++ b/apps/api/src/services/db/gpuBreakdownService.ts @@ -0,0 +1,163 @@ +import { cacheKeys, cacheResponse } from "@src/caching/helpers"; +import { chainDb } from "@src/db/dbConnection"; +import { QueryTypes } from "sequelize"; + +type GpuUtilizationData = { + date: Date; + cpuUtilization: number; + cpu: number; + gpuUtilization: number; + gpu: number; + count: number; + node_count: number; +}; + +type GpuBreakdownData = { + date: Date; + vendor: string; + model: string; + providerCount: number; + nodeCount: number; + totalGpus: number; + leasedGpus: number; + gpuUtilization: number; +}; + +export async function getGpuUtilization() { + return await cacheResponse( + 60 * 5, // 5 minutes + cacheKeys.getGpuUtilization, + async () => { + const result = await chainDb.query( + `SELECT + d."date", + ROUND( + COALESCE((SUM("activeCPU") + 
SUM("pendingCPU")) * 100.0 / + NULLIF(SUM("activeCPU") + SUM("pendingCPU") + SUM("availableCPU"), 0), 0), + 2 + )::float AS "cpuUtilization", + COALESCE(SUM("activeCPU") + SUM("pendingCPU") + SUM("availableCPU"), 0)::integer AS "cpu", + ROUND( + COALESCE((SUM("activeGPU") + SUM("pendingGPU")) * 100.0 / + NULLIF(SUM("activeGPU") + SUM("pendingGPU") + SUM("availableGPU"), 0), 0), + 2 + )::float AS "gpuUtilization", + COALESCE(SUM("activeGPU") + SUM("pendingGPU") + SUM("availableGPU"), 0)::integer AS "gpu", + COUNT(*) as provider_count, + COALESCE(COUNT(DISTINCT "nodeId"), 0) as node_count + FROM "day" d + INNER JOIN ( + SELECT DISTINCT ON("hostUri",DATE("checkDate")) + DATE("checkDate") AS date, + ps."activeCPU", ps."pendingCPU", ps."availableCPU", + ps."activeGPU", ps."pendingGPU", ps."availableGPU", + ps."isOnline", + n.id as "nodeId" + FROM "providerSnapshot" ps + INNER JOIN "provider" ON "provider"."owner"=ps."owner" + INNER JOIN "providerSnapshotNode" n ON n."snapshotId"=ps.id AND n."gpuAllocatable" > 0 + LEFT JOIN "providerSnapshotNodeGPU" gpu ON gpu."snapshotNodeId" = n.id + WHERE ps."isLastSuccessOfDay" = TRUE + ORDER BY "hostUri",DATE("checkDate"),"checkDate" DESC + ) "dailyProviderStats" + ON DATE(d."date")="dailyProviderStats"."date" + GROUP BY d."date" + ORDER BY d."date" ASC`, + { + type: QueryTypes.SELECT + } + ); + + const stats = result.map(day => ({ + date: day.date, + value: day.gpuUtilization + })); + + return { + currentValue: stats[stats.length - 1]?.value ?? 0, + compareValue: stats[stats.length - 2]?.value ?? 
0, + snapshots: stats + }; + }, + true + ); +} + +export async function getGpuBreakdownByVendorAndModel(): Promise { + return await cacheResponse( + 60 * 5, // 5 minutes + cacheKeys.getGpuBreakdown, + async () => { + const result = await chainDb.query<{ + date: Date; + vendor: string; + model: string; + provider_count: number; + node_count: number; + total_gpus: number; + leased_gpus: number; + gpuUtilization: number; + }>( + `SELECT + d."date", + COALESCE(gpu."vendor", 'Unknown') as "vendor", + COALESCE(gpu."name", 'Unknown') as "model", + COALESCE(COUNT(DISTINCT "dailyProviderStats"."hostUri"), 0) as provider_count, + COALESCE(COUNT(DISTINCT n.id), 0) as node_count, + COALESCE(COUNT(gpu.id), 0) as total_gpus, + COALESCE(CAST(ROUND(SUM( + CAST(n."gpuAllocated" as float) / + NULLIF((SELECT COUNT(*) + FROM "providerSnapshotNodeGPU" subgpu + WHERE subgpu."snapshotNodeId" = n.id), 0) + )) as int), 0) as leased_gpus, + CAST(COALESCE( + SUM( + CAST(n."gpuAllocated" as float) / + NULLIF((SELECT COUNT(*) + FROM "providerSnapshotNodeGPU" subgpu + WHERE subgpu."snapshotNodeId" = n.id), 0) + ) * 100.0 / NULLIF(COUNT(gpu.id), 0) + , 0) as numeric(10,2)) as "gpuUtilization" + FROM "day" d + INNER JOIN ( + SELECT DISTINCT ON("hostUri", DATE("checkDate")) + ps.id as "snapshotId", + "hostUri", + DATE("checkDate") AS date, + ps."isOnline" + FROM "providerSnapshot" ps + INNER JOIN "provider" ON "provider"."owner" = ps."owner" + WHERE ps."isLastSuccessOfDay" = TRUE + ORDER BY "hostUri", DATE("checkDate"), "checkDate" DESC + ) "dailyProviderStats" ON DATE(d."date") = "dailyProviderStats"."date" + INNER JOIN "providerSnapshotNode" n ON n."snapshotId" = "dailyProviderStats"."snapshotId" AND n."gpuAllocatable" > 0 + LEFT JOIN "providerSnapshotNodeGPU" gpu ON gpu."snapshotNodeId" = n.id + GROUP BY d."date", gpu."vendor", gpu."name" + ORDER BY d."date" ASC, gpu."vendor", gpu."name"`, + { + type: QueryTypes.SELECT + } + ); + + return result.map(row => ({ + date: row.date, + vendor: 
row.vendor, + model: row.model, + providerCount: row.provider_count, + nodeCount: row.node_count, + totalGpus: row.total_gpus, + leasedGpus: row.leased_gpus, + gpuUtilization: row.gpuUtilization + })); + }, + true + ); +} + +export async function getLatestGpuBreakdown(): Promise { + const allData = await getGpuBreakdownByVendorAndModel(); + const latestDate = allData.reduce((latest, current) => (latest > current.date ? latest : current.date), new Date(0)); + + return allData.filter(data => data.date.getTime() === latestDate.getTime()); +} diff --git a/apps/api/src/services/db/statsService.ts b/apps/api/src/services/db/statsService.ts index 36ce05ca3..b3d7988a0 100644 --- a/apps/api/src/services/db/statsService.ts +++ b/apps/api/src/services/db/statsService.ts @@ -7,6 +7,7 @@ import { cacheKeys, cacheResponse } from "@src/caching/helpers"; import { chainDb } from "@src/db/dbConnection"; import { ProviderActiveLeasesStats, ProviderStats, ProviderStatsKey } from "@src/types/graph"; import { env } from "@src/utils/env"; +import { getGpuUtilization } from "./gpuBreakdownService"; type GraphData = { currentValue: number; @@ -90,7 +91,8 @@ type AuthorizedGraphDataName = | "activeCPU" | "activeGPU" | "activeMemory" - | "activeStorage"; + | "activeStorage" + | "gpuUtilization"; export const AuthorizedGraphDataNames: AuthorizedGraphDataName[] = [ "dailyUAktSpent", @@ -105,7 +107,8 @@ export const AuthorizedGraphDataNames: AuthorizedGraphDataName[] = [ "activeCPU", "activeGPU", "activeMemory", - "activeStorage" + "activeStorage", + "gpuUtilization" ]; export function isValidGraphDataName(x: string): x is AuthorizedGraphDataName { @@ -113,8 +116,6 @@ export function isValidGraphDataName(x: string): x is AuthorizedGraphDataName { } export async function getGraphData(dataName: AuthorizedGraphDataName): Promise { - console.log("getGraphData: " + dataName); - let attributes: (keyof Block)[] = []; let isRelative = false; let getter: (block: Block) => number = null; @@ -144,6 +145,8 
@@ export async function getGraphData(dataName: AuthorizedGraphDataName): Promise block.activeEphemeralStorage + block.activePersistentStorage; break; + case "gpuUtilization": + return await getGpuUtilization(); default: attributes = [dataName]; getter = (block: Block) => block[dataName];