From d9f9c8aaf51f2a1b46d30c7392fec0094972fee2 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 3 Nov 2023 17:09:16 +0100 Subject: [PATCH 01/47] fix: retrigger gql api at manual refresh - solves #221 --- web/frontend/src/joblist/JobList.svelte | 2 +- web/frontend/src/joblist/Row.svelte | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/web/frontend/src/joblist/JobList.svelte b/web/frontend/src/joblist/JobList.svelte index 02caf3f4..2484f299 100644 --- a/web/frontend/src/joblist/JobList.svelte +++ b/web/frontend/src/joblist/JobList.svelte @@ -89,7 +89,7 @@ // Force refresh list with existing unchanged variables (== usually would not trigger reactivity) export function refresh() { - queryStore({ + jobs = queryStore({ client: client, query: query, variables: { paging, sorting, filter }, diff --git a/web/frontend/src/joblist/Row.svelte b/web/frontend/src/joblist/Row.svelte index 2117b914..6573b57d 100644 --- a/web/frontend/src/joblist/Row.svelte +++ b/web/frontend/src/joblist/Row.svelte @@ -64,11 +64,12 @@ variables: { id, metrics, scopes } }); - function refresh() { - queryStore({ + export function refresh() { + metricsQuery = queryStore({ client: client, query: query, - variables: { id, metrics, scopes } + variables: { id, metrics, scopes }, + // requestPolicy: 'network-only' // use default cache-first for refresh }); } From bf64fc5213729673eff19a24f5c694b4419bfe27 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 13 Nov 2023 13:43:44 +0100 Subject: [PATCH 02/47] Add completed state indicator --- web/frontend/src/joblist/JobInfo.svelte | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/web/frontend/src/joblist/JobInfo.svelte b/web/frontend/src/joblist/JobInfo.svelte index 83841c6c..b7ca32ac 100644 --- a/web/frontend/src/joblist/JobInfo.svelte +++ b/web/frontend/src/joblist/JobInfo.svelte @@ -28,6 +28,17 @@ return `${hours}:${('0' + minutes).slice(-2)}:${('0' + seconds).slice(-2)}`; } + function getStateColor(state) { + switch (state) { + case 'running': + return 'success' + case 'completed': + return 'primary' + default: + return 'danger' + } + } +
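Patch 01 above fixes the stale job list by reassigning the query store rather than merely calling queryStore() again. A minimal sketch of the pattern, with illustrative names and a placeholder query (neither is from the patch): Svelte reactivity is assignment-driven, so the fresh assignment hands the template a new store to subscribe to, and the query re-executes even though the variables are unchanged.

import { queryStore, gql, getContextClient } from '@urql/svelte';

const client = getContextClient();
const query = gql`query { __typename }`; // placeholder query, sketch only

let jobs = queryStore({ client, query, variables: {} });

export function refresh() {
  // A bare `queryStore({...})` call creates a store nothing subscribes to;
  // the assignment is what makes Svelte re-render and re-run the query.
  jobs = queryStore({ client, query, variables: {} });
}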
@@ -86,12 +97,7 @@

Start: {(new Date(job.startTime)).toLocaleString()}
-        Duration: {formatDuration(job.duration)}
-        {#if job.state == 'running'}
-            <Badge color="success">running</Badge>
-        {:else if job.state != 'completed'}
-            <Badge color="danger">{job.state}</Badge>
-        {/if}
+        Duration: {formatDuration(job.duration)}
+        <Badge color={getStateColor(job.state)}>{job.state}</Badge>
         {#if job.walltime}
Walltime: {formatDuration(job.walltime)} From 84d6b4835360e7435a79df5918b944be73ae9ab2 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Wed, 15 Nov 2023 15:03:58 +0100 Subject: [PATCH 03/47] Fix: default values and new option for time filter --- web/frontend/src/filters/Duration.svelte | 95 +++++++++++++++++++---- web/frontend/src/filters/Filters.svelte | 24 +++++- web/frontend/src/filters/StartTime.svelte | 36 ++++----- 3 files changed, 118 insertions(+), 37 deletions(-) diff --git a/web/frontend/src/filters/Duration.svelte b/web/frontend/src/filters/Duration.svelte index b482b9c4..ca2ce453 100644 --- a/web/frontend/src/filters/Duration.svelte +++ b/web/frontend/src/filters/Duration.svelte @@ -1,18 +1,23 @@ (isOpen = !isOpen)}> - Select Start Time + Select Job Duration -

Between
+ Duration more than
+ [hour and minute number inputs, labeled "h" and "m"]
+
+ Duration less than
+ [hour and minute number inputs, labeled "h" and "m"]
+
+ Duration between
[two pairs of hour and minute number inputs, labeled "h" and "m"]
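All three duration options above collapse into one seconds-based {from, to} range; a condensed sketch of the mapping Filters.svelte applies later in this patch (the helper name durationRange is illustrative; 604800 s is the 7-day cap the patch uses for "more than"):

function durationRange({ lessThan, moreThan, from, to }) {
  if (lessThan) return { from: 0, to: lessThan };      // "less than": 0 .. lessThan
  if (moreThan) return { from: moreThan, to: 604800 }; // "more than": moreThan .. 7 days
  if (from || to) return { from, to };                 // "between": as entered
  return null;                                         // no duration filter set
}

durationRange({ lessThan: 3600 }); // { from: 0, to: 3600 }
durationRange({ moreThan: 7200 }); // { from: 7200, to: 604800 }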
@@ -77,19 +129,30 @@ + + dispatch('update', { lessThan, moreThan, from, to }) + }}>Reset Filter diff --git a/web/frontend/src/filters/Filters.svelte b/web/frontend/src/filters/Filters.svelte index 38d7e7a6..49eaed61 100644 --- a/web/frontend/src/filters/Filters.svelte +++ b/web/frontend/src/filters/Filters.svelte @@ -41,7 +41,7 @@ states: filterPresets.states || filterPresets.state ? [filterPresets.state].flat() : allJobStates, startTime: filterPresets.startTime || { from: null, to: null }, tags: filterPresets.tags || [], - duration: filterPresets.duration || { from: null, to: null }, + duration: filterPresets.duration || { lessThan: null, moreThan: null, from: null, to: null }, jobId: filterPresets.jobId || '', arrayJobId: filterPresets.arrayJobId || null, user: filterPresets.user || '', @@ -88,6 +88,10 @@ items.push({ tags: filters.tags }) if (filters.duration.from || filters.duration.to) items.push({ duration: { from: filters.duration.from, to: filters.duration.to } }) + if (filters.duration.lessThan) + items.push({ duration: { from: 0, to: filters.duration.lessThan } }) + if (filters.duration.moreThan) + items.push({ duration: { from: filters.duration.moreThan, to: 604800 } }) // 7 days to include special jobs with long runtimes if (filters.jobId) items.push({ jobId: { [filters.jobIdMatch]: filters.jobId } }) if (filters.arrayJobId != null) @@ -144,6 +148,10 @@ opts.push(`tag=${tag}`) if (filters.duration.from && filters.duration.to) opts.push(`duration=${filters.duration.from}-${filters.duration.to}`) + if (filters.duration.lessThan) + opts.push(`duration=0-${filters.duration.lessThan}`) + if (filters.duration.moreThan) + opts.push(`duration=${filters.duration.moreThan}-604800`) if (filters.numNodes.from && filters.numNodes.to) opts.push(`numNodes=${filters.numNodes.from}-${filters.numNodes.to}`) if (filters.numAccelerators.from && filters.numAccelerators.to) @@ -267,6 +275,18 @@ {/if} + {#if filters.duration.lessThan} + (isDurationOpen = true)}> + Duration less than {Math.floor(filters.duration.lessThan / 3600)}h:{Math.floor(filters.duration.lessThan % 3600 / 60)}m + + {/if} + + {#if filters.duration.moreThan} + (isDurationOpen = true)}> + Duration more than {Math.floor(filters.duration.moreThan / 3600)}h:{Math.floor(filters.duration.moreThan % 3600 / 60)}m + + {/if} + {#if filters.tags.length != 0} (isTagsOpen = true)}> {#each filters.tags as tagId} @@ -325,6 +345,8 @@ update()} /> diff --git a/web/frontend/src/filters/StartTime.svelte b/web/frontend/src/filters/StartTime.svelte index c89851d4..59f85134 100644 --- a/web/frontend/src/filters/StartTime.svelte +++ b/web/frontend/src/filters/StartTime.svelte @@ -1,5 +1,6 @@ @@ -73,7 +69,7 @@ on:click={() => { isOpen = false from = toRFC3339(pendingFrom) - to = toRFC3339(pendingTo, 59) + to = toRFC3339(pendingTo, '59') dispatch('update', { from, to }) }}> Close & Apply From 9689f95ea11dd910b293392a1373d0c683965338 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 16 Nov 2023 12:49:20 +0100 Subject: [PATCH 04/47] Initial implementaion --- web/frontend/src/Job.root.svelte | 16 ++- web/frontend/src/JobFootprint.svelte | 172 +++++++++++++++++++++++++++ 2 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 web/frontend/src/JobFootprint.svelte diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index 93c58733..3d80916f 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -27,6 +27,7 @@ import TagManagement from "./TagManagement.svelte"; import 
MetricSelection from "./MetricSelection.svelte"; import StatsTable from "./StatsTable.svelte"; + import JobFootprint from "./JobFootprint.svelte"; import { getContext } from "svelte"; export let dbid; @@ -132,7 +133,9 @@ let plots = {}, jobTags, - statsTable; + statsTable, + jobFootprint; + $: document.title = $initq.fetching ? "Loading..." : $initq.error @@ -200,6 +203,17 @@ {/if} + {#if $jobMetrics.data} + {#key $jobMetrics.data} + + + + {/key} + {/if} {#if $jobMetrics.data && $initq.data} {#if $initq.data.job.concurrentJobs != null && $initq.data.job.concurrentJobs.items.length != 0} {#if authlevel > roles.manager} diff --git a/web/frontend/src/JobFootprint.svelte b/web/frontend/src/JobFootprint.svelte new file mode 100644 index 00000000..748d6a41 --- /dev/null +++ b/web/frontend/src/JobFootprint.svelte @@ -0,0 +1,172 @@ + + +
+ +
+ + + + From a2c99fb56d0068cf123f3ad9996213eeae944a09 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 16 Nov 2023 15:07:17 +0100 Subject: [PATCH 05/47] Add colors based on thresholds --- web/frontend/src/JobFootprint.svelte | 50 ++++++++++++++++++---------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/web/frontend/src/JobFootprint.svelte b/web/frontend/src/JobFootprint.svelte index 748d6a41..5cd3b043 100644 --- a/web/frontend/src/JobFootprint.svelte +++ b/web/frontend/src/JobFootprint.svelte @@ -29,7 +29,7 @@ export let jobMetrics export let size = 200 - export let displayLegend = true + export let displayLegend = false const footprintMetrics = ['mem_used', 'mem_bw','flops_any', 'cpu_load', 'acc_utilization'] // missing: energy , move to central config before deployment @@ -56,7 +56,7 @@ console.log("MVs", meanVals) - const footprintLabels = meanVals.map((mv) => [mv.name, mv.name+' Threshold']) + const footprintLabels = meanVals.map((mv) => [mv.name, 'Threshold']) const footprintData = meanVals.map((mv) => { const metricConfig = footprintMetricConfigs.find((fmc) => fmc.name === mv.name) @@ -65,35 +65,49 @@ const levelCaution = metricConfig.caution - mv.avg const levelAlert = metricConfig.alert - mv.avg - if (levelAlert > 0) { - return [mv.avg, levelAlert] - } else if (levelCaution > 0) { - return [mv.avg, levelCaution] - } else if (levelNormal > 0) { - return [mv.avg, levelNormal] - } else { - return [mv.avg, levelPeak] + if (mv.name !== 'mem_used') { // Alert if usage is low, peak is high good usage + if (levelAlert > 0) { + return {data: [mv.avg, levelAlert], color: ['hsl(0, 100%, 60%)', '#AAA']} // 'hsl(0, 100%, 35%)' + } else if (levelCaution > 0) { + return {data: [mv.avg, levelCaution], color: ['hsl(56, 100%, 50%)', '#AAA']} // '#d5b60a' + } else if (levelNormal > 0) { + return {data: [mv.avg, levelNormal], color: ['hsl(100, 100%, 60%)', '#AAA']} // 'hsl(100, 100%, 35%)' + } else { + return {data: [mv.avg, levelPeak], color: ['hsl(180, 100%, 60%)', '#AAA']} // 'hsl(180, 100%, 35%)' + } + } else { // Inverse Logic: Alert if usage is high, Peak is bad and limits execution + if (levelPeak > 0 && (levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0)) { + return {data: [mv.avg, levelPeak], color: ['#7F00FF', '#AAA']} // '#5D3FD3' + } else if (levelAlert > 0 && (levelCaution <= 0 && levelNormal <= 0)) { + return {data: [mv.avg, levelAlert], color: ['hsl(0, 100%, 60%)', '#AAA']} // 'hsl(0, 100%, 35%)' + } else if (levelCaution > 0 && levelNormal <= 0) { + return {data: [mv.avg, levelCaution], color: ['hsl(56, 100%, 50%)', '#AAA']} // '#d5b60a' + } else { + return {data: [mv.avg, levelNormal], color: ['hsl(100, 100%, 60%)', '#AAA']} // 'hsl(100, 100%, 35%)' + } } }) + console.log("FPD", footprintData) + $: data = { labels: footprintLabels.flat(), datasets: [ { - backgroundColor: ['#AAA', '#777'], - data: footprintData[0] + backgroundColor: footprintData[0].color, + data: footprintData[0].data }, { - backgroundColor: ['hsl(0, 100%, 60%)', 'hsl(0, 100%, 35%)'], - data: footprintData[1] + backgroundColor: footprintData[1].color, + data: footprintData[1].data }, { - backgroundColor: ['hsl(100, 100%, 60%)', 'hsl(100, 100%, 35%)'], - data: footprintData[2] + backgroundColor: footprintData[2].color, + data: footprintData[2].data }, { - backgroundColor: ['hsl(180, 100%, 60%)', 'hsl(180, 100%, 35%)'], - data: footprintData[3] + backgroundColor: footprintData[3].color, + data: footprintData[3].data } ] } From 8bc43baf2c7c66915be94dc4e41680cbcd36933a Mon Sep 17 00:00:00 
2001 From: Christoph Kluge Date: Thu, 16 Nov 2023 16:45:29 +0100 Subject: [PATCH 06/47] Fix units and labels --- web/frontend/src/JobFootprint.svelte | 39 +++++++++++++++++++++------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/web/frontend/src/JobFootprint.svelte b/web/frontend/src/JobFootprint.svelte index 5cd3b043..5454e59e 100644 --- a/web/frontend/src/JobFootprint.svelte +++ b/web/frontend/src/JobFootprint.svelte @@ -31,7 +31,7 @@ export let size = 200 export let displayLegend = false - const footprintMetrics = ['mem_used', 'mem_bw','flops_any', 'cpu_load', 'acc_utilization'] // missing: energy , move to central config before deployment + const footprintMetrics = ['mem_used', 'mem_bw','flops_any', 'cpu_load'] // 'acc_utilization' / missing: energy , move to central config before deployment const footprintMetricConfigs = footprintMetrics.map((fm) => { return getContext('metrics')(job.cluster, fm) @@ -47,16 +47,25 @@ const meanVals = footprintMetrics.map((fm) => { let jm = jobMetrics.find((jm) => jm.name === fm) + let mv = null if (jm?.metric?.statisticsSeries) { - return {name: jm.name, scope: jm.scope, avg: round(mean(jm.metric.statisticsSeries.mean), 2)} + mv = {name: jm.name, scope: jm.scope, avg: round(mean(jm.metric.statisticsSeries.mean), 2)} } else if (jm?.metric?.series[0]) { - return {name: jm.name, scope: jm.scope, avg: jm.metric.series[0].statistics.avg} + mv = {name: jm.name, scope: jm.scope, avg: jm.metric.series[0].statistics.avg} } + + if (jm?.metric?.unit?.base) { + return {...mv, unit: jm.metric.unit.prefix + jm.metric.unit.base} + } else { + return {...mv, unit: ''} + } + }).filter( Boolean ) console.log("MVs", meanVals) - const footprintLabels = meanVals.map((mv) => [mv.name, 'Threshold']) + const footprintLabels = meanVals.map((mv) => [mv.name, 'Threshold']).flat() + const footprintUnits = meanVals.map((mv) => [mv.unit, mv.unit]).flat() const footprintData = meanVals.map((mv) => { const metricConfig = footprintMetricConfigs.find((fmc) => fmc.name === mv.name) @@ -72,11 +81,15 @@ return {data: [mv.avg, levelCaution], color: ['hsl(56, 100%, 50%)', '#AAA']} // '#d5b60a' } else if (levelNormal > 0) { return {data: [mv.avg, levelNormal], color: ['hsl(100, 100%, 60%)', '#AAA']} // 'hsl(100, 100%, 35%)' - } else { + } else if (levelPeak > 0) { return {data: [mv.avg, levelPeak], color: ['hsl(180, 100%, 60%)', '#AAA']} // 'hsl(180, 100%, 35%)' + } else { // If avg greater than configured peak: render negative diff as zero + return {data: [mv.avg, 0], color: ['hsl(180, 100%, 60%)', '#AAA']} // 'hsl(180, 100%, 35%)' } } else { // Inverse Logic: Alert if usage is high, Peak is bad and limits execution - if (levelPeak > 0 && (levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0)) { + if (levelPeak <= 0 && levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { // If avg greater than configured peak: render negative diff as zero + return {data: [mv.avg, 0], color: ['#7F00FF', '#AAA']} // '#5D3FD3' + } else if (levelPeak > 0 && (levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0)) { return {data: [mv.avg, levelPeak], color: ['#7F00FF', '#AAA']} // '#5D3FD3' } else if (levelAlert > 0 && (levelCaution <= 0 && levelNormal <= 0)) { return {data: [mv.avg, levelAlert], color: ['hsl(0, 100%, 60%)', '#AAA']} // 'hsl(0, 100%, 35%)' @@ -91,7 +104,7 @@ console.log("FPD", footprintData) $: data = { - labels: footprintLabels.flat(), + labels: footprintLabels, datasets: [ { backgroundColor: footprintData[0].color, @@ -157,11 +170,19 @@ callbacks: { label: 
function(context) { const labelIndex = (context.datasetIndex * 2) + context.dataIndex; - return context.chart.data.labels[labelIndex] + ': ' + context.formattedValue; + if (context.chart.data.labels[labelIndex] === 'Threshold') { + return ' -' + context.formattedValue + ' ' + footprintUnits[labelIndex] + } else { + return ' ' + context.formattedValue + ' ' + footprintUnits[labelIndex] + } }, title: function(context) { const labelIndex = (context[0].datasetIndex * 2) + context[0].dataIndex; - return context[0].chart.data.labels[labelIndex]; + if (context[0].chart.data.labels[labelIndex] === 'Threshold') { + return 'Until ' + context[0].chart.data.labels[labelIndex] + } else { + return 'Average ' + context[0].chart.data.labels[labelIndex] + } } } } From 5acd9ece7fdfd9971430b4020808e7e81c79cb84 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 16 Nov 2023 18:31:45 +0100 Subject: [PATCH 07/47] Adds messages to footprint --- web/frontend/src/JobFootprint.svelte | 46 ++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/web/frontend/src/JobFootprint.svelte b/web/frontend/src/JobFootprint.svelte index 5454e59e..fa6b9823 100644 --- a/web/frontend/src/JobFootprint.svelte +++ b/web/frontend/src/JobFootprint.svelte @@ -76,31 +76,45 @@ if (mv.name !== 'mem_used') { // Alert if usage is low, peak is high good usage if (levelAlert > 0) { - return {data: [mv.avg, levelAlert], color: ['hsl(0, 100%, 60%)', '#AAA']} // 'hsl(0, 100%, 35%)' + return {data: [mv.avg, levelAlert], color: ['hsl(0, 100%, 60%)', '#AAA'], valueMessage: 'Metric strongly below recommended level!', thresholdMessage: 'Difference towards caution threshold', impact: 2} // 'hsl(0, 100%, 35%)' } else if (levelCaution > 0) { - return {data: [mv.avg, levelCaution], color: ['hsl(56, 100%, 50%)', '#AAA']} // '#d5b60a' + return {data: [mv.avg, levelCaution], color: ['hsl(56, 100%, 50%)', '#AAA'], valueMessage: 'Metric below recommended level!', thresholdMessage: 'Difference towards normal threshold', impact: 1} // '#d5b60a' } else if (levelNormal > 0) { - return {data: [mv.avg, levelNormal], color: ['hsl(100, 100%, 60%)', '#AAA']} // 'hsl(100, 100%, 35%)' + return {data: [mv.avg, levelNormal], color: ['hsl(100, 100%, 60%)', '#AAA'], valueMessage: 'Metric within recommended level!', thresholdMessage: 'Difference towards peak threshold', impact: 0} // 'hsl(100, 100%, 35%)' } else if (levelPeak > 0) { - return {data: [mv.avg, levelPeak], color: ['hsl(180, 100%, 60%)', '#AAA']} // 'hsl(180, 100%, 35%)' + return {data: [mv.avg, levelPeak], color: ['hsl(180, 100%, 60%)', '#AAA'], valueMessage: 'Metric above recommended level!', thresholdMessage: 'Difference towards maximum', impact: 0} // 'hsl(180, 100%, 35%)' } else { // If avg greater than configured peak: render negative diff as zero - return {data: [mv.avg, 0], color: ['hsl(180, 100%, 60%)', '#AAA']} // 'hsl(180, 100%, 35%)' + return {data: [mv.avg, 0], color: ['hsl(180, 100%, 60%)', '#AAA'], valueMessage: 'Metric above recommended level!', thresholdMessage: 'Maximum reached!', impact: 0} // 'hsl(180, 100%, 35%)' } } else { // Inverse Logic: Alert if usage is high, Peak is bad and limits execution if (levelPeak <= 0 && levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { // If avg greater than configured peak: render negative diff as zero - return {data: [mv.avg, 0], color: ['#7F00FF', '#AAA']} // '#5D3FD3' + return {data: [mv.avg, 0], color: ['#7F00FF', '#AAA'], valueMessage: 'Memory usage at maximum!', thresholdMessage: 'Maximum reached!', 
impact: 4} // '#5D3FD3' } else if (levelPeak > 0 && (levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0)) { - return {data: [mv.avg, levelPeak], color: ['#7F00FF', '#AAA']} // '#5D3FD3' + return {data: [mv.avg, levelPeak], color: ['#7F00FF', '#AAA'], valueMessage: 'Memory usage extremely above recommended level!', thresholdMessage: 'Difference towards maximum', impact: 2} // '#5D3FD3' } else if (levelAlert > 0 && (levelCaution <= 0 && levelNormal <= 0)) { - return {data: [mv.avg, levelAlert], color: ['hsl(0, 100%, 60%)', '#AAA']} // 'hsl(0, 100%, 35%)' + return {data: [mv.avg, levelAlert], color: ['hsl(0, 100%, 60%)', '#AAA'], valueMessage: 'Memory usage strongly above recommended level!', thresholdMessage: 'Difference towards peak threshold', impact: 2} // 'hsl(0, 100%, 35%)' } else if (levelCaution > 0 && levelNormal <= 0) { - return {data: [mv.avg, levelCaution], color: ['hsl(56, 100%, 50%)', '#AAA']} // '#d5b60a' + return {data: [mv.avg, levelCaution], color: ['hsl(56, 100%, 50%)', '#AAA'], valueMessage: 'Memory usage above recommended level!', thresholdMessage: 'Difference towards alert threshold', impact: 1} // '#d5b60a' } else { - return {data: [mv.avg, levelNormal], color: ['hsl(100, 100%, 60%)', '#AAA']} // 'hsl(100, 100%, 35%)' + return {data: [mv.avg, levelNormal], color: ['hsl(100, 100%, 60%)', '#AAA'], valueMessage: 'Memory usage within recommended level!', thresholdMessage: 'Difference towards caution threshold', impact: 0} // 'hsl(100, 100%, 35%)' } } }) + const footprintMessages = footprintData.map((fpd) => [fpd.valueMessage, fpd.thresholdMessage]).flat() + const footprintResultSum = footprintData.map((fpd) => fpd.impact).reduce((accumulator, currentValue) => { return accumulator + currentValue }, 0) + let footprintResult = '' + + if (footprintResultSum <= 1) { + footprintResult = 'good.' + } else if (footprintResultSum > 1 && footprintResultSum <= 3) { + footprintResult = 'well.' + } else if (footprintResultSum > 3 && footprintResultSum <= 5) { + footprintResult = 'acceptable.' + } else { + footprintResult = 'bad.' + } + console.log("FPD", footprintData) $: data = { @@ -183,6 +197,14 @@ } else { return 'Average ' + context[0].chart.data.labels[labelIndex] } + }, + footer: function(context) { + const labelIndex = (context[0].datasetIndex * 2) + context[0].dataIndex; + if (context[0].chart.data.labels[labelIndex] === 'Threshold') { + return footprintMessages[labelIndex] + } else { + return footprintMessages[labelIndex] + } } } } @@ -194,6 +216,10 @@
+
+ Overall Job Performance: Your job {job.state === 'running' ? 'performs' : 'performed'} {footprintResult} +
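The verdict interpolated above is derived by summing the per-metric impact scores; condensed from the logic this patch adds earlier in the file:

const footprintResultSum = footprintData
  .map((fpd) => fpd.impact)
  .reduce((accumulator, currentValue) => accumulator + currentValue, 0);

let footprintResult = 'bad.';                                // sum > 5
if (footprintResultSum <= 1) footprintResult = 'good.';
else if (footprintResultSum <= 3) footprintResult = 'well.';
else if (footprintResultSum <= 5) footprintResult = 'acceptable.';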
+ + From 8d409eed0f6e77c5d77f253fbade1a98098cee52 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 20 Nov 2023 17:53:12 +0100 Subject: [PATCH 12/47] Footprint in jobList as selectable --- web/frontend/src/Job.root.svelte | 11 +++++++--- web/frontend/src/JobFootprintBars.svelte | 9 +++++--- web/frontend/src/Jobs.root.svelte | 8 +++++--- web/frontend/src/MetricSelection.svelte | 8 ++++++++ web/frontend/src/joblist/JobList.svelte | 12 ++++++++++- web/frontend/src/joblist/Row.svelte | 26 ++++++++++++++++++++++++ 6 files changed, 64 insertions(+), 10 deletions(-) diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index 3d80916f..da09841b 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -27,7 +27,7 @@ import TagManagement from "./TagManagement.svelte"; import MetricSelection from "./MetricSelection.svelte"; import StatsTable from "./StatsTable.svelte"; - import JobFootprint from "./JobFootprint.svelte"; + import JobFootprintBars from "./JobFootprintBars.svelte"; import { getContext } from "svelte"; export let dbid; @@ -135,7 +135,7 @@ jobTags, statsTable, jobFootprint; - + $: document.title = $initq.fetching ? "Loading..." : $initq.error @@ -206,7 +206,12 @@ {#if $jobMetrics.data} {#key $jobMetrics.data} - --> + footprintMetrics.includes(jm.name))) + // console.log('JMs', jobMetrics.filter((jm) => footprintMetrics.includes(jm.name))) const footprintMetricConfigs = footprintMetrics.map((fm) => { return getContext('metrics')(job.cluster, fm) }).filter( Boolean ) // Filter only "truthy" vals, see: https://stackoverflow.com/questions/28607451/removing-undefined-values-from-array - console.log("FMCs", footprintMetricConfigs) + // console.log("FMCs", footprintMetricConfigs) // const footprintMetricThresholds = footprintMetricConfigs.map((fmc) => { // Only required if scopes smaller than node required // return {name: fmc.name, ...findThresholds(fmc, 'node', job?.subCluster ? job.subCluster : '')} // Merge 2 objects @@ -149,16 +150,18 @@ } }).filter( Boolean ) - console.log("FPD", footprintData) + // console.log("FPD", footprintData) + {#if view === 'job'} Core Metrics Footprint + {/if} {#each footprintData as fpd}
diff --git a/web/frontend/src/Jobs.root.svelte b/web/frontend/src/Jobs.root.svelte index 07094b87..ffad9df8 100644 --- a/web/frontend/src/Jobs.root.svelte +++ b/web/frontend/src/Jobs.root.svelte @@ -19,7 +19,7 @@ let filterComponent; // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the let jobList, matchedJobs = null - let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false, isMetricsSelectionOpen = false + let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false, isMetricsSelectionOpen = false, showFootprint let metrics = filterPresets.cluster ? ccconfig[`plot_list_selectedMetrics:${filterPresets.cluster}`] || ccconfig.plot_list_selectedMetrics : ccconfig.plot_list_selectedMetrics @@ -81,7 +81,8 @@ bind:metrics={metrics} bind:sorting={sorting} bind:matchedJobs={matchedJobs} - bind:this={jobList} /> + bind:this={jobList} + bind:showFootprint={showFootprint} /> @@ -93,4 +94,5 @@ bind:cluster={selectedCluster} configName="plot_list_selectedMetrics" bind:metrics={metrics} - bind:isOpen={isMetricsSelectionOpen} /> + bind:isOpen={isMetricsSelectionOpen} + bind:showFootprint={showFootprint}/> diff --git a/web/frontend/src/MetricSelection.svelte b/web/frontend/src/MetricSelection.svelte index 59fe2639..63101d4f 100644 --- a/web/frontend/src/MetricSelection.svelte +++ b/web/frontend/src/MetricSelection.svelte @@ -17,12 +17,14 @@ export let configName export let allMetrics = null export let cluster = null + export let showFootprint const clusters = getContext('clusters'), onInit = getContext('on-init') let newMetricsOrder = [] let unorderedMetrics = [...metrics] + let pendingShowFootprint = showFootprint || false onInit(() => { if (allMetrics == null) allMetrics = new Set() @@ -90,6 +92,8 @@ metrics = newMetricsOrder.filter(m => unorderedMetrics.includes(m)) isOpen = false + showFootprint = pendingShowFootprint ? true : false + updateConfigurationMutation({ name: cluster == null ? configName : `${configName}:${cluster}`, value: JSON.stringify(metrics) @@ -121,6 +125,10 @@ +
+ Show Footprint
+
    {#each newMetricsOrder as metric, index (metric)}
  • Job Info + {#if showFootprint} + + Job Footprint + + {/if} {#each metrics as metric (metric)} {:else if $jobs.data && $initialized} {#each $jobs.data.jobs.items as job (job)} - + {:else} diff --git a/web/frontend/src/joblist/Row.svelte b/web/frontend/src/joblist/Row.svelte index 6573b57d..bae86d84 100644 --- a/web/frontend/src/joblist/Row.svelte +++ b/web/frontend/src/joblist/Row.svelte @@ -14,16 +14,23 @@ import { Card, Spinner } from "sveltestrap"; import MetricPlot from "../plots/MetricPlot.svelte"; import JobInfo from "./JobInfo.svelte"; + import JobFootprint from "../JobFootprint.svelte"; + import JobFootprintBars from "../JobFootprintBars.svelte"; import { maxScope, checkMetricDisabled } from "../utils.js"; export let job; export let metrics; export let plotWidth; export let plotHeight = 275; + export let showFootprint; let { id } = job; let scopes = [job.numNodes == 1 ? "core" : "node"]; + function distinct(value, index, array) { + return array.indexOf(value) === index; + } + const cluster = getContext("clusters").find((c) => c.name == job.cluster); const metricConfig = getContext("metrics"); // Get all MetricConfs which include subCluster-specific settings for this job const client = getContextClient(); @@ -64,6 +71,10 @@ variables: { id, metrics, scopes } }); + $: if (showFootprint) { + metrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', ...metrics].filter(distinct) + } + export function refresh() { metricsQuery = queryStore({ client: client, @@ -122,6 +133,21 @@ {:else} + {#if showFootprint} + + + + + {/if} {#each sortAndSelectScope($metricsQuery.data.jobMetrics) as metric, i (metric || i)} From f8f900151af502b73ad5454f12160a9fc13936cc Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 20 Nov 2023 18:08:33 +0100 Subject: [PATCH 13/47] Fix width, spacing, render --- web/frontend/src/JobFootprintBars.svelte | 5 ++--- web/frontend/src/joblist/JobList.svelte | 17 ++++++++++++----- web/frontend/src/joblist/Row.svelte | 14 +++++++++----- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/web/frontend/src/JobFootprintBars.svelte b/web/frontend/src/JobFootprintBars.svelte index d5ba081a..36818ef8 100644 --- a/web/frontend/src/JobFootprintBars.svelte +++ b/web/frontend/src/JobFootprintBars.svelte @@ -18,8 +18,7 @@ export let job export let jobMetrics export let view = 'job' - - // export let size = 200 + export let width = 200 const footprintMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] // 'acc_utilization' / missing: energy , move to central config before deployment @@ -154,7 +153,7 @@ - + {#if view === 'job'} diff --git a/web/frontend/src/joblist/JobList.svelte b/web/frontend/src/joblist/JobList.svelte index e6acaf8d..698b9ca6 100644 --- a/web/frontend/src/joblist/JobList.svelte +++ b/web/frontend/src/joblist/JobList.svelte @@ -28,7 +28,7 @@ export let sorting = { field: "startTime", order: "DESC" }; export let matchedJobs = 0; export let metrics = ccconfig.plot_list_selectedMetrics; - export let showFootprint; + export let showFootprint = false; let itemsPerPage = ccconfig.plot_list_jobsPerPage; let page = 1; @@ -135,12 +135,19 @@ }) }; + let plotWidth = null; let tableWidth = null; let jobInfoColumnWidth = 250; - $: plotWidth = Math.floor( - (tableWidth - jobInfoColumnWidth) / metrics.length - 10 - ); + $: if (showFootprint) { + plotWidth = Math.floor( + (tableWidth - jobInfoColumnWidth) / (metrics.length + 1) - 10 + ) + } else { + plotWidth = Math.floor( + (tableWidth - jobInfoColumnWidth) / metrics.length - 10 + ) + } let headerPaddingTop = 
0; stickyHeader( @@ -165,7 +172,7 @@ Job Footprint diff --git a/web/frontend/src/joblist/Row.svelte b/web/frontend/src/joblist/Row.svelte index bae86d84..61d8cb61 100644 --- a/web/frontend/src/joblist/Row.svelte +++ b/web/frontend/src/joblist/Row.svelte @@ -35,8 +35,8 @@ const metricConfig = getContext("metrics"); // Get all MetricConfs which include subCluster-specific settings for this job const client = getContextClient(); const query = gql` - query ($id: ID!, $metrics: [String!]!, $scopes: [MetricScope!]!) { - jobMetrics(id: $id, metrics: $metrics, scopes: $scopes) { + query ($id: ID!, $queryMetrics: [String!]!, $scopes: [MetricScope!]!) { + jobMetrics(id: $id, metrics: $queryMetrics, scopes: $scopes) { name scope metric { @@ -68,18 +68,21 @@ $: metricsQuery = queryStore({ client: client, query: query, - variables: { id, metrics, scopes } + variables: { id, queryMetrics, scopes } }); + let queryMetrics = null $: if (showFootprint) { - metrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', ...metrics].filter(distinct) + queryMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', ...metrics].filter(distinct) + } else { + queryMetrics = [...metrics] } export function refresh() { metricsQuery = queryStore({ client: client, query: query, - variables: { id, metrics, scopes }, + variables: { id, queryMetrics, scopes }, // requestPolicy: 'network-only' // use default cache-first for refresh }); } @@ -144,6 +147,7 @@ From dc860f8fd903c2a0d8f52434ac1af0d5b50aa877 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Tue, 21 Nov 2023 10:27:16 +0100 Subject: [PATCH 14/47] Handle artifacts, fix single node footprint flops --- web/frontend/src/JobFootprintBars.svelte | 82 +++++++++++++++++++++--- web/frontend/src/joblist/Row.svelte | 2 + 2 files changed, 74 insertions(+), 10 deletions(-) diff --git a/web/frontend/src/JobFootprintBars.svelte b/web/frontend/src/JobFootprintBars.svelte index 36818ef8..29486134 100644 --- a/web/frontend/src/JobFootprintBars.svelte +++ b/web/frontend/src/JobFootprintBars.svelte @@ -1,9 +1,6 @@ - + {#if view === 'job'} @@ -172,6 +226,10 @@ {:else if fpd.impact === 2} + {:else if fpd.impact === -1} + + {:else if fpd.impact === -2} + {/if} {#if fpd.impact === 4} @@ -184,6 +242,10 @@ {:else if fpd.impact === 0} + {:else if fpd.impact === -1} + + {:else if fpd.impact === -2} + {/if}
  • diff --git a/web/frontend/src/joblist/Row.svelte b/web/frontend/src/joblist/Row.svelte index 61d8cb61..359f263d 100644 --- a/web/frontend/src/joblist/Row.svelte +++ b/web/frontend/src/joblist/Row.svelte @@ -74,8 +74,10 @@ let queryMetrics = null $: if (showFootprint) { queryMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', ...metrics].filter(distinct) + scopes = ["node"] } else { queryMetrics = [...metrics] + scopes = [job.numNodes == 1 ? "core" : "node"] } export function refresh() { From f342a65aba12d591cbb6f1cb60aa6c09999ae1c5 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Tue, 21 Nov 2023 15:38:28 +0100 Subject: [PATCH 15/47] Adds persistance to showfootprint selection --- web/frontend/src/Jobs.root.svelte | 5 ++++- web/frontend/src/MetricSelection.svelte | 14 ++++++++++++-- web/frontend/src/joblist/JobList.svelte | 2 +- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/web/frontend/src/Jobs.root.svelte b/web/frontend/src/Jobs.root.svelte index ffad9df8..2f2f9dc5 100644 --- a/web/frontend/src/Jobs.root.svelte +++ b/web/frontend/src/Jobs.root.svelte @@ -19,10 +19,13 @@ let filterComponent; // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the let jobList, matchedJobs = null - let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false, isMetricsSelectionOpen = false, showFootprint + let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false, isMetricsSelectionOpen = false let metrics = filterPresets.cluster ? ccconfig[`plot_list_selectedMetrics:${filterPresets.cluster}`] || ccconfig.plot_list_selectedMetrics : ccconfig.plot_list_selectedMetrics + let showFootprint = filterPresets.cluster + ? !!ccconfig[`plot_list_showFootprint:${filterPresets.cluster}`] + : !!ccconfig.plot_list_showFootprint let selectedCluster = filterPresets?.cluster ? filterPresets.cluster : null // The filterPresets are handled by the Filters component, diff --git a/web/frontend/src/MetricSelection.svelte b/web/frontend/src/MetricSelection.svelte index 63101d4f..5b54ba8b 100644 --- a/web/frontend/src/MetricSelection.svelte +++ b/web/frontend/src/MetricSelection.svelte @@ -24,7 +24,7 @@ let newMetricsOrder = [] let unorderedMetrics = [...metrics] - let pendingShowFootprint = showFootprint || false + let pendingShowFootprint = !!showFootprint onInit(() => { if (allMetrics == null) allMetrics = new Set() @@ -92,7 +92,7 @@ metrics = newMetricsOrder.filter(m => unorderedMetrics.includes(m)) isOpen = false - showFootprint = pendingShowFootprint ? true : false + showFootprint = !!pendingShowFootprint updateConfigurationMutation({ name: cluster == null ? configName : `${configName}:${cluster}`, @@ -103,6 +103,16 @@ // console.log('Error on subscription: ' + res.error) } }) + + updateConfigurationMutation({ + name: cluster == null ? 
'plot_list_showFootprint' : `plot_list_showFootprint:${cluster}`, + value: JSON.stringify(showFootprint) + }).subscribe(res => { + if (res.fetching === false && res.error) { + console.log('Error on footprint subscription: ' + res.error) + throw res.error + } + }) } diff --git a/web/frontend/src/joblist/JobList.svelte b/web/frontend/src/joblist/JobList.svelte index 698b9ca6..80363616 100644 --- a/web/frontend/src/joblist/JobList.svelte +++ b/web/frontend/src/joblist/JobList.svelte @@ -28,7 +28,7 @@ export let sorting = { field: "startTime", order: "DESC" }; export let matchedJobs = 0; export let metrics = ccconfig.plot_list_selectedMetrics; - export let showFootprint = false; + export let showFootprint; let itemsPerPage = ccconfig.plot_list_jobsPerPage; let page = 1; From 6b78b4e12bc4280f9380e512413fee4b34ea256d Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Tue, 21 Nov 2023 15:38:57 +0100 Subject: [PATCH 16/47] Adds message display in jobView --- web/frontend/src/JobFootprintBars.svelte | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/web/frontend/src/JobFootprintBars.svelte b/web/frontend/src/JobFootprintBars.svelte index 29486134..c21e2b2e 100644 --- a/web/frontend/src/JobFootprintBars.svelte +++ b/web/frontend/src/JobFootprintBars.svelte @@ -262,6 +262,14 @@ />
    {/each} + {#if job?.metaData?.message} +
    +
    +
    + Note: {job.metaData.message} +
    +
    + {/if}
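Patch 17 below swaps this plain interpolation for Svelte's {@html} tag. The practical difference, sketched with a hypothetical message value (note that {@html} does not sanitize, so the metadata must come from a trusted source):

<!-- Escaped: a message of "<b>check node load</b>" shows the tags literally -->
Note: {job.metaData.message}

<!-- Raw: the same message renders as bold text -->
{@html job.metaData.message}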
    From 709880ff5a074dede35ef5c9e051e4fbf0dbc3c9 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Wed, 22 Nov 2023 10:53:18 +0100 Subject: [PATCH 17/47] Use html tag for metadata message - remove old footprint version based on chartjs pie --- web/frontend/src/Job.root.svelte | 9 +- web/frontend/src/JobFootprint.svelte | 447 +++++++++++------------ web/frontend/src/JobFootprintBars.svelte | 281 -------------- web/frontend/src/joblist/Row.svelte | 9 +- 4 files changed, 223 insertions(+), 523 deletions(-) delete mode 100644 web/frontend/src/JobFootprintBars.svelte diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index da09841b..7bd40f81 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -27,7 +27,7 @@ import TagManagement from "./TagManagement.svelte"; import MetricSelection from "./MetricSelection.svelte"; import StatsTable from "./StatsTable.svelte"; - import JobFootprintBars from "./JobFootprintBars.svelte"; + import JobFootprint from "./JobFootprint.svelte"; import { getContext } from "svelte"; export let dbid; @@ -206,12 +206,7 @@ {#if $jobMetrics.data} {#key $jobMetrics.data} - - import { getContext } from 'svelte' - // import { Button, Table, InputGroup, InputGroupText, Icon } from 'sveltestrap' + import { + Card, + CardHeader, + CardTitle, + CardBody, + Progress, + Icon, + } from "sveltestrap"; import { mean, round } from 'mathjs' // import { findThresholds } from './plots/MetricPlot.svelte' - // import { formatNumber } from './units.js' - - import { Pie } from 'svelte-chartjs'; - import { - Chart as ChartJS, - Title, - Tooltip, - Legend, - Filler, - ArcElement, - CategoryScale - } from 'chart.js'; - - ChartJS.register( - Title, - Tooltip, - Legend, - Filler, - ArcElement, - CategoryScale - ); + // import { formatNumber, scaleNumbers } from './units.js' export let job export let jobMetrics + export let view = 'job' + export let width = 'auto' - export let size = 200 - export let displayLegend = false + // console.log('CLUSTER', job.cluster) - const footprintMetrics = ['mem_used', 'mem_bw','flops_any', 'cpu_load'] // 'acc_utilization' / missing: energy , move to central config before deployment + const footprintMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] // 'acc_utilization' / missing: energy , move to central config before deployment - console.log('JMs', jobMetrics.filter((jm) => footprintMetrics.includes(jm.name))) + // console.log('JMs', jobMetrics.filter((jm) => footprintMetrics.includes(jm.name))) const footprintMetricConfigs = footprintMetrics.map((fm) => { return getContext('metrics')(job.cluster, fm) }).filter( Boolean ) // Filter only "truthy" vals, see: https://stackoverflow.com/questions/28607451/removing-undefined-values-from-array - console.log("FMCs", footprintMetricConfigs) + // console.log("FMCs", footprintMetricConfigs) // const footprintMetricThresholds = footprintMetricConfigs.map((fmc) => { // Only required if scopes smaller than node required // return {name: fmc.name, ...findThresholds(fmc, 'node', job?.subCluster ? job.subCluster : '')} // Merge 2 objects @@ -47,239 +35,244 @@ // console.log("FMTs", footprintMetricThresholds) - const meanVals = footprintMetrics.map((fm) => { - let jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === 'node') // Only Node Scope + const footprintData = footprintMetrics.map((fm) => { + const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === 'node') + // ... 
get Mean let mv = null if (jm?.metric?.statisticsSeries) { - mv = {name: jm.name, avg: round(mean(jm.metric.statisticsSeries.mean), 2)} + mv = round(mean(jm.metric.statisticsSeries.mean), 2) } else if (jm?.metric?.series[0]) { - mv = {name: jm.name, avg: jm.metric.series[0].statistics.avg} + mv = jm.metric.series[0].statistics.avg } - + // ... get Unit + let unit = null if (jm?.metric?.unit?.base) { - return {...mv, unit: jm.metric.unit.prefix + jm.metric.unit.base} + unit = jm.metric.unit.prefix + jm.metric.unit.base } else { - return {...mv, unit: ''} + unit = '' } - - }).filter( Boolean ) - - console.log("MVs", meanVals) - - const footprintData = meanVals.map((mv) => { - const metricConfig = footprintMetricConfigs.find((fmc) => fmc.name === mv.name) - const levelPeak = metricConfig.peak - mv.avg - const levelNormal = metricConfig.normal - mv.avg - const levelCaution = metricConfig.caution - mv.avg - const levelAlert = metricConfig.alert - mv.avg - - if (mv.name !== 'mem_used') { // Alert if usage is low, peak is high good usage + // From MetricConfig: Scope only for scaling -> Not of interest here + const metricConfig = footprintMetricConfigs.find((fmc) => fmc.name === fm) + // ... get Thresholds + const levelPeak = fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) - mv : metricConfig.peak - mv // Scale flops_any down + const levelNormal = metricConfig.normal - mv + const levelCaution = metricConfig.caution - mv + const levelAlert = metricConfig.alert - mv + // Collect + if (fm !== 'mem_used') { // Alert if usage is low, peak as maxmimum possible (scaled down for flops_any) if (levelAlert > 0) { return { - data: [mv.avg, levelAlert], - color: ['hsl(0, 100%, 60%)', '#AAA'], - messages: ['Metric strongly below recommended levels!', 'Difference towards acceptable performace'], - impact: 2 - } // 'hsl(0, 100%, 35%)' + name: fm, + unit: unit, + avg: mv, + max: fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) : metricConfig.peak, + color: 'danger', + message: 'Metric strongly below common levels!', + impact: 3 + } } else if (levelCaution > 0) { return { - data: [mv.avg, levelCaution], - color: ['hsl(56, 100%, 50%)', '#AAA'], - messages: ['Metric below recommended levels', 'Difference towards normal performance'], - impact: 1 - } // '#d5b60a' + name: fm, + unit: unit, + avg: mv, + max: fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) : metricConfig.peak, + color: 'warning', + message: 'Metric below common levels', + impact: 2 + } } else if (levelNormal > 0) { return { - data: [mv.avg, levelNormal], - color: ['hsl(100, 100%, 60%)', '#AAA'], - messages: ['Metric within recommended levels', 'Difference towards optimal performance'], - impact: 0 - } // 'hsl(100, 100%, 35%)' + name: fm, + unit: unit, + avg: mv, + max: fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) : metricConfig.peak, + color: 'success', + message: 'Metric within common levels', + impact: 1 + } } else if (levelPeak > 0) { return { - data: [mv.avg, levelPeak], - color: ['hsl(180, 100%, 60%)', '#AAA'], - messages: ['Metric performs better than recommended levels', 'Difference towards maximum capacity'], // "Perfomrs optimal"? - impact: 0 - } // 'hsl(180, 100%, 35%)' - } else { // If avg greater than configured peak: render negative diff as zero - return { - data: [mv.avg, 0], - color: ['hsl(180, 100%, 60%)', '#AAA'], - messages: ['Metric performs at maximum capacity', 'Maximum reached'], + name: fm, + unit: unit, + avg: mv, + max: fm === 'flops_any' ? 
round((metricConfig.peak * 0.85), 0) : metricConfig.peak, + color: 'info', + message: 'Metric performs better than common levels', impact: 0 - } // 'hsl(180, 100%, 35%)' + } + } else { // Possible artifacts - <5% Margin OK, >5% warning, > 50% danger + const checkData = { + name: fm, + unit: unit, + avg: mv, + max: fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) : metricConfig.peak + } + + if (checkData.avg >= (1.5 * checkData.max)) { + return { + ...checkData, + color: 'danger', + message: 'Metric average at least 50% above common peak value: Check data for artifacts!', + impact: -2 + } + } else if (checkData.avg >= (1.05 * checkData.max)) { + return { + ...checkData, + color: 'warning', + message: 'Metric average at least 5% above common peak value: Check data for artifacts', + impact: -1 + } + } else { + return { + ...checkData, + color: 'info', + message: 'Metric performs better than common levels', + impact: 0 + } + } } } else { // Inverse Logic: Alert if usage is high, Peak is bad and limits execution - if (levelPeak <= 0 && levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { // If avg greater than configured peak: render negative diff as zero + if (levelPeak <= 0 && levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { // Possible artifacts - <5% Margin OK, >5% warning, > 50% danger + const checkData = { + name: fm, + unit: unit, + avg: mv, + max: metricConfig.peak + } + if (checkData.avg >= (1.5 * checkData.max)) { + return { + ...checkData, + color: 'danger', + message: 'Memory usage at least 50% above possible maximum value: Check data for artifacts!', + impact: -2 + } + } else if (checkData.avg >= (1.05 * checkData.max)) { + return { + ...checkData, + color: 'warning', + message: 'Memory usage at least 5% above possible maximum value: Check data for artifacts!', + impact: -1 + } + } else { + return { + ...checkData, + color: 'danger', + message: 'Memory usage extremely above common levels!', + impact: 4 + } + } + } else if (levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { return { - data: [mv.avg, 0], - color: ['#7F00FF', '#AAA'], - messages: ['Memory usage at maximum capacity!', 'Maximum reached'], + name: fm, + unit: unit, + avg: mv, + max: metricConfig.peak, + color: 'danger', + message: 'Memory usage extremely above common levels!', impact: 4 - } // '#5D3FD3' - } else if (levelPeak > 0 && (levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0)) { - return { - data: [mv.avg, levelPeak], - color: ['#7F00FF', '#AAA'], - messages: ['Memory usage extremely above recommended levels!', 'Difference towards maximum memory capacity'], - impact: 2 - } // '#5D3FD3' + } } else if (levelAlert > 0 && (levelCaution <= 0 && levelNormal <= 0)) { return { - data: [mv.avg, levelAlert], - color: ['hsl(0, 100%, 60%)', '#AAA'], - messages: ['Memory usage strongly above recommended levels!', 'Difference towards highly alerting memory usage'], - impact: 2 - } // 'hsl(0, 100%, 35%)' + name: fm, + unit: unit, + avg: mv, + max: metricConfig.peak, + color: 'danger', + message: 'Memory usage strongly above common levels!', + impact: 3 + } } else if (levelCaution > 0 && levelNormal <= 0) { return { - data: [mv.avg, levelCaution], - color: ['hsl(56, 100%, 50%)', '#AAA'], - messages: ['Memory usage above recommended levels', 'Difference towards alerting memory usage'], - impact: 1 - } // '#d5b60a' + name: fm, + unit: unit, + avg: mv, + max: metricConfig.peak, + color: 'warning', + message: 'Memory usage above common levels', + impact: 2 + } } else { return { - data: 
[mv.avg, levelNormal], - color: ['hsl(100, 100%, 60%)', '#AAA'], - messages: ['Memory usage within recommended levels', 'Difference towards increased memory usage'], - impact: 0 - } // 'hsl(100, 100%, 35%)' - } - } - }) - - console.log("FPD", footprintData) - - // Collect data for chartjs - const footprintLabels = meanVals.map((mv) => [mv.name, 'Threshold']).flat() - const footprintUnits = meanVals.map((mv) => [mv.unit, mv.unit]).flat() - const footprintMessages = footprintData.map((fpd) => fpd.messages).flat() - const footprintResultSum = footprintData.map((fpd) => fpd.impact).reduce((accumulator, currentValue) => { return accumulator + currentValue }, 0) - let footprintResult = '' - - if (footprintResultSum <= 1) { - footprintResult = 'good' - } else if (footprintResultSum > 1 && footprintResultSum <= 3) { - footprintResult = 'well' - } else if (footprintResultSum > 3 && footprintResultSum <= 5) { - footprintResult = 'acceptable' - } else { - footprintResult = 'badly' - } - - $: data = { - labels: footprintLabels, - datasets: [ - { - backgroundColor: footprintData[0].color, - data: footprintData[0].data - }, - { - backgroundColor: footprintData[1].color, - data: footprintData[1].data - }, - { - backgroundColor: footprintData[2].color, - data: footprintData[2].data - }, - { - backgroundColor: footprintData[3].color, - data: footprintData[3].data - } - ] - } - - const options = { - maintainAspectRatio: false, - animation: false, - plugins: { - legend: { - display: displayLegend, - labels: { // see: https://www.chartjs.org/docs/latest/samples/other-charts/multi-series-pie.html - generateLabels: function(chart) { - // Get the default label list - const original = ChartJS.overrides.pie.plugins.legend.labels.generateLabels; - const labelsOriginal = original.call(this, chart); - - // Build an array of colors used in the datasets of the chart - let datasetColors = chart.data.datasets.map(function(e) { - return e.backgroundColor; - }); - datasetColors = datasetColors.flat(); - - // Modify the color and hide state of each label - labelsOriginal.forEach(label => { - // There are twice as many labels as there are datasets. 
This converts the label index into the corresponding dataset index - label.datasetIndex = (label.index - label.index % 2) / 2; - - // The hidden state must match the dataset's hidden state - label.hidden = !chart.isDatasetVisible(label.datasetIndex); - - // Change the color to match the dataset - label.fillStyle = datasetColors[label.index]; - }); - - return labelsOriginal; - } - }, - onClick: function(mouseEvent, legendItem, legend) { - // toggle the visibility of the dataset from what it currently is - legend.chart.getDatasetMeta( - legendItem.datasetIndex - ).hidden = legend.chart.isDatasetVisible(legendItem.datasetIndex); - legend.chart.update(); - } - }, - tooltip: { - callbacks: { - label: function(context) { - const labelIndex = (context.datasetIndex * 2) + context.dataIndex; - if (context.chart.data.labels[labelIndex] === 'Threshold') { - return ' -' + context.formattedValue + ' ' + footprintUnits[labelIndex] - } else { - return ' ' + context.formattedValue + ' ' + footprintUnits[labelIndex] - } - }, - title: function(context) { - const labelIndex = (context[0].datasetIndex * 2) + context[0].dataIndex; - if (context[0].chart.data.labels[labelIndex] === 'Threshold') { - return 'Until ' + context[0].chart.data.labels[labelIndex] - } else { - return 'Average ' + context[0].chart.data.labels[labelIndex] - } - }, - footer: function(context) { - const labelIndex = (context[0].datasetIndex * 2) + context[0].dataIndex; - if (context[0].chart.data.labels[labelIndex] === 'Threshold') { - return footprintMessages[labelIndex] - } else { - return footprintMessages[labelIndex] - } - } + name: fm, + unit: unit, + avg: mv, + max: metricConfig.peak, + color: 'success', + message: 'Memory usage within common levels', + impact: 1 } } } - } + }).filter( Boolean ) - + // console.log("FPD", footprintData) -
    - -
    -
    - Overall Job Performance:  Your job {job.state === 'running' ? 'performs' : 'performed'} {footprintResult}. -
    + + + {#if view === 'job'} + + + Core Metrics Footprint + + + {/if} + + {#each footprintData as fpd} +
    +
    {fpd.name}
    +
    +
    + + {#if fpd.impact === 3} + + {:else if fpd.impact === 2} + + {:else if fpd.impact === -1} + + {:else if fpd.impact === -2} + + {/if} + + {#if fpd.impact === 4} + + {:else if fpd.impact === 3} + + {:else if fpd.impact === 2} + + {:else if fpd.impact === 1} + + {:else if fpd.impact === 0} + + {:else if fpd.impact === -1} + + {:else if fpd.impact === -2} + + {/if} +
    +
    + + {fpd.avg} / {fpd.max} {fpd.unit} +
    +
    +
    +
    + +
    + {/each} + {#if job?.metaData?.message} +
    + {@html job.metaData.message} + {/if} +
    +
    - - diff --git a/web/frontend/src/JobFootprintBars.svelte b/web/frontend/src/JobFootprintBars.svelte deleted file mode 100644 index c21e2b2e..00000000 --- a/web/frontend/src/JobFootprintBars.svelte +++ /dev/null @@ -1,281 +0,0 @@ - - - - {#if view === 'job'} - - - Core Metrics Footprint - - - {/if} - - {#each footprintData as fpd} -
    -
    {fpd.name}
    -
    -
    - - {#if fpd.impact === 3} - - {:else if fpd.impact === 2} - - {:else if fpd.impact === -1} - - {:else if fpd.impact === -2} - - {/if} - - {#if fpd.impact === 4} - - {:else if fpd.impact === 3} - - {:else if fpd.impact === 2} - - {:else if fpd.impact === 1} - - {:else if fpd.impact === 0} - - {:else if fpd.impact === -1} - - {:else if fpd.impact === -2} - - {/if} -
    -
    - - {fpd.avg} / {fpd.max} {fpd.unit} -
    -
    -
    -
    - -
    - {/each} - {#if job?.metaData?.message} -
    -
    -
    - Note: {job.metaData.message} -
    -
    - {/if} -
    -
    - - - diff --git a/web/frontend/src/joblist/Row.svelte b/web/frontend/src/joblist/Row.svelte index 359f263d..3ecfc51e 100644 --- a/web/frontend/src/joblist/Row.svelte +++ b/web/frontend/src/joblist/Row.svelte @@ -15,7 +15,6 @@ import MetricPlot from "../plots/MetricPlot.svelte"; import JobInfo from "./JobInfo.svelte"; import JobFootprint from "../JobFootprint.svelte"; - import JobFootprintBars from "../JobFootprintBars.svelte"; import { maxScope, checkMetricDisabled } from "../utils.js"; export let job; @@ -139,14 +138,8 @@ {:else} {#if showFootprint} - - Date: Wed, 22 Nov 2023 12:12:36 +0100 Subject: [PATCH 18/47] Switch from title to sveltestrap tooltip --- web/frontend/src/JobFootprint.svelte | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/web/frontend/src/JobFootprint.svelte b/web/frontend/src/JobFootprint.svelte index 9a06d2f6..069e3d89 100644 --- a/web/frontend/src/JobFootprint.svelte +++ b/web/frontend/src/JobFootprint.svelte @@ -7,6 +7,7 @@ CardBody, Progress, Icon, + Tooltip } from "sveltestrap"; import { mean, round } from 'mathjs' // import { findThresholds } from './plots/MetricPlot.svelte' @@ -17,17 +18,17 @@ export let view = 'job' export let width = 'auto' - // console.log('CLUSTER', job.cluster) + console.log('CLUSTER', job.cluster) const footprintMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] // 'acc_utilization' / missing: energy , move to central config before deployment - // console.log('JMs', jobMetrics.filter((jm) => footprintMetrics.includes(jm.name))) + console.log('JMs', jobMetrics.filter((jm) => footprintMetrics.includes(jm.name))) const footprintMetricConfigs = footprintMetrics.map((fm) => { return getContext('metrics')(job.cluster, fm) }).filter( Boolean ) // Filter only "truthy" vals, see: https://stackoverflow.com/questions/28607451/removing-undefined-values-from-array - // console.log("FMCs", footprintMetricConfigs) + console.log("FMCs", footprintMetricConfigs) // const footprintMetricThresholds = footprintMetricConfigs.map((fmc) => { // Only required if scopes smaller than node required // return {name: fmc.name, ...findThresholds(fmc, 'node', job?.subCluster ? job.subCluster : '')} // Merge 2 objects @@ -205,7 +206,7 @@ } }).filter( Boolean ) - // console.log("FPD", footprintData) + console.log("FPD", footprintData) @@ -218,10 +219,10 @@ {/if} - {#each footprintData as fpd} + {#each footprintData as fpd, index}
    -
    {fpd.name}
    -
    +
     {fpd.name}
    +
    {#if fpd.impact === 3} @@ -252,11 +253,12 @@
    - {fpd.avg} / {fpd.max} {fpd.unit} + {fpd.avg} / {fpd.max} {fpd.unit}  
    + {fpd.message}
    -
    +
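Patch 19 below scales the footprint thresholds for shared jobs via findJobThresholds(fmc, job, subclusterConfig), whose body is not visible in this excerpt. A sketch of the idea under stated assumptions — the node-fraction heuristic (numHWThreads over hardware threads per node) is an assumption, not confirmed by the diff:

import { round } from 'mathjs';

// Sketch: scale config thresholds for non-exclusive (shared) jobs so that a
// job using half a node is judged against half the node-level thresholds.
export function findJobThresholds(metricConfig, job, subClusterConfig) {
  if (!metricConfig || !job || !subClusterConfig) return null;
  const defaults = {
    peak: metricConfig.peak,
    normal: metricConfig.normal,
    caution: metricConfig.caution,
    alert: metricConfig.alert,
  };
  if (job.exclusive === 1) return defaults; // exclusive job: node-level thresholds
  // Assumed heuristic: fraction of the node's hardware threads the job holds.
  const jobFraction = job.numHWThreads / subClusterConfig.topology.node.length;
  return {
    peak: round(defaults.peak * jobFraction, 0),
    normal: round(defaults.normal * jobFraction, 0),
    caution: round(defaults.caution * jobFraction, 0),
    alert: round(defaults.alert * jobFraction, 0),
  };
}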
    Date: Thu, 23 Nov 2023 12:15:35 +0100 Subject: [PATCH 19/47] Add threshold scaling based on used resources - required for shared jobs --- web/frontend/src/JobFootprint.svelte | 135 +++++++++++++++++++++------ 1 file changed, 104 insertions(+), 31 deletions(-) diff --git a/web/frontend/src/JobFootprint.svelte b/web/frontend/src/JobFootprint.svelte index 069e3d89..5b5e3b1a 100644 --- a/web/frontend/src/JobFootprint.svelte +++ b/web/frontend/src/JobFootprint.svelte @@ -10,7 +10,6 @@ Tooltip } from "sveltestrap"; import { mean, round } from 'mathjs' - // import { findThresholds } from './plots/MetricPlot.svelte' // import { formatNumber, scaleNumbers } from './units.js' export let job @@ -18,9 +17,29 @@ export let view = 'job' export let width = 'auto' - console.log('CLUSTER', job.cluster) + const isAcceleratedJob = (job.numAcc !== 0) + const isSharedJob = (job.exclusive !== 1) - const footprintMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] // 'acc_utilization' / missing: energy , move to central config before deployment + // console.log('JOB', job) + console.log('ACCELERATED?', isAcceleratedJob) + console.log('SHARED?', isSharedJob) + + const clusters = getContext('clusters') + const subclusterConfig = clusters.find((c) => c.name == job.cluster).subClusters.find((sc) => sc.name == job.subCluster) + + console.log('SCC', subclusterConfig) + + /* NOTES: + - 'mem_allocated' für shared jobs (noch todo / nicht in den jobdaten enthalten bisher) + > For now: 'acc_util' gegen 'mem_used' für alex + - Energy Metric Missiing, muss eingebaut werden + - Diese Config in config.json? + - Erste 5 / letzte 5 pts für avg auslassen? (Wenn minimallänge erreicht?) // Peak limited => Hier eigentlich nicht mein Proble, Ich zeige nur daten an die geliefert werden + */ + + const footprintMetrics = isAcceleratedJob ? + ['cpu_load', 'flops_any', 'acc_utilization', 'mem_bw'] : + ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] console.log('JMs', jobMetrics.filter((jm) => footprintMetrics.includes(jm.name))) @@ -30,20 +49,20 @@ console.log("FMCs", footprintMetricConfigs) - // const footprintMetricThresholds = footprintMetricConfigs.map((fmc) => { // Only required if scopes smaller than node required - // return {name: fmc.name, ...findThresholds(fmc, 'node', job?.subCluster ? job.subCluster : '')} // Merge 2 objects - // }).filter( Boolean ) + const footprintMetricThresholds = footprintMetricConfigs.map((fmc) => { + return {name: fmc.name, ...findJobThresholds(fmc, job, subclusterConfig)} + }).filter( Boolean ) - // console.log("FMTs", footprintMetricThresholds) + console.log("FMTs", footprintMetricThresholds) const footprintData = footprintMetrics.map((fm) => { const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === 'node') // ... get Mean let mv = null if (jm?.metric?.statisticsSeries) { - mv = round(mean(jm.metric.statisticsSeries.mean), 2) + mv = round(mean(jm.metric.statisticsSeries.mean), 2) // see above } else if (jm?.metric?.series[0]) { - mv = jm.metric.series[0].statistics.avg + mv = jm.metric.series[0].statistics.avg // see above } // ... get Unit let unit = null @@ -52,13 +71,12 @@ } else { unit = '' } - // From MetricConfig: Scope only for scaling -> Not of interest here - const metricConfig = footprintMetricConfigs.find((fmc) => fmc.name === fm) - // ... get Thresholds - const levelPeak = fm === 'flops_any' ? 
round((metricConfig.peak * 0.85), 0) - mv : metricConfig.peak - mv // Scale flops_any down
-      const levelNormal = metricConfig.normal - mv
-      const levelCaution = metricConfig.caution - mv
-      const levelAlert = metricConfig.alert - mv
+      // Get Threshold Limits from scaled Thresholds per Metric
+      const scaledThresholds = footprintMetricThresholds.find((fmc) => fmc.name === fm)
+      const levelPeak = fm === 'flops_any' ? round((scaledThresholds.peak * 0.85), 0) - mv : scaledThresholds.peak - mv // Scale flops_any down
+      const levelNormal = scaledThresholds.normal - mv
+      const levelCaution = scaledThresholds.caution - mv
+      const levelAlert = scaledThresholds.alert - mv
       // Collect
       if (fm !== 'mem_used') { // Alert if usage is low, peak as maximum possible (scaled down for flops_any)
           if (levelAlert > 0) {
               return {
                   name: fm,
                   unit: unit,
                   avg: mv,
-                  max: fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) : metricConfig.peak,
+                  max: fm === 'flops_any' ? round((scaledThresholds.peak * 0.85), 0) : scaledThresholds.peak,
                   color: 'danger',
                   message: 'Metric strongly below common levels!',
                   impact: 3
               }
           } else if (levelCaution > 0) {
               return {
                   name: fm,
                   unit: unit,
                   avg: mv,
-                  max: fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) : metricConfig.peak,
+                  max: fm === 'flops_any' ? round((scaledThresholds.peak * 0.85), 0) : scaledThresholds.peak,
                   color: 'warning',
                   message: 'Metric below common levels',
                   impact: 2
               }
           } else if (levelNormal > 0) {
               return {
                   name: fm,
                   unit: unit,
                   avg: mv,
-                  max: fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) : metricConfig.peak,
+                  max: fm === 'flops_any' ? round((scaledThresholds.peak * 0.85), 0) : scaledThresholds.peak,
                   color: 'success',
                   message: 'Metric within common levels',
                   impact: 1
               }
           } else if (levelPeak > 0) {
               return {
                   name: fm,
                   unit: unit,
                   avg: mv,
-                  max: fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) : metricConfig.peak,
+                  max: fm === 'flops_any' ? round((scaledThresholds.peak * 0.85), 0) : scaledThresholds.peak,
                   color: 'info',
                   message: 'Metric performs better than common levels',
                   impact: 0
               }
           } else { // Possible artifacts - <5% Margin OK, >5% warning, > 50% danger
               const checkData = {
                   name: fm,
                   unit: unit,
                   avg: mv,
                   max: fm === 'flops_any' ? 
round((scaledThresholds.peak * 0.85), 0) : scaledThresholds.peak } if (checkData.avg >= (1.5 * checkData.max)) { return { ...checkData, - color: 'danger', + color: 'secondary', message: 'Metric average at least 50% above common peak value: Check data for artifacts!', impact: -2 } } else if (checkData.avg >= (1.05 * checkData.max)) { return { ...checkData, - color: 'warning', + color: 'secondary', message: 'Metric average at least 5% above common peak value: Check data for artifacts', impact: -1 } @@ -138,19 +156,19 @@ name: fm, unit: unit, avg: mv, - max: metricConfig.peak + max: scaledThresholds.peak } if (checkData.avg >= (1.5 * checkData.max)) { return { ...checkData, - color: 'danger', + color: 'secondary', message: 'Memory usage at least 50% above possible maximum value: Check data for artifacts!', impact: -2 } } else if (checkData.avg >= (1.05 * checkData.max)) { return { ...checkData, - color: 'warning', + color: 'secondary', message: 'Memory usage at least 5% above possible maximum value: Check data for artifacts!', impact: -1 } @@ -167,7 +185,7 @@ name: fm, unit: unit, avg: mv, - max: metricConfig.peak, + max: scaledThresholds.peak, color: 'danger', message: 'Memory usage extremely above common levels!', impact: 4 @@ -177,7 +195,7 @@ name: fm, unit: unit, avg: mv, - max: metricConfig.peak, + max: scaledThresholds.peak, color: 'danger', message: 'Memory usage strongly above common levels!', impact: 3 @@ -187,7 +205,7 @@ name: fm, unit: unit, avg: mv, - max: metricConfig.peak, + max: scaledThresholds.peak, color: 'warning', message: 'Memory usage above common levels', impact: 2 @@ -197,7 +215,7 @@ name: fm, unit: unit, avg: mv, - max: metricConfig.peak, + max: scaledThresholds.peak, color: 'success', message: 'Memory usage within common levels', impact: 1 @@ -210,11 +228,66 @@ + + {#if view === 'job'} - Core Metrics Footprint + Core Metrics Footprint {isSharedJob ? '(Scaled)' : ''} {/if} From 4e375ff32bd7c024cb9d3515696f54deaa258f80 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 24 Nov 2023 10:36:22 +0100 Subject: [PATCH 20/47] Handle accelerated and shared jobs --- web/frontend/src/JobFootprint.svelte | 87 +++++++++++++++++----------- web/frontend/src/joblist/Row.svelte | 2 +- 2 files changed, 53 insertions(+), 36 deletions(-) diff --git a/web/frontend/src/JobFootprint.svelte b/web/frontend/src/JobFootprint.svelte index 5b5e3b1a..cf3e227a 100644 --- a/web/frontend/src/JobFootprint.svelte +++ b/web/frontend/src/JobFootprint.svelte @@ -20,7 +20,7 @@ const isAcceleratedJob = (job.numAcc !== 0) const isSharedJob = (job.exclusive !== 1) - // console.log('JOB', job) + console.log('JOB', job) console.log('ACCELERATED?', isAcceleratedJob) console.log('SHARED?', isSharedJob) @@ -34,12 +34,15 @@ > For now: 'acc_util' gegen 'mem_used' für alex - Energy Metric Missiing, muss eingebaut werden - Diese Config in config.json? - - Erste 5 / letzte 5 pts für avg auslassen? (Wenn minimallänge erreicht?) // Peak limited => Hier eigentlich nicht mein Proble, Ich zeige nur daten an die geliefert werden */ - const footprintMetrics = isAcceleratedJob ? - ['cpu_load', 'flops_any', 'acc_utilization', 'mem_bw'] : - ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] + const footprintMetrics = isAcceleratedJob + ? isSharedJob + ? ['cpu_load', 'flops_any', 'acc_utilization'] + : ['cpu_load', 'flops_any', 'acc_utilization', 'mem_bw'] + : isSharedJob + ? 
['cpu_load', 'flops_any', 'mem_used'] + : ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] console.log('JMs', jobMetrics.filter((jm) => footprintMetrics.includes(jm.name))) @@ -60,9 +63,12 @@ // ... get Mean let mv = null if (jm?.metric?.statisticsSeries) { - mv = round(mean(jm.metric.statisticsSeries.mean), 2) // see above - } else if (jm?.metric?.series[0]) { - mv = jm.metric.series[0].statistics.avg // see above + mv = round(mean(jm.metric.statisticsSeries.mean), 2) + } else if (jm?.metric?.series?.length > 1) { + const avgs = jm.metric.series.map(jms => jms.statistics.avg) + mv = round(mean(avgs), 2) + } else { + mv = jm.metric.series[0].statistics.avg } // ... get Unit let unit = null @@ -238,15 +244,11 @@ return null } - if (job.numHWThreads == subClusterConfig.topology.node.length || // Job uses all available HWTs of one node - job.numAcc == subClusterConfig.topology.accelerators.length || // Job uses all available GPUs of one node - metricConfig.aggregation == 'avg' ){ // Metric uses "average" aggregation method - - console.log('Job uses all available Resources of one node OR uses "average" aggregation method, use unscaled thresholds') - - let subclusterThresholds = metricConfig.subClusters.find(sc => sc.name == subClusterConfig.name) + let subclusterThresholds = metricConfig.subClusters.find(sc => sc.name == subClusterConfig.name) + if (job.exclusive === 1) { // Exclusive: Use as defined + console.log('Job is exclusive: Use as defined') if (subclusterThresholds) { - console.log('subClusterThresholds found, use subCluster specific thresholds:', subclusterThresholds) + console.log('subClusterThresholds found: use subCluster specific thresholds', subclusterThresholds) return { peak: subclusterThresholds.peak, normal: subclusterThresholds.normal, @@ -254,32 +256,47 @@ alert: subclusterThresholds.alert } } - return { peak: metricConfig.peak, normal: metricConfig.normal, caution: metricConfig.caution, alert: metricConfig.alert } - } - - if (metricConfig.aggregation != 'sum') { - console.warn('Missing or unkown aggregation mode (sum/avg) for metric:', metricConfig) - return null - } - - /* Adapt based on numAccs? */ - const jobFraction = job.numHWThreads / subClusterConfig.topology.node.length - //const fractionAcc = job.numAcc / subClusterConfig.topology.accelerators.length - - console.log('Fraction', jobFraction) + } else { // Shared + if (metricConfig.aggregation === 'avg' ){ + console.log('metric uses "average" aggregation method: use unscaled thresholds except if cpu_load') + if (subclusterThresholds) { + console.log('subClusterThresholds found: use subCluster specific thresholds', subclusterThresholds) + console.log('PEAK/NORMAL USED', metricConfig.name === 'cpu_load' ? job.numHWThreads : subclusterThresholds.peak) + return { // If 'cpu_load': Peak/Normal === #HWThreads, keep other thresholds + peak: metricConfig.name === 'cpu_load' ? job.numHWThreads : subclusterThresholds.peak, + normal: metricConfig.name === 'cpu_load' ? job.numHWThreads : subclusterThresholds.normal, + caution: subclusterThresholds.caution, + alert: subclusterThresholds.alert + } + } + console.log('PEAK/NORMAL USED', metricConfig.name === 'cpu_load' ? job.numHWThreads : metricConfig.peak) + return { + peak: metricConfig.name === 'cpu_load' ? job.numHWThreads : metricConfig.peak, + normal: metricConfig.name === 'cpu_load' ? 
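                // NOTE: this is the shared-job special case -- for 'cpu_load' the
                // reference scales with job size, so peak/normal are replaced by
                // the job's own hwthread count below; e.g. a job holding 16 of 72
                // hwthreads is considered fully loaded at a load of 16 (numbers
                // illustrative).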
job.numHWThreads : metricConfig.normal, + caution: metricConfig.caution, + alert: metricConfig.alert + } + } else if (metricConfig.aggregation === 'sum' ){ + const jobFraction = job.numHWThreads / subClusterConfig.topology.node.length + console.log('Fraction', jobFraction) - return { - peak: round((metricConfig.peak * jobFraction), 0), - normal: round((metricConfig.normal * jobFraction), 0), - caution: round((metricConfig.caution * jobFraction), 0), - alert: round((metricConfig.alert * jobFraction), 0) - } + return { + peak: round((metricConfig.peak * jobFraction), 0), + normal: round((metricConfig.normal * jobFraction), 0), + caution: round((metricConfig.caution * jobFraction), 0), + alert: round((metricConfig.alert * jobFraction), 0) + } + } else { + console.warn('Missing or unkown aggregation mode (sum/avg) for metric:', metricConfig) + return null + } + } // Other job.exclusive cases? } diff --git a/web/frontend/src/joblist/Row.svelte b/web/frontend/src/joblist/Row.svelte index 3ecfc51e..71bc8057 100644 --- a/web/frontend/src/joblist/Row.svelte +++ b/web/frontend/src/joblist/Row.svelte @@ -72,7 +72,7 @@ let queryMetrics = null $: if (showFootprint) { - queryMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', ...metrics].filter(distinct) + queryMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', 'acc_utilization', ...metrics].filter(distinct) scopes = ["node"] } else { queryMetrics = [...metrics] From e34623b1ceeae2124ffe4f0ffddf83dfad943b50 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 24 Nov 2023 15:11:38 +0100 Subject: [PATCH 21/47] Add db average stats to gql, use in footprint --- api/schema.graphqls | 5 + internal/graph/generated/generated.go | 226 ++++++++++++++++++++++++ internal/repository/job.go | 5 +- pkg/schema/job.go | 8 +- tools/archive-migration/job.go | 8 +- web/frontend/src/Job.root.svelte | 3 +- web/frontend/src/JobFootprint.svelte | 21 ++- web/frontend/src/joblist/JobList.svelte | 3 + 8 files changed, 264 insertions(+), 15 deletions(-) diff --git a/api/schema.graphqls b/api/schema.graphqls index 69e32e2b..01eabc21 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -28,6 +28,11 @@ type Job { resources: [Resource!]! 
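  # NOTE: the four footprint fields added below are nullable Float (not Float!)
  # so that jobs without precomputed averages simply return null. A client query
  # would look roughly like this (sketch, using the field names added here):
  #   query { job(id: "123") { loadAvg flopsAnyAvg memBwAvg memUsedMax } }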
concurrentJobs: JobLinkResultList + memUsedMax: Float + flopsAnyAvg: Float + memBwAvg: Float + loadAvg: Float + metaData: Any userData: User } diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index f29e2a03..6778e76d 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -88,8 +88,12 @@ type ComplexityRoot struct { ConcurrentJobs func(childComplexity int) int Duration func(childComplexity int) int Exclusive func(childComplexity int) int + FlopsAnyAvg func(childComplexity int) int ID func(childComplexity int) int JobID func(childComplexity int) int + LoadAvg func(childComplexity int) int + MemBwAvg func(childComplexity int) int + MemUsedMax func(childComplexity int) int MetaData func(childComplexity int) int MonitoringStatus func(childComplexity int) int NumAcc func(childComplexity int) int @@ -303,6 +307,7 @@ type JobResolver interface { Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) + MetaData(ctx context.Context, obj *schema.Job) (interface{}, error) UserData(ctx context.Context, obj *schema.Job) (*model.User, error) } @@ -485,6 +490,13 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.Job.Exclusive(childComplexity), true + case "Job.flopsAnyAvg": + if e.complexity.Job.FlopsAnyAvg == nil { + break + } + + return e.complexity.Job.FlopsAnyAvg(childComplexity), true + case "Job.id": if e.complexity.Job.ID == nil { break @@ -499,6 +511,27 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.Job.JobID(childComplexity), true + case "Job.loadAvg": + if e.complexity.Job.LoadAvg == nil { + break + } + + return e.complexity.Job.LoadAvg(childComplexity), true + + case "Job.memBwAvg": + if e.complexity.Job.MemBwAvg == nil { + break + } + + return e.complexity.Job.MemBwAvg(childComplexity), true + + case "Job.memUsedMax": + if e.complexity.Job.MemUsedMax == nil { + break + } + + return e.complexity.Job.MemUsedMax(childComplexity), true + case "Job.metaData": if e.complexity.Job.MetaData == nil { break @@ -1628,6 +1661,11 @@ type Job { resources: [Resource!]! 
concurrentJobs: JobLinkResultList + memUsedMax: Float + flopsAnyAvg: Float + memBwAvg: Float + loadAvg: Float + metaData: Any userData: User } @@ -4054,6 +4092,170 @@ func (ec *executionContext) fieldContext_Job_concurrentJobs(ctx context.Context, return fc, nil } +func (ec *executionContext) _Job_memUsedMax(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Job_memUsedMax(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { + ctx = rctx // use context from middleware stack in children + return obj.MemUsedMax, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(float64) + fc.Result = res + return ec.marshalOFloat2float64(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Job_memUsedMax(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Job", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Float does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Job_flopsAnyAvg(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Job_flopsAnyAvg(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { + ctx = rctx // use context from middleware stack in children + return obj.FlopsAnyAvg, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(float64) + fc.Result = res + return ec.marshalOFloat2float64(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Job_flopsAnyAvg(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Job", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Float does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Job_memBwAvg(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Job_memBwAvg(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { + ctx = rctx // use context from middleware stack in children + return obj.MemBwAvg, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(float64) + fc.Result = res + return 
ec.marshalOFloat2float64(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Job_memBwAvg(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Job", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Float does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Job_loadAvg(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Job_loadAvg(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { + ctx = rctx // use context from middleware stack in children + return obj.LoadAvg, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(float64) + fc.Result = res + return ec.marshalOFloat2float64(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Job_loadAvg(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Job", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Float does not have child fields") + }, + } + return fc, nil +} + func (ec *executionContext) _Job_metaData(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { fc, err := ec.fieldContext_Job_metaData(ctx, field) if err != nil { @@ -4778,6 +4980,14 @@ func (ec *executionContext) fieldContext_JobResultList_items(ctx context.Context return ec.fieldContext_Job_resources(ctx, field) case "concurrentJobs": return ec.fieldContext_Job_concurrentJobs(ctx, field) + case "memUsedMax": + return ec.fieldContext_Job_memUsedMax(ctx, field) + case "flopsAnyAvg": + return ec.fieldContext_Job_flopsAnyAvg(ctx, field) + case "memBwAvg": + return ec.fieldContext_Job_memBwAvg(ctx, field) + case "loadAvg": + return ec.fieldContext_Job_loadAvg(ctx, field) case "metaData": return ec.fieldContext_Job_metaData(ctx, field) case "userData": @@ -7152,6 +7362,14 @@ func (ec *executionContext) fieldContext_Query_job(ctx context.Context, field gr return ec.fieldContext_Job_resources(ctx, field) case "concurrentJobs": return ec.fieldContext_Job_concurrentJobs(ctx, field) + case "memUsedMax": + return ec.fieldContext_Job_memUsedMax(ctx, field) + case "flopsAnyAvg": + return ec.fieldContext_Job_flopsAnyAvg(ctx, field) + case "memBwAvg": + return ec.fieldContext_Job_memBwAvg(ctx, field) + case "loadAvg": + return ec.fieldContext_Job_loadAvg(ctx, field) case "metaData": return ec.fieldContext_Job_metaData(ctx, field) case "userData": @@ -12504,6 +12722,14 @@ func (ec *executionContext) _Job(ctx context.Context, sel ast.SelectionSet, obj } out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + case "memUsedMax": + out.Values[i] = ec._Job_memUsedMax(ctx, field, obj) + case "flopsAnyAvg": + out.Values[i] = ec._Job_flopsAnyAvg(ctx, field, obj) + case "memBwAvg": + out.Values[i] = 
ec._Job_memBwAvg(ctx, field, obj) + case "loadAvg": + out.Values[i] = ec._Job_loadAvg(ctx, field, obj) case "metaData": field := field diff --git a/internal/repository/job.go b/internal/repository/job.go index 76834d12..e1a997a9 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -60,7 +60,7 @@ func GetJobRepository() *JobRepository { var jobColumns []string = []string{ "job.id", "job.job_id", "job.user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.partition", "job.array_job_id", "job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state", - "job.duration", "job.walltime", "job.resources", // "job.meta_data", + "job.duration", "job.walltime", "job.resources", "job.mem_used_max", "job.flops_any_avg", "job.mem_bw_avg", "job.load_avg", // "job.meta_data", } func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { @@ -68,7 +68,7 @@ func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { if err := row.Scan( &job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId, &job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State, - &job.Duration, &job.Walltime, &job.RawResources /*&job.RawMetaData*/); err != nil { + &job.Duration, &job.Walltime, &job.RawResources, &job.MemUsedMax, &job.FlopsAnyAvg, &job.MemBwAvg, &job.LoadAvg /*&job.RawMetaData*/); err != nil { log.Warnf("Error while scanning rows (Job): %v", err) return nil, err } @@ -483,6 +483,7 @@ func (r *JobRepository) MarkArchived( case "mem_bw": stmt = stmt.Set("mem_bw_avg", stats.Avg) case "load": + stmt = stmt.Set("load_avg", stats.Avg) case "cpu_load": stmt = stmt.Set("load_avg", stats.Avg) case "net_bw": diff --git a/pkg/schema/job.go b/pkg/schema/job.go index ed3a8b6c..90bf2cbe 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -54,10 +54,10 @@ type Job struct { BaseJob StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` // Start epoch time stamp in seconds StartTime time.Time `json:"startTime"` // Start time as 'time.Time' data type - MemUsedMax float64 `json:"-" db:"mem_used_max"` // MemUsedMax as Float64 - FlopsAnyAvg float64 `json:"-" db:"flops_any_avg"` // FlopsAnyAvg as Float64 - MemBwAvg float64 `json:"-" db:"mem_bw_avg"` // MemBwAvg as Float64 - LoadAvg float64 `json:"-" db:"load_avg"` // LoadAvg as Float64 + MemUsedMax float64 `json:"memUsedMax" db:"mem_used_max"` // MemUsedMax as Float64 + FlopsAnyAvg float64 `json:"flopsAnyAvg" db:"flops_any_avg"` // FlopsAnyAvg as Float64 + MemBwAvg float64 `json:"memBwAvg" db:"mem_bw_avg"` // MemBwAvg as Float64 + LoadAvg float64 `json:"loadAvg" db:"load_avg"` // LoadAvg as Float64 NetBwAvg float64 `json:"-" db:"net_bw_avg"` // NetBwAvg as Float64 NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"` // NetDataVolTotal as Float64 FileBwAvg float64 `json:"-" db:"file_bw_avg"` // FileBwAvg as Float64 diff --git a/tools/archive-migration/job.go b/tools/archive-migration/job.go index cd54d6cc..0dff4b42 100644 --- a/tools/archive-migration/job.go +++ b/tools/archive-migration/job.go @@ -52,10 +52,10 @@ type Job struct { BaseJob StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` // Start epoch time stamp in seconds StartTime time.Time `json:"startTime"` // Start time as 'time.Time' data type - MemUsedMax float64 `json:"-" db:"mem_used_max"` // MemUsedMax as Float64 - 
FlopsAnyAvg float64 `json:"-" db:"flops_any_avg"` // FlopsAnyAvg as Float64 - MemBwAvg float64 `json:"-" db:"mem_bw_avg"` // MemBwAvg as Float64 - LoadAvg float64 `json:"-" db:"load_avg"` // LoadAvg as Float64 + MemUsedMax float64 `json:"memUsedMax" db:"mem_used_max"` // MemUsedMax as Float64 + FlopsAnyAvg float64 `json:"flopsAnyAvg" db:"flops_any_avg"` // FlopsAnyAvg as Float64 + MemBwAvg float64 `json:"memBwAvg" db:"mem_bw_avg"` // MemBwAvg as Float64 + LoadAvg float64 `json:"loadAvg" db:"load_avg"` // LoadAvg as Float64 NetBwAvg float64 `json:"-" db:"net_bw_avg"` // NetBwAvg as Float64 NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"` // NetDataVolTotal as Float64 FileBwAvg float64 `json:"-" db:"file_bw_avg"` // FileBwAvg as Float64 diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index 7bd40f81..1b66e339 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -47,7 +47,8 @@ resources { hostname, hwthreads, accelerators }, metaData, userData { name, email }, - concurrentJobs { items { id, jobId }, count, listQuery } + concurrentJobs { items { id, jobId }, count, listQuery }, + flopsAnyAvg, memBwAvg, loadAvg } `); diff --git a/web/frontend/src/JobFootprint.svelte b/web/frontend/src/JobFootprint.svelte index cf3e227a..4313030c 100644 --- a/web/frontend/src/JobFootprint.svelte +++ b/web/frontend/src/JobFootprint.svelte @@ -31,9 +31,9 @@ /* NOTES: - 'mem_allocated' für shared jobs (noch todo / nicht in den jobdaten enthalten bisher) - > For now: 'acc_util' gegen 'mem_used' für alex + > For now: 'acc_util' gegen 'mem_used' für alex: Mem bw für shared weggefallen: dann wieder vier bars - Energy Metric Missiing, muss eingebaut werden - - Diese Config in config.json? + - footprintMetrics Config in config.json? */ const footprintMetrics = isAcceleratedJob @@ -60,9 +60,15 @@ const footprintData = footprintMetrics.map((fm) => { const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === 'node') - // ... get Mean + // ... get Mean: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata let mv = null - if (jm?.metric?.statisticsSeries) { + if (fm === 'cpu_load' && job.loadAvg !== 0) { + mv = round(job.loadAvg, 2) + } else if (fm === 'flops_any' && job.flopsAnyAvg !== 0) { + mv = round(job.flopsAnyAvg, 2) + } else if (fm === 'mem_bw' && job.memBwAvg !== 0) { + mv = round(job.memBwAvg, 2) + } else if (jm?.metric?.statisticsSeries) { mv = round(mean(jm.metric.statisticsSeries.mean), 2) } else if (jm?.metric?.series?.length > 1) { const avgs = jm.metric.series.map(jms => jms.statistics.avg) @@ -356,6 +362,13 @@ />
    {/each} + {#if job?.metaData?.message}
    {@html job.metaData.message} diff --git a/web/frontend/src/joblist/JobList.svelte b/web/frontend/src/joblist/JobList.svelte index 80363616..5f8d89b8 100644 --- a/web/frontend/src/joblist/JobList.svelte +++ b/web/frontend/src/joblist/JobList.svelte @@ -74,6 +74,9 @@ name } metaData + flopsAnyAvg + memBwAvg + loadAvg } count } From b8213ef6bea754ba6ff79875c86a8435679683c2 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 24 Nov 2023 17:22:06 +0100 Subject: [PATCH 22/47] Remove logs, reduce code --- web/frontend/src/JobFootprint.svelte | 246 +++++++++------------------ 1 file changed, 80 insertions(+), 166 deletions(-) diff --git a/web/frontend/src/JobFootprint.svelte b/web/frontend/src/JobFootprint.svelte index 4313030c..20b03d69 100644 --- a/web/frontend/src/JobFootprint.svelte +++ b/web/frontend/src/JobFootprint.svelte @@ -10,57 +10,25 @@ Tooltip } from "sveltestrap"; import { mean, round } from 'mathjs' - // import { formatNumber, scaleNumbers } from './units.js' export let job export let jobMetrics export let view = 'job' export let width = 'auto' - const isAcceleratedJob = (job.numAcc !== 0) - const isSharedJob = (job.exclusive !== 1) - - console.log('JOB', job) - console.log('ACCELERATED?', isAcceleratedJob) - console.log('SHARED?', isSharedJob) - - const clusters = getContext('clusters') + const clusters = getContext('clusters') const subclusterConfig = clusters.find((c) => c.name == job.cluster).subClusters.find((sc) => sc.name == job.subCluster) - console.log('SCC', subclusterConfig) - - /* NOTES: - - 'mem_allocated' für shared jobs (noch todo / nicht in den jobdaten enthalten bisher) - > For now: 'acc_util' gegen 'mem_used' für alex: Mem bw für shared weggefallen: dann wieder vier bars - - Energy Metric Missiing, muss eingebaut werden - - footprintMetrics Config in config.json? - */ - - const footprintMetrics = isAcceleratedJob - ? isSharedJob + const footprintMetrics = (job.numAcc !== 0) + ? (job.exclusive !== 1) ? ['cpu_load', 'flops_any', 'acc_utilization'] : ['cpu_load', 'flops_any', 'acc_utilization', 'mem_bw'] - : isSharedJob + : (job.exclusive !== 1) ? ['cpu_load', 'flops_any', 'mem_used'] : ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] - console.log('JMs', jobMetrics.filter((jm) => footprintMetrics.includes(jm.name))) - - const footprintMetricConfigs = footprintMetrics.map((fm) => { - return getContext('metrics')(job.cluster, fm) - }).filter( Boolean ) // Filter only "truthy" vals, see: https://stackoverflow.com/questions/28607451/removing-undefined-values-from-array - - console.log("FMCs", footprintMetricConfigs) - - const footprintMetricThresholds = footprintMetricConfigs.map((fmc) => { - return {name: fmc.name, ...findJobThresholds(fmc, job, subclusterConfig)} - }).filter( Boolean ) - - console.log("FMTs", footprintMetricThresholds) - const footprintData = footprintMetrics.map((fm) => { - const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === 'node') - // ... 
get Mean: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata + // Mean: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata let mv = null if (fm === 'cpu_load' && job.loadAvg !== 0) { mv = round(job.loadAvg, 2) @@ -68,94 +36,90 @@ mv = round(job.flopsAnyAvg, 2) } else if (fm === 'mem_bw' && job.memBwAvg !== 0) { mv = round(job.memBwAvg, 2) - } else if (jm?.metric?.statisticsSeries) { - mv = round(mean(jm.metric.statisticsSeries.mean), 2) - } else if (jm?.metric?.series?.length > 1) { - const avgs = jm.metric.series.map(jms => jms.statistics.avg) - mv = round(mean(avgs), 2) - } else { - mv = jm.metric.series[0].statistics.avg + } else { // Calculate from jobMetrics + const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === 'node') + if (jm?.metric?.statisticsSeries) { + mv = round(mean(jm.metric.statisticsSeries.mean), 2) + } else if (jm?.metric?.series?.length > 1) { + const avgs = jm.metric.series.map(jms => jms.statistics.avg) + mv = round(mean(avgs), 2) + } else { + mv = jm.metric.series[0].statistics.avg + } } - // ... get Unit + + // Unit + const fmc = getContext('metrics')(job.cluster, fm) let unit = null - if (jm?.metric?.unit?.base) { - unit = jm.metric.unit.prefix + jm.metric.unit.base + if (fmc?.unit?.base) { + unit = fmc.unit.prefix + fmc.unit.base } else { unit = '' } - // Get Threshold Limits from scaled Thresholds per Metric - const scaledThresholds = footprintMetricThresholds.find((fmc) => fmc.name === fm) - const levelPeak = fm === 'flops_any' ? round((scaledThresholds.peak * 0.85), 0) - mv : scaledThresholds.peak - mv // Scale flops_any down - const levelNormal = scaledThresholds.normal - mv - const levelCaution = scaledThresholds.caution - mv - const levelAlert = scaledThresholds.alert - mv + + // Threshold / -Differences + const fmt = findJobThresholds(job, fmc, subclusterConfig) + const levelPeak = fm === 'flops_any' ? round((fmt.peak * 0.85), 0) - mv : fmt.peak - mv // Scale flops_any down + const levelNormal = fmt.normal - mv + const levelCaution = fmt.caution - mv + const levelAlert = fmt.alert - mv + + // Define basic data + const fmBase = { + name: fm, + unit: unit, + avg: mv, + max: fm === 'flops_any' ? round((fmt.peak * 0.85), 0) : fmt.peak + } + // Collect if (fm !== 'mem_used') { // Alert if usage is low, peak as maxmimum possible (scaled down for flops_any) if (levelAlert > 0) { return { - name: fm, - unit: unit, - avg: mv, - max: fm === 'flops_any' ? round((scaledThresholds.peak * 0.85), 0) : scaledThresholds.peak, + ...fmBase, color: 'danger', message: 'Metric strongly below common levels!', impact: 3 } } else if (levelCaution > 0) { return { - name: fm, - unit: unit, - avg: mv, - max: fm === 'flops_any' ? round((scaledThresholds.peak * 0.85), 0) : scaledThresholds.peak, + ...fmBase, color: 'warning', message: 'Metric below common levels', impact: 2 } } else if (levelNormal > 0) { return { - name: fm, - unit: unit, - avg: mv, - max: fm === 'flops_any' ? round((scaledThresholds.peak * 0.85), 0) : scaledThresholds.peak, + ...fmBase, color: 'success', message: 'Metric within common levels', impact: 1 } } else if (levelPeak > 0) { return { - name: fm, - unit: unit, - avg: mv, - max: fm === 'flops_any' ? 
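          // NOTE: flops_any is judged against 85% of nominal peak (the 0.85
          // factor right below), since sustained application FLOPS rarely reach
          // the theoretical maximum; the same factor is applied to every
          // flops_any 'max' in this component.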
round((scaledThresholds.peak * 0.85), 0) : scaledThresholds.peak, + ...fmBase, color: 'info', message: 'Metric performs better than common levels', impact: 0 } } else { // Possible artifacts - <5% Margin OK, >5% warning, > 50% danger - const checkData = { - name: fm, - unit: unit, - avg: mv, - max: fm === 'flops_any' ? round((scaledThresholds.peak * 0.85), 0) : scaledThresholds.peak - } - - if (checkData.avg >= (1.5 * checkData.max)) { + if (fmBase.avg >= (1.5 * fmBase.max)) { return { - ...checkData, + ...fmBase, color: 'secondary', message: 'Metric average at least 50% above common peak value: Check data for artifacts!', impact: -2 } - } else if (checkData.avg >= (1.05 * checkData.max)) { + } else if (fmBase.avg >= (1.05 * fmBase.max)) { return { - ...checkData, + ...fmBase, color: 'secondary', message: 'Metric average at least 5% above common peak value: Check data for artifacts', impact: -1 } } else { return { - ...checkData, + ...fmBase, color: 'info', message: 'Metric performs better than common levels', impact: 0 @@ -164,29 +128,23 @@ } } else { // Inverse Logic: Alert if usage is high, Peak is bad and limits execution if (levelPeak <= 0 && levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { // Possible artifacts - <5% Margin OK, >5% warning, > 50% danger - const checkData = { - name: fm, - unit: unit, - avg: mv, - max: scaledThresholds.peak - } - if (checkData.avg >= (1.5 * checkData.max)) { + if (fmBase.avg >= (1.5 * fmBase.max)) { return { - ...checkData, + ...fmBase, color: 'secondary', message: 'Memory usage at least 50% above possible maximum value: Check data for artifacts!', impact: -2 } - } else if (checkData.avg >= (1.05 * checkData.max)) { + } else if (fmBase.avg >= (1.05 * fmBase.max)) { return { - ...checkData, + ...fmBase, color: 'secondary', message: 'Memory usage at least 5% above possible maximum value: Check data for artifacts!', impact: -1 } } else { return { - ...checkData, + ...fmBase, color: 'danger', message: 'Memory usage extremely above common levels!', impact: 4 @@ -194,109 +152,72 @@ } } else if (levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { return { - name: fm, - unit: unit, - avg: mv, - max: scaledThresholds.peak, + ...fmBase, color: 'danger', message: 'Memory usage extremely above common levels!', impact: 4 } } else if (levelAlert > 0 && (levelCaution <= 0 && levelNormal <= 0)) { return { - name: fm, - unit: unit, - avg: mv, - max: scaledThresholds.peak, + ...fmBase, color: 'danger', message: 'Memory usage strongly above common levels!', impact: 3 } } else if (levelCaution > 0 && levelNormal <= 0) { return { - name: fm, - unit: unit, - avg: mv, - max: scaledThresholds.peak, + ...fmBase, color: 'warning', message: 'Memory usage above common levels', impact: 2 } } else { return { - name: fm, - unit: unit, - avg: mv, - max: scaledThresholds.peak, + ...fmBase, color: 'success', message: 'Memory usage within common levels', impact: 1 } } } - }).filter( Boolean ) - - console.log("FPD", footprintData) - + }) + + + + (isHistogramConfigOpen = !isHistogramConfigOpen)}> + + Select metrics presented in histograms + + + + + {#each availableMetrics as metric (metric)} + + updateConfiguration({ + name: cluster ? 
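            // NOTE: per-cluster selections are stored under the key pattern
            // 'user_view_histogramMetrics:<cluster>' built just below, while the
            // bare key keeps the cluster-independent default.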
`user_view_histogramMetrics:${cluster}` : 'user_view_histogramMetrics', + value: metricsInHistograms + })} /> + + {metric} + + {/each} + + + + + + diff --git a/web/frontend/src/Status.root.svelte b/web/frontend/src/Status.root.svelte index fffbfde1..563978dd 100644 --- a/web/frontend/src/Status.root.svelte +++ b/web/frontend/src/Status.root.svelte @@ -63,6 +63,8 @@ option.key == ccconfig.status_view_selectedTopUserCategory ); + let metricsInHistograms = ccconfig[`status_view_histogramMetrics:${cluster}`] || ccconfig.status_view_histogramMetrics + const client = getContextClient(); $: mainQuery = queryStore({ client: client, diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index 3871f601..34c56154 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -1,7 +1,7 @@ - + function closeAndApply() { + metricsInHistograms = [...pendingMetrics] // Set for parent + + updateConfiguration({ + name: cluster ? `user_view_histogramMetrics:${cluster}` : 'user_view_histogramMetrics', + value: metricsInHistograms + }) - (isHistogramConfigOpen = !isHistogramConfigOpen)}> + isOpen = false + } + + + (isOpen = !isOpen)}> Select metrics presented in histograms - {#each availableMetrics as metric (metric)} - updateConfiguration({ - name: cluster ? `user_view_histogramMetrics:${cluster}` : 'user_view_histogramMetrics', - value: metricsInHistograms - })} /> - + {metric} {/each} - + diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index 16f22d63..e216aa6f 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -25,9 +25,10 @@ let jobFilters = []; let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false let metrics = ccconfig.plot_list_selectedMetrics, isMetricsSelectionOpen = false - let w1, w2, histogramHeight = 250 + let w1, w2, histogramHeight = 250, isHistogramSelectionOpen = false let selectedCluster = filterPresets?.cluster ? filterPresets.cluster : null - let metricsInHistograms = ccconfig[`user_view_histogramMetrics:${selectedCluster}`] || ccconfig.user_view_histogramMetrics || [] + + $: metricsInHistograms = selectedCluster ? ccconfig[`user_view_histogramMetrics:${selectedCluster}`] : (ccconfig.user_view_histogramMetrics || []) const client = getContextClient(); $: stats = queryStore({ @@ -73,9 +74,11 @@ Metrics - + {/key} @@ -219,4 +222,9 @@ bind:cluster={selectedCluster} configName="plot_list_selectedMetrics" bind:metrics={metrics} - bind:isOpen={isMetricsSelectionOpen} /> \ No newline at end of file + bind:isOpen={isMetricsSelectionOpen} /> + + \ No newline at end of file From 1185737eaa1b48713bae45556dc77b79e9675454 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 8 Dec 2023 12:03:04 +0100 Subject: [PATCH 36/47] Add metrics to histoselect, add userfilters - edit struct to make only count return required --- api/schema.graphqls | 6 +- internal/graph/generated/generated.go | 88 +++++++++------------- internal/graph/model/models_gen.go | 8 +- internal/repository/query.go | 2 +- internal/repository/stats.go | 74 +++++++++++------- web/frontend/src/HistogramSelection.svelte | 2 +- web/frontend/src/User.root.svelte | 2 +- web/frontend/src/utils.js | 18 +++-- 8 files changed, 103 insertions(+), 97 deletions(-) diff --git a/api/schema.graphqls b/api/schema.graphqls index 21a9ad2e..8a43a54c 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -293,10 +293,10 @@ type MetricHistoPoints { } type MetricHistoPoint { - min: Int! - max: Int! 
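# NOTE: matching the subject line ("make only count return required"), count
# stays non-null below while bin/min/max become nullable, so a backend that
# cannot compute bin bounds can still return bare counts.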
+ bin: Int count: Int! - bin: Int! + min: Int + max: Int } type JobsStatistics { diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index f3d4f8a4..12d829a8 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -1969,10 +1969,10 @@ type MetricHistoPoints { } type MetricHistoPoint { - min: Int! - max: Int! + bin: Int count: Int! - bin: Int! + min: Int + max: Int } type JobsStatistics { @@ -6336,8 +6336,8 @@ func (ec *executionContext) fieldContext_MetricFootprints_data(ctx context.Conte return fc, nil } -func (ec *executionContext) _MetricHistoPoint_min(ctx context.Context, field graphql.CollectedField, obj *model.MetricHistoPoint) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_MetricHistoPoint_min(ctx, field) +func (ec *executionContext) _MetricHistoPoint_bin(ctx context.Context, field graphql.CollectedField, obj *model.MetricHistoPoint) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_MetricHistoPoint_bin(ctx, field) if err != nil { return graphql.Null } @@ -6350,24 +6350,21 @@ func (ec *executionContext) _MetricHistoPoint_min(ctx context.Context, field gra }() resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { ctx = rctx // use context from middleware stack in children - return obj.Min, nil + return obj.Bin, nil }) if err != nil { ec.Error(ctx, err) return graphql.Null } if resTmp == nil { - if !graphql.HasFieldError(ctx, fc) { - ec.Errorf(ctx, "must not be null") - } return graphql.Null } - res := resTmp.(int) + res := resTmp.(*int) fc.Result = res - return ec.marshalNInt2int(ctx, field.Selections, res) + return ec.marshalOInt2ᚖint(ctx, field.Selections, res) } -func (ec *executionContext) fieldContext_MetricHistoPoint_min(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { +func (ec *executionContext) fieldContext_MetricHistoPoint_bin(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { fc = &graphql.FieldContext{ Object: "MetricHistoPoint", Field: field, @@ -6380,8 +6377,8 @@ func (ec *executionContext) fieldContext_MetricHistoPoint_min(ctx context.Contex return fc, nil } -func (ec *executionContext) _MetricHistoPoint_max(ctx context.Context, field graphql.CollectedField, obj *model.MetricHistoPoint) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_MetricHistoPoint_max(ctx, field) +func (ec *executionContext) _MetricHistoPoint_count(ctx context.Context, field graphql.CollectedField, obj *model.MetricHistoPoint) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_MetricHistoPoint_count(ctx, field) if err != nil { return graphql.Null } @@ -6394,7 +6391,7 @@ func (ec *executionContext) _MetricHistoPoint_max(ctx context.Context, field gra }() resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { ctx = rctx // use context from middleware stack in children - return obj.Max, nil + return obj.Count, nil }) if err != nil { ec.Error(ctx, err) @@ -6411,7 +6408,7 @@ func (ec *executionContext) _MetricHistoPoint_max(ctx context.Context, field gra return ec.marshalNInt2int(ctx, field.Selections, res) } -func (ec *executionContext) fieldContext_MetricHistoPoint_max(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { +func (ec *executionContext) fieldContext_MetricHistoPoint_count(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { fc = &graphql.FieldContext{ 
Object: "MetricHistoPoint", Field: field, @@ -6424,8 +6421,8 @@ func (ec *executionContext) fieldContext_MetricHistoPoint_max(ctx context.Contex return fc, nil } -func (ec *executionContext) _MetricHistoPoint_count(ctx context.Context, field graphql.CollectedField, obj *model.MetricHistoPoint) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_MetricHistoPoint_count(ctx, field) +func (ec *executionContext) _MetricHistoPoint_min(ctx context.Context, field graphql.CollectedField, obj *model.MetricHistoPoint) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_MetricHistoPoint_min(ctx, field) if err != nil { return graphql.Null } @@ -6438,24 +6435,21 @@ func (ec *executionContext) _MetricHistoPoint_count(ctx context.Context, field g }() resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { ctx = rctx // use context from middleware stack in children - return obj.Count, nil + return obj.Min, nil }) if err != nil { ec.Error(ctx, err) return graphql.Null } if resTmp == nil { - if !graphql.HasFieldError(ctx, fc) { - ec.Errorf(ctx, "must not be null") - } return graphql.Null } - res := resTmp.(int) + res := resTmp.(*int) fc.Result = res - return ec.marshalNInt2int(ctx, field.Selections, res) + return ec.marshalOInt2ᚖint(ctx, field.Selections, res) } -func (ec *executionContext) fieldContext_MetricHistoPoint_count(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { +func (ec *executionContext) fieldContext_MetricHistoPoint_min(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { fc = &graphql.FieldContext{ Object: "MetricHistoPoint", Field: field, @@ -6468,8 +6462,8 @@ func (ec *executionContext) fieldContext_MetricHistoPoint_count(ctx context.Cont return fc, nil } -func (ec *executionContext) _MetricHistoPoint_bin(ctx context.Context, field graphql.CollectedField, obj *model.MetricHistoPoint) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_MetricHistoPoint_bin(ctx, field) +func (ec *executionContext) _MetricHistoPoint_max(ctx context.Context, field graphql.CollectedField, obj *model.MetricHistoPoint) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_MetricHistoPoint_max(ctx, field) if err != nil { return graphql.Null } @@ -6482,24 +6476,21 @@ func (ec *executionContext) _MetricHistoPoint_bin(ctx context.Context, field gra }() resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { ctx = rctx // use context from middleware stack in children - return obj.Bin, nil + return obj.Max, nil }) if err != nil { ec.Error(ctx, err) return graphql.Null } if resTmp == nil { - if !graphql.HasFieldError(ctx, fc) { - ec.Errorf(ctx, "must not be null") - } return graphql.Null } - res := resTmp.(int) + res := resTmp.(*int) fc.Result = res - return ec.marshalNInt2int(ctx, field.Selections, res) + return ec.marshalOInt2ᚖint(ctx, field.Selections, res) } -func (ec *executionContext) fieldContext_MetricHistoPoint_bin(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { +func (ec *executionContext) fieldContext_MetricHistoPoint_max(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { fc = &graphql.FieldContext{ Object: "MetricHistoPoint", Field: field, @@ -6636,14 +6627,14 @@ func (ec *executionContext) fieldContext_MetricHistoPoints_data(ctx context.Cont IsResolver: false, Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { switch 
field.Name { + case "bin": + return ec.fieldContext_MetricHistoPoint_bin(ctx, field) + case "count": + return ec.fieldContext_MetricHistoPoint_count(ctx, field) case "min": return ec.fieldContext_MetricHistoPoint_min(ctx, field) case "max": return ec.fieldContext_MetricHistoPoint_max(ctx, field) - case "count": - return ec.fieldContext_MetricHistoPoint_count(ctx, field) - case "bin": - return ec.fieldContext_MetricHistoPoint_bin(ctx, field) } return nil, fmt.Errorf("no field named %q was found under type MetricHistoPoint", field.Name) }, @@ -13542,26 +13533,17 @@ func (ec *executionContext) _MetricHistoPoint(ctx context.Context, sel ast.Selec switch field.Name { case "__typename": out.Values[i] = graphql.MarshalString("MetricHistoPoint") - case "min": - out.Values[i] = ec._MetricHistoPoint_min(ctx, field, obj) - if out.Values[i] == graphql.Null { - out.Invalids++ - } - case "max": - out.Values[i] = ec._MetricHistoPoint_max(ctx, field, obj) - if out.Values[i] == graphql.Null { - out.Invalids++ - } - case "count": - out.Values[i] = ec._MetricHistoPoint_count(ctx, field, obj) - if out.Values[i] == graphql.Null { - out.Invalids++ - } case "bin": out.Values[i] = ec._MetricHistoPoint_bin(ctx, field, obj) + case "count": + out.Values[i] = ec._MetricHistoPoint_count(ctx, field, obj) if out.Values[i] == graphql.Null { out.Invalids++ } + case "min": + out.Values[i] = ec._MetricHistoPoint_min(ctx, field, obj) + case "max": + out.Values[i] = ec._MetricHistoPoint_max(ctx, field, obj) default: panic("unknown field " + strconv.Quote(field.Name)) } diff --git a/internal/graph/model/models_gen.go b/internal/graph/model/models_gen.go index eb35bda2..7b8ebd21 100644 --- a/internal/graph/model/models_gen.go +++ b/internal/graph/model/models_gen.go @@ -110,10 +110,10 @@ type MetricFootprints struct { } type MetricHistoPoint struct { - Min int `json:"min"` - Max int `json:"max"` - Count int `json:"count"` - Bin int `json:"bin"` + Bin *int `json:"bin,omitempty"` + Count int `json:"count"` + Min *int `json:"min,omitempty"` + Max *int `json:"max,omitempty"` } type MetricHistoPoints struct { diff --git a/internal/repository/query.go b/internal/repository/query.go index 84b80481..317302bc 100644 --- a/internal/repository/query.go +++ b/internal/repository/query.go @@ -96,7 +96,7 @@ func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilde user := GetUserFromContext(ctx) if user == nil { var qnil sq.SelectBuilder - return qnil, fmt.Errorf("user context is nil!") + return qnil, fmt.Errorf("user context is nil") } else if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport, schema.RoleApi}) { // Admin & Co. 
: All jobs return query, nil } else if user.HasRole(schema.RoleManager) { // Manager : Add filter for managed projects' jobs only + personal jobs diff --git a/internal/repository/stats.go b/internal/repository/stats.go index bd870a45..ab70427e 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -460,13 +460,8 @@ func (r *JobRepository) AddMetricHistograms( stat *model.JobsStatistics) (*model.JobsStatistics, error) { start := time.Now() - for i, m := range metrics { - // DEBUG - fmt.Println(i, m) - var err error - var metricHisto *model.MetricHistoPoints - - metricHisto, err = r.jobsMetricStatisticsHistogram(ctx, m, filter) + for _, m := range metrics { + metricHisto, err := r.jobsMetricStatisticsHistogram(ctx, m, filter) if err != nil { log.Warnf("Error while loading job metric statistics histogram: %s", m) continue @@ -529,6 +524,12 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( dbMetric = "flops_any_avg" case "mem_bw": dbMetric = "mem_bw_avg" + case "mem_used": + dbMetric = "mem_used_max" + case "net_bw": + dbMetric = "net_bw_avg" + case "file_bw": + dbMetric = "file_bw_avg" default: return nil, fmt.Errorf("%s not implemented", metric) } @@ -562,46 +563,67 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( } } + // log.Debugf("Metric %s: DB %s, Peak %f, Unit %s", metric, dbMetric, peak, unit) // Make bins, see https://jereze.com/code/sql-histogram/ - // Diffs: - // CAST(X AS INTEGER) instead of floor(X), used also for for Min , Max selection - // renamed to bin for simplicity and model struct - // Ditched rename from job to data, as it conflicts with security check afterwards + start := time.Now() - prepQuery := sq.Select( - fmt.Sprintf(`CAST(min(job.%s) as INTEGER) as min`, dbMetric), - fmt.Sprintf(`CAST(max(job.%s) as INTEGER) as max`, dbMetric), + + crossJoinQuery := sq.Select( + fmt.Sprintf(`max(%s) as max`, dbMetric), + fmt.Sprintf(`min(%s) as min`, dbMetric), + ).From("job").Where( + fmt.Sprintf(`%s is not null`, dbMetric), + ).Where( + fmt.Sprintf(`%s <= %f`, dbMetric, peak), + ) + + crossJoinQuery, cjqerr := SecurityCheck(ctx, crossJoinQuery) + if cjqerr != nil { + return nil, cjqerr + } + + crossJoinQuerySql, _, sqlerr := crossJoinQuery.ToSql() + if sqlerr != nil { + return nil, sqlerr + } + + bins := 10 + binQuery := fmt.Sprintf(`CAST( (case when job.%s = value.max then value.max*0.999999999 else job.%s end - value.min) / (value.max - value.min) * %d as INTEGER )`, dbMetric, dbMetric, bins) + + mainQuery := sq.Select( + fmt.Sprintf(`%s + 1 as bin`, binQuery), fmt.Sprintf(`count(job.%s) as count`, dbMetric), - fmt.Sprintf(`CAST((case when job.%s = value.max then value.max*0.999999999 else job.%s end - value.min) / (value.max - value.min) * 10 as INTEGER) +1 as bin`, dbMetric, dbMetric)) - prepQuery = prepQuery.From("job") - prepQuery = prepQuery.CrossJoin(fmt.Sprintf(`(select max(%s) as max, min(%s) as min from job where %s is not null and %s < %f) as value`, dbMetric, dbMetric, dbMetric, dbMetric, peak)) - prepQuery = prepQuery.Where(fmt.Sprintf(`job.%s is not null and job.%s < %f`, dbMetric, dbMetric, peak)) + fmt.Sprintf(`CAST(((value.max / %d) * (%s )) as INTEGER ) as min`, bins, binQuery), + fmt.Sprintf(`CAST(((value.max / %d) * (%s + 1 )) as INTEGER ) as max`, bins, binQuery), + ).From("job").CrossJoin( + fmt.Sprintf(`(%s) as value`, crossJoinQuerySql), + ).Where(fmt.Sprintf(`job.%s is not null and job.%s <= %f`, dbMetric, dbMetric, peak)) - query, qerr := SecurityCheck(ctx, prepQuery) + mainQuery, qerr := SecurityCheck(ctx, 
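	// NOTE (sketch): binQuery above maps a value v in [min, max] onto one of
	// `bins` integer buckets; in plain Go the same mapping would read roughly
	// (names illustrative):
	//   bin := int((v - min) / (max - min) * float64(bins))
	//   if v == max {
	//       bin = bins - 1 // mirrors the value.max*0.999999999 trick that
	//   }                  // clamps the maximum into the last bucket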
mainQuery) if qerr != nil { return nil, qerr } for _, f := range filters { - query = BuildWhereClause(f, query) + mainQuery = BuildWhereClause(f, mainQuery) } // Finalize query with Grouping and Ordering - query = query.GroupBy("bin").OrderBy("bin") + mainQuery = mainQuery.GroupBy("bin").OrderBy("bin") - rows, err := query.RunWith(r.DB).Query() + rows, err := mainQuery.RunWith(r.DB).Query() if err != nil { - log.Errorf("Error while running query: %s", err) + log.Errorf("Error while running mainQuery: %s", err) return nil, err } points := make([]*model.MetricHistoPoint, 0) for rows.Next() { point := model.MetricHistoPoint{} - if err := rows.Scan(&point.Min, &point.Max, &point.Count, &point.Bin); err != nil { - log.Warn("Error while scanning rows") - return nil, err + if err := rows.Scan(&point.Bin, &point.Count, &point.Min, &point.Max); err != nil { + log.Warnf("Error while scanning rows for %s", metric) + return nil, err // Totally bricks cc-backend if returned and if all metrics requested? } points = append(points, &point) diff --git a/web/frontend/src/HistogramSelection.svelte b/web/frontend/src/HistogramSelection.svelte index afef8c70..142f6789 100644 --- a/web/frontend/src/HistogramSelection.svelte +++ b/web/frontend/src/HistogramSelection.svelte @@ -4,10 +4,10 @@ import { gql, getContextClient , mutationStore } from '@urql/svelte' export let cluster - export let availableMetrics = ['cpu_load', 'flops_any', 'mem_bw'] export let metricsInHistograms export let isOpen + let availableMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', 'net_bw', 'file_bw'] let pendingMetrics = [...metricsInHistograms] // Copy const client = getContextClient() diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index e216aa6f..a26c1aa6 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -44,7 +44,7 @@ histNumNodes { count, value } histMetrics { metric, unit, data { min, max, count, bin } } }}`, - variables: { jobFilters, metricsInHistograms} + variables: { jobFilters, metricsInHistograms } }) onMount(() => filterComponent.update()) diff --git a/web/frontend/src/utils.js b/web/frontend/src/utils.js index 537ad3fc..794a23a3 100644 --- a/web/frontend/src/utils.js +++ b/web/frontend/src/utils.js @@ -316,16 +316,18 @@ export function checkMetricDisabled(m, c, s) { //[m]etric, [c]luster, [s]ubclust } export function convert2uplot(canvasData) { - // initial use: Canvas Histogram Data to Uplot + // Prep: Uplot Data Structure let uplotData = [[],[]] // [X, Y1, Y2, ...] + // MetricHisto Only: Check if 1st bin not-null -> Set 0-Value bin for scaling + // Else: Only Single 0-Value bin returned -> No reset required + if (canvasData[0]?.bin) { + uplotData[0].push(0) + uplotData[1].push(0) + } + // Iterate canvasData.forEach( cd => { - if (cd.bin) { // MetricHisto Datafromat - // Force Zero Entry for scaling - if (uplotData[0].length == 0) { - uplotData[0].push(0) - uplotData[1].push(0) - } - uplotData[0].push(cd.max) + if (Object.keys(cd).length == 4) { // MetricHisto Datafromat + uplotData[0].push(cd?.max ? 
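            // NOTE: uplot expects column-oriented data, so after this loop
            // uplotData is [[x0, x1, ...], [y0, y1, ...]] -- e.g. three
            // histogram points with max 10/20/30 and counts 1/4/2 become
            // [[0, 10, 20, 30], [0, 1, 4, 2]] (leading zero pair added above
            // for scaling; values illustrative).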
cd.max : 0) uplotData[1].push(cd.count) } else { // Default uplotData[0].push(cd.value) From ee4097a2ddcbd3459da03724777a8de57970eb75 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 11 Dec 2023 13:55:56 +0100 Subject: [PATCH 37/47] Add missing filters to crossjoinquery --- internal/repository/stats.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/repository/stats.go b/internal/repository/stats.go index ab70427e..b5813b9e 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -578,10 +578,15 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( ) crossJoinQuery, cjqerr := SecurityCheck(ctx, crossJoinQuery) + if cjqerr != nil { return nil, cjqerr } + for _, f := range filters { + crossJoinQuery = BuildWhereClause(f, crossJoinQuery) + } + crossJoinQuerySql, _, sqlerr := crossJoinQuery.ToSql() if sqlerr != nil { return nil, sqlerr From 119637cb9bc8e857a1cc2ca0d08ee1c385a7e99c Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Tue, 12 Dec 2023 15:07:23 +0100 Subject: [PATCH 38/47] Fix using crossjoin arguments not used --- internal/repository/stats.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/repository/stats.go b/internal/repository/stats.go index b5813b9e..3ac04901 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -587,7 +587,7 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( crossJoinQuery = BuildWhereClause(f, crossJoinQuery) } - crossJoinQuerySql, _, sqlerr := crossJoinQuery.ToSql() + crossJoinQuerySql, crossJoinQueryArgs, sqlerr := crossJoinQuery.ToSql() if sqlerr != nil { return nil, sqlerr } @@ -601,7 +601,7 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( fmt.Sprintf(`CAST(((value.max / %d) * (%s )) as INTEGER ) as min`, bins, binQuery), fmt.Sprintf(`CAST(((value.max / %d) * (%s + 1 )) as INTEGER ) as max`, bins, binQuery), ).From("job").CrossJoin( - fmt.Sprintf(`(%s) as value`, crossJoinQuerySql), + fmt.Sprintf(`(%s) as value`, crossJoinQuerySql), crossJoinQueryArgs..., ).Where(fmt.Sprintf(`job.%s is not null and job.%s <= %f`, dbMetric, dbMetric, peak)) mainQuery, qerr := SecurityCheck(ctx, mainQuery) From ee6d286cd78b16309e237b50e68d52d3db9a0965 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Tue, 12 Dec 2023 15:42:14 +0100 Subject: [PATCH 39/47] Small corrections --- web/frontend/src/HistogramSelection.svelte | 5 ++--- web/frontend/src/User.root.svelte | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/web/frontend/src/HistogramSelection.svelte b/web/frontend/src/HistogramSelection.svelte index 142f6789..00f558a2 100644 --- a/web/frontend/src/HistogramSelection.svelte +++ b/web/frontend/src/HistogramSelection.svelte @@ -35,13 +35,11 @@ function closeAndApply() { metricsInHistograms = [...pendingMetrics] // Set for parent - + isOpen = !isOpen updateConfiguration({ name: cluster ? 
`user_view_histogramMetrics:${cluster}` : 'user_view_histogramMetrics', value: metricsInHistograms }) - - isOpen = false } @@ -62,5 +60,6 @@ + diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index a26c1aa6..5d9c597b 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -224,7 +224,7 @@ bind:metrics={metrics} bind:isOpen={isMetricsSelectionOpen} /> - \ No newline at end of file From 07073e290a6f02e00929a285fe0080a76e51e2eb Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Tue, 12 Dec 2023 16:46:03 +0100 Subject: [PATCH 40/47] feat: add selectable histograms to status view --- internal/repository/stats.go | 113 ++++++++++++++++++++++++++++ web/frontend/src/Status.root.svelte | 59 ++++++++++++++- 2 files changed, 169 insertions(+), 3 deletions(-) diff --git a/internal/repository/stats.go b/internal/repository/stats.go index 3ac04901..4d7be089 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -8,10 +8,12 @@ import ( "context" "database/sql" "fmt" + "math" "time" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph/model" + "github.com/ClusterCockpit/cc-backend/internal/metricdata" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" @@ -460,6 +462,18 @@ func (r *JobRepository) AddMetricHistograms( stat *model.JobsStatistics) (*model.JobsStatistics, error) { start := time.Now() + // Running Jobs Only: First query job data from sqlite, then load metric averages and make bins + for _, f := range filter { + if f.State != nil { + if len(f.State) == 1 && f.State[0] == "running" { + stat.HistMetrics = r.runningJobsMetricStatisticsHistogram(ctx, metrics, filter) + log.Debugf("Timer AddMetricHistograms %s", time.Since(start)) + return stat, nil + } + } + } + + // All other cases: Query and make bins in sqlite directly for _, m := range metrics { metricHisto, err := r.jobsMetricStatisticsHistogram(ctx, m, filter) if err != nil { @@ -639,3 +653,102 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( log.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start)) return &result, nil } + +func (r *JobRepository) runningJobsMetricStatisticsHistogram( + ctx context.Context, + metrics []string, + filters []*model.JobFilter) []*model.MetricHistoPoints { + + // Get Jobs + jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil) + if err != nil { + log.Errorf("Error while querying jobs for histogram: %s", err) + return nil + } + if len(jobs) > 500 { + log.Errorf("too many jobs matched (max: %d)", 500) + return nil + } + + // Get AVGs from metric repo + avgs := make([][]schema.Float, len(metrics)) + for i := range avgs { + avgs[i] = make([]schema.Float, 0, len(jobs)) + } + + for _, job := range jobs { + if job.MonitoringStatus == schema.MonitoringStatusDisabled || job.MonitoringStatus == schema.MonitoringStatusArchivingFailed { + continue + } + + if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil { + log.Errorf("Error while loading averages for histogram: %s", err) + return nil + } + } + + // Iterate metrics to fill end result + data := make([]*model.MetricHistoPoints, 0) + for idx, metric := range metrics { + // Get specific Peak or largest Peak + var metricConfig *schema.MetricConfig + var peak float64 = 0.0 + var unit string = "" + + for _, f := range filters { + if f.Cluster != nil { + metricConfig = 
archive.GetMetricConfig(*f.Cluster.Eq, metric) + peak = metricConfig.Peak + unit = metricConfig.Unit.Prefix + metricConfig.Unit.Base + log.Debugf("Cluster %s filter found with peak %f for %s", *f.Cluster.Eq, peak, metric) + } + } + + if peak == 0.0 { + for _, c := range archive.Clusters { + for _, m := range c.MetricConfig { + if m.Name == metric { + if m.Peak > peak { + peak = m.Peak + } + if unit == "" { + unit = m.Unit.Prefix + m.Unit.Base + } + } + } + } + } + + // Make and fill bins + bins := 10.0 + peakBin := peak / bins + + points := make([]*model.MetricHistoPoint, 0) + for b := 0; b < 10; b++ { + count := 0 + bindex := b + 1 + bmin := math.Round(peakBin * float64(b)) + bmax := math.Round(peakBin * (float64(b) + 1.0)) + + // Iterate AVG values for indexed metric and count for bins + for _, val := range avgs[idx] { + if float64(val) >= bmin && float64(val) < bmax { + count += 1 + } + } + + bminint := int(bmin) + bmaxint := int(bmax) + + // Append Bin to Metric Result Array + point := model.MetricHistoPoint{Bin: &bindex, Count: count, Min: &bminint, Max: &bmaxint} + points = append(points, &point) + } + + // Append Metric Result Array to final results array + result := model.MetricHistoPoints{Metric: metric, Unit: unit, Data: points} + data = append(data, &result) + } + + return data +} diff --git a/web/frontend/src/Status.root.svelte b/web/frontend/src/Status.root.svelte index 563978dd..95fc98cf 100644 --- a/web/frontend/src/Status.root.svelte +++ b/web/frontend/src/Status.root.svelte @@ -15,6 +15,7 @@ Table, Progress, Icon, + Button } from "sveltestrap"; import { init, convert2uplot, transformPerNodeDataForRoofline } from "./utils.js"; import { scaleNumbers } from "./units.js"; @@ -24,6 +25,8 @@ getContextClient, mutationStore, } from "@urql/svelte"; + import PlotTable from './PlotTable.svelte' + import HistogramSelection from './HistogramSelection.svelte' const { query: initq } = init(); const ccconfig = getContext("cc-config"); @@ -63,7 +66,8 @@ option.key == ccconfig.status_view_selectedTopUserCategory ); - let metricsInHistograms = ccconfig[`status_view_histogramMetrics:${cluster}`] || ccconfig.status_view_histogramMetrics + let isHistogramSelectionOpen = false + $: metricsInHistograms = cluster ? ccconfig[`user_view_histogramMetrics:${cluster}`] : (ccconfig.user_view_histogramMetrics || []) const client = getContextClient(); $: mainQuery = queryStore({ @@ -75,6 +79,7 @@ $metrics: [String!] $from: Time! $to: Time! + $metricsInHistograms: [String!] ) { nodeMetrics( cluster: $cluster @@ -100,7 +105,7 @@ } } - stats: jobsStatistics(filter: $filter) { + stats: jobsStatistics(filter: $filter, metrics: $metricsInHistograms) { histDuration { count value @@ -117,6 +122,16 @@ count value } + histMetrics { + metric + unit + data { + min + max + count + bin + } + } } allocatedNodes(cluster: $cluster) { @@ -131,6 +146,7 @@ from: from.toISOString(), to: to.toISOString(), filter: [{ state: ["running"] }, { cluster: { eq: cluster } }], + metricsInHistograms: metricsInHistograms }, }); @@ -313,7 +329,7 @@

    Current utilization of cluster "{cluster}"

    - + {#if $initq.fetching || $mainQuery.fetching} {:else if $initq.error} @@ -323,6 +339,13 @@ {/if} + + + { @@ -668,4 +691,34 @@ {/key} +
    + {#if metricsInHistograms} + + + {#key $mainQuery.data.stats[0].histMetrics} + + + + + {/key} + + + {/if} {/if} + + From b829a5aafeb3998c663e2afdf1b51286a8b19fa7 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Wed, 13 Dec 2023 11:58:14 +0100 Subject: [PATCH 41/47] Improve binned data histogram legends --- web/frontend/src/Analysis.root.svelte | 5 +++-- web/frontend/src/Status.root.svelte | 5 +++-- web/frontend/src/User.root.svelte | 5 +++-- web/frontend/src/plots/Histogram.svelte | 9 +++++++++ web/frontend/src/utils.js | 6 ------ 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/web/frontend/src/Analysis.root.svelte b/web/frontend/src/Analysis.root.svelte index aa4ae379..163d5115 100644 --- a/web/frontend/src/Analysis.root.svelte +++ b/web/frontend/src/Analysis.root.svelte @@ -389,9 +389,10 @@ diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index 5d9c597b..ad08bc6b 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -192,9 +192,10 @@ diff --git a/web/frontend/src/plots/Histogram.svelte b/web/frontend/src/plots/Histogram.svelte index d3e1aaaf..499ea4fa 100644 --- a/web/frontend/src/plots/Histogram.svelte +++ b/web/frontend/src/plots/Histogram.svelte @@ -11,6 +11,7 @@ import { Card } from 'sveltestrap' export let data + export let usesBins = false export let width = 500 export let height = 300 export let title = '' @@ -160,6 +161,14 @@ series: [ { label: xunit !== '' ? xunit : null, + value: (u, ts, sidx, didx) => { + if (usesBins) { + const min = u.data[sidx][didx - 1] ? u.data[sidx][didx - 1] : 0 + const max = u.data[sidx][didx] + ts = min + ' - ' + max // narrow spaces + } + return ts + } }, Object.assign({ label: yunit !== '' ? yunit : null, diff --git a/web/frontend/src/utils.js b/web/frontend/src/utils.js index 794a23a3..53462086 100644 --- a/web/frontend/src/utils.js +++ b/web/frontend/src/utils.js @@ -318,12 +318,6 @@ export function checkMetricDisabled(m, c, s) { //[m]etric, [c]luster, [s]ubclust export function convert2uplot(canvasData) { // Prep: Uplot Data Structure let uplotData = [[],[]] // [X, Y1, Y2, ...] 
- // MetricHisto Only: Check if 1st bin not-null -> Set 0-Value bin for scaling - // Else: Only Single 0-Value bin returned -> No reset required - if (canvasData[0]?.bin) { - uplotData[0].push(0) - uplotData[1].push(0) - } // Iterate canvasData.forEach( cd => { if (Object.keys(cd).length == 4) { // MetricHisto Dataformat From 6818d1de62d3f289a16a64a97f957a238bceb82a Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Wed, 7 Feb 2024 13:26:13 +0100 Subject: [PATCH 42/47] Resolve pull request comments --- tools/archive-migration/job.go | 8 +- web/frontend/src/JobFootprint.svelte | 187 +++++++++------------------ 2 files changed, 64 insertions(+), 131 deletions(-) diff --git a/tools/archive-migration/job.go b/tools/archive-migration/job.go index 0dff4b42..cd54d6cc 100644 --- a/tools/archive-migration/job.go +++ b/tools/archive-migration/job.go @@ -52,10 +52,10 @@ type Job struct { BaseJob StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` // Start epoch time stamp in seconds StartTime time.Time `json:"startTime"` // Start time as 'time.Time' data type - MemUsedMax float64 `json:"memUsedMax" db:"mem_used_max"` // MemUsedMax as Float64 - FlopsAnyAvg float64 `json:"flopsAnyAvg" db:"flops_any_avg"` // FlopsAnyAvg as Float64 - MemBwAvg float64 `json:"memBwAvg" db:"mem_bw_avg"` // MemBwAvg as Float64 - LoadAvg float64 `json:"loadAvg" db:"load_avg"` // LoadAvg as Float64 + MemUsedMax float64 `json:"-" db:"mem_used_max"` // MemUsedMax as Float64 + FlopsAnyAvg float64 `json:"-" db:"flops_any_avg"` // FlopsAnyAvg as Float64 + MemBwAvg float64 `json:"-" db:"mem_bw_avg"` // MemBwAvg as Float64 + LoadAvg float64 `json:"-" db:"load_avg"` // LoadAvg as Float64 NetBwAvg float64 `json:"-" db:"net_bw_avg"` // NetBwAvg as Float64 NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"` // NetDataVolTotal as Float64 FileBwAvg float64 `json:"-" db:"file_bw_avg"` // FileBwAvg as Float64 diff --git a/web/frontend/src/JobFootprint.svelte b/web/frontend/src/JobFootprint.svelte index 20b03d69..04b03a37 100644 --- a/web/frontend/src/JobFootprint.svelte +++ b/web/frontend/src/JobFootprint.svelte @@ -50,137 +50,78 @@ // Unit const fmc = getContext('metrics')(job.cluster, fm) - let unit = null - if (fmc?.unit?.base) { - unit = fmc.unit.prefix + fmc.unit.base - } else { - unit = '' - } + let unit = '' + if (fmc?.unit?.base) unit = fmc.unit.prefix + fmc.unit.base // Threshold / -Differences const fmt = findJobThresholds(job, fmc, subclusterConfig) - const levelPeak = fm === 'flops_any' ? round((fmt.peak * 0.85), 0) - mv : fmt.peak - mv // Scale flops_any down - const levelNormal = fmt.normal - mv - const levelCaution = fmt.caution - mv - const levelAlert = fmt.alert - mv + if (fm === 'flops_any') fmt.peak = round((fmt.peak * 0.85), 0) // Define basic data const fmBase = { name: fm, unit: unit, avg: mv, max: fm === 'flops_any' ? 
round((fmt.peak * 0.85), 0) : fmt.peak + max: fmt.peak } - // Collect - if (fm !== 'mem_used') { // Alert if usage is low, peak as maxmimum possible (scaled down for flops_any) - if (levelAlert > 0) { - return { - ...fmBase, - color: 'danger', - message: 'Metric strongly below common levels!', - impact: 3 - } - } else if (levelCaution > 0) { - return { - ...fmBase, - color: 'warning', - message: 'Metric below common levels', - impact: 2 - } - } else if (levelNormal > 0) { - return { - ...fmBase, - color: 'success', - message: 'Metric within common levels', - impact: 1 - } - } else if (levelPeak > 0) { - return { - ...fmBase, - color: 'info', - message: 'Metric performs better than common levels', - impact: 0 - } - } else { // Possible artifacts - <5% Margin OK, >5% warning, > 50% danger - if (fmBase.avg >= (1.5 * fmBase.max)) { - return { - ...fmBase, - color: 'secondary', - message: 'Metric average at least 50% above common peak value: Check data for artifacts!', - impact: -2 - } - } else if (fmBase.avg >= (1.05 * fmBase.max)) { - return { - ...fmBase, - color: 'secondary', - message: 'Metric average at least 5% above common peak value: Check data for artifacts', - impact: -1 - } - } else { - return { - ...fmBase, - color: 'info', - message: 'Metric performs better than common levels', - impact: 0 - } - } + if (evalFootprint(fm, mv, fmt, 'alert')) { + return { + ...fmBase, + color: 'danger', + message:`Metric average way ${fm === 'mem_used' ? 'above' : 'below' } expected normal thresholds.`, + impact: 3 } - } else { // Inverse Logic: Alert if usage is high, Peak is bad and limits execution - if (levelPeak <= 0 && levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { // Possible artifacts - <5% Margin OK, >5% warning, > 50% danger - if (fmBase.avg >= (1.5 * fmBase.max)) { - return { - ...fmBase, - color: 'secondary', - message: 'Memory usage at least 50% above possible maximum value: Check data for artifacts!', - impact: -2 - } - } else if (fmBase.avg >= (1.05 * fmBase.max)) { - return { - ...fmBase, - color: 'secondary', - message: 'Memory usage at least 5% above possible maximum value: Check data for artifacts!', - impact: -1 - } - } else { - return { - ...fmBase, - color: 'danger', - message: 'Memory usage extremely above common levels!', - impact: 4 - } - } - } else if (levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { - return { - ...fmBase, - color: 'danger', - message: 'Memory usage extremely above common levels!', - impact: 4 - } - } else if (levelAlert > 0 && (levelCaution <= 0 && levelNormal <= 0)) { - return { - ...fmBase, - color: 'danger', - message: 'Memory usage strongly above common levels!', - impact: 3 - } - } else if (levelCaution > 0 && levelNormal <= 0) { - return { - ...fmBase, - color: 'warning', - message: 'Memory usage above common levels', - impact: 2 - } - } else { - return { - ...fmBase, - color: 'success', - message: 'Memory usage within common levels', - impact: 1 - } + } else if (evalFootprint(fm, mv, fmt, 'caution')) { + return { + ...fmBase, + color: 'warning', + message: `Metric average ${fm === 'mem_used' ? 
'above' : 'below' } expected normal thresholds.`, + impact: 2 + } + } else if (evalFootprint(fm, mv, fmt, 'normal')) { + return { + ...fmBase, + color: 'success', + message: 'Metric average within expected thresholds.', + impact: 1 + } + } else if (evalFootprint(fm, mv, fmt, 'peak')) { + return { + ...fmBase, + color: 'info', + message: 'Metric average above expected normal thresholds: Check for artifacts recommended.', + impact: 0 + } + } else { + return { + ...fmBase, + color: 'secondary', + message: 'Metric average above expected peak threshold: Check for artifacts!', + impact: -1 } } }) + + function evalFootprint(metric, mean, thresholds, level) { + // mem_used has inverse logic regarding threshold levels + switch (level) { + case 'peak': + if (metric === 'mem_used') return (mean <= thresholds.peak && mean > thresholds.alert) + else return (mean <= thresholds.peak && mean > thresholds.normal) + case 'alert': + if (metric === 'mem_used') return (mean <= thresholds.alert && mean > thresholds.caution) + else return (mean <= thresholds.alert && mean > 0) + case 'caution': + if (metric === 'mem_used') return (mean <= thresholds.caution && mean > thresholds.normal) + else return (mean <= thresholds.caution && mean > thresholds.alert) + case 'normal': + if (metric === 'mem_used') return (mean <= thresholds.normal && mean > 0) + else return (mean <= thresholds.normal && mean > thresholds.caution) + default: + return false + } + }
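
Notes on the series, kept separate from the patches themselves. The convert2uplot rework (patches 36 and 41 above) distinguishes its two input shapes by counting object keys. The following JavaScript sketch shows both shapes and the arrays the function should produce; the sample records are made up for illustration:

// Illustrative inputs for convert2uplot (not part of the patches).
// A MetricHistoPoint carries exactly four keys (bin, count, min, max),
// so the Object.keys(cd).length == 4 test routes it to the max/count
// branch; every other record is read as a { value, count } pair.
const histoPoints = [
    { bin: 1, count: 0, min: 0, max: 10 },
    { bin: 2, count: 3, min: 10, max: 20 },
]
const genericPoints = [
    { value: 60, count: 5 },
    { value: 120, count: 2 },
]
// convert2uplot(histoPoints) -> [[10, 20], [0, 3]] (X holds bin maxima)
// convert2uplot(genericPoints) -> [[60, 120], [5, 2]] (X holds values)

Keying on the number of properties is compact, but it would silently misroute histogram points if the GraphQL selection data { min, max, count, bin } ever gained or lost a field. Storing only bin maxima on the X axis is also why the usesBins legend formatter in Histogram.svelte reads u.data[sidx][didx - 1], falling back to 0, to recover a bin's lower bound.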
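
Patch 40's runningJobsMetricStatisticsHistogram bins job averages in Go instead of SQLite: the metric peak is split into ten equal-width bins and each average is counted into the bin whose rounded half-open range contains it. A runnable JavaScript sketch of that scheme; binAverages and the sample values are illustrative only, and a positive peak is assumed:

// Mirrors the Go binning: bin b covers [round(w*b), round(w*(b+1))),
// where w = peak / bins. Averages exactly equal to the peak fall
// outside the last half-open bin and are left uncounted.
function binAverages(avgs, peak, bins = 10) {
    const w = peak / bins
    return Array.from({ length: bins }, (_, b) => {
        const min = Math.round(w * b)
        const max = Math.round(w * (b + 1))
        return { bin: b + 1, min, max, count: avgs.filter((v) => v >= min && v < max).length }
    })
}

// Hypothetical job averages against a peak of 100:
console.log(binAverages([12.3, 47.0, 48.9, 99.4], 100).map((p) => p.count))
// -> [0, 1, 0, 0, 2, 0, 0, 0, 0, 1]

The 500-job cap and the per-cluster peak lookup above bound the cost of this path, since every job's averages have to be loaded through metricdata.LoadAverages before any counting can happen.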
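
The evalFootprint rewrite in patch 42 checks the named thresholds in the fixed order alert, caution, normal, peak, with mem_used inverted because high memory usage is the problem rather than low. Ignoring the mean > 0 guard, the colour assignment collapses into ascending bands; footprintColor below is an illustrative sketch, not the component's API, and it assumes thresholds ordered alert < caution < normal < peak for regular metrics and normal < caution < alert < peak for mem_used:

// Band view of the same logic: return the colour of the first band
// whose upper limit still holds; anything above peak is suspect data.
function footprintColor(metric, mean, t) {
    const bands = metric === 'mem_used'
        ? [[t.normal, 'success'], [t.caution, 'warning'], [t.alert, 'danger'], [t.peak, 'info']]
        : [[t.alert, 'danger'], [t.caution, 'warning'], [t.normal, 'success'], [t.peak, 'info']]
    for (const [limit, color] of bands) {
        if (mean <= limit) return color
    }
    return 'secondary' // above peak: check the data for artifacts
}

// Hypothetical thresholds for a regular metric and for mem_used:
const flops = { alert: 10, caution: 30, normal: 60, peak: 100 }
const mem = { normal: 40, caution: 60, alert: 80, peak: 100 }
console.log(footprintColor('flops_any', 45, flops)) // -> 'success'
console.log(footprintColor('mem_used', 45, mem)) // -> 'warning'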