Skip to content

Commit

Permalink
Improved metrics widget code and templates
Browse files Browse the repository at this point in the history
  • Loading branch information
abujeda committed Nov 4, 2024
1 parent 5ecec75 commit 7d5ced8
Show file tree
Hide file tree
Showing 14 changed files with 249 additions and 188 deletions.
3 changes: 3 additions & 0 deletions config/local/.env
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ [email protected]
OOD_DASHBOARD_SUPPORT_URL=[email protected]
OOD_APP_CATALOG_URL=https://link.to.online/app/catalog
ENABLE_NATIVE_VNC=true
OOD_ANNOUNCEMENT_PATH=/var/www/ood/public/announcements
OOD_CONFIG_D_DIRECTORY_BAK=/etc/ood/config
OOD_LOAD_EXTERNAL_CONFIG=true
OOD_APP_CONFIG_ROOT=/etc/ood/config/apps/ood
OOD_DATAROOT=~/ondemand/data/sys/dashboard
OOD_XDMOD_HOST=https://localhost:4443
OOD_XDMOD_HOST_BAK=https://localhost:33000
55 changes: 0 additions & 55 deletions config/local/app_overrides/lib/slurm_metrics/metrics_cache.rb

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@ module SlurmMetrics
# Based on: https://github.com/fasrc/puppet-slurm_stats
class MetricsProcessor

def calculate_metrics(user_metrics)
def calculate_metrics(from, to, user_metrics)
metrics_summary = SlurmMetrics::MetricsSummary.new
metrics_summary.from = from
metrics_summary.to = to

user_max_rss = 0.0
# REVERSE METRICS TO PROCESS FIRST THE JOB STEPS AND THEN THE MAIN JOB
user_metrics.reverse_each do |metric_hash|
Expand Down
79 changes: 79 additions & 0 deletions config/local/app_overrides/lib/slurm_metrics/metrics_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# frozen_string_literal: true

module SlurmMetrics
  # Service that manages retrieving and storing the user's Slurm job metrics and
  # fairshare data within the user settings store.
  # Each dataset is cached for 24 hours; once the cached entry is older than that
  # (or is missing), the data is requested again from the Slurm job adapter.
  class MetricsService
    include UserSettingStore

    # Time window covered by the job metrics report.
    METRICS_PERIOD = 7.days
    # Maximum age, in seconds, of a cached entry before it is refreshed (24 hours).
    CACHE_TTL_SECONDS = 24 * 60 * 60
    # Format used for the timestamp stored next to each cached entry.
    TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S'

    attr_reader :cluster

    # Selects the first configured cluster that reports itself as Slurm.
    # NOTE(review): @cluster is nil when no Slurm cluster is configured, in which
    # case the refresh methods will fail when calling the job adapter.
    def initialize
      @cluster = Configuration.job_clusters.select(&:slurm?).first
    end

    # Reads the job metrics from the user settings, refreshing them first when
    # the cached entry is missing or expired.
    #
    # @return [SlurmMetrics::MetricsSummary] summary built from the stored metrics hash
    def read_metrics
      slurm_metrics = user_settings.fetch(:slurm_metrics, {})
      slurm_metrics = refresh_metrics if expired?(slurm_metrics[:timestamp])

      SlurmMetrics::MetricsSummary.new(slurm_metrics[:metrics])
    end

    # Reads the fairshare data from the user settings, refreshing it first when
    # the cached entry is missing or expired.
    #
    # @return [OpenStruct] with +from+ (Time the data was retrieved) and +data+
    #   (the value returned by the job adapter)
    def read_fairshare
      slurm_fairshare = user_settings.fetch(:slurm_fairshare, {})
      slurm_fairshare = refresh_fairshare if expired?(slurm_fairshare[:timestamp])
      fairshare = slurm_fairshare[:fairshare]

      # Build a new hash rather than assigning into the stored one: the stored
      # entry keeps :from as an Integer epoch, so a Time object never ends up in
      # the settings hash that may be cached or persisted later.
      OpenStruct.new(fairshare.merge(from: Time.at(fairshare[:from])))
    end

    private

    # Requests the job data for the last METRICS_PERIOD from the job adapter,
    # computes the summary and stores it in the user settings.
    #
    # @return [Hash] the stored entry (+:timestamp+ and +:metrics+)
    def refresh_metrics
      # Sample the clock once so both ends of the window derive from the same instant.
      to = Time.now
      from = to - METRICS_PERIOD
      # The query is widened to whole days: start of the first day to end of the last.
      job_data = cluster.job_adapter.metrics(from: from.strftime('%Y-%m-%dT00:00:00'), to: to.strftime('%Y-%m-%dT23:59:59'))
      processor = SlurmMetrics::MetricsProcessor.new
      metrics_summary = processor.calculate_metrics(from, to, job_data)

      set_metrics(metrics_summary)
    end

    # Requests the fairshare data from the job adapter and stores it in the
    # user settings together with the retrieval time as an Integer epoch.
    #
    # @return [Hash] the stored entry (+:timestamp+ and +:fairshare+)
    def refresh_fairshare
      fair_share = {
        from: Time.now.to_i,
        data: cluster.job_adapter.fairshare
      }
      set_fairshare(fair_share)
    end

    # Stores the metrics summary (as a hash) with the current timestamp.
    #
    # @param metrics [#to_hash] the metrics summary to store
    # @return [Hash] the stored entry
    def set_metrics(metrics)
      slurm_metrics = {
        timestamp: Time.now.strftime(TIMESTAMP_FORMAT),
        metrics: metrics.to_hash
      }
      update_user_settings({ slurm_metrics: slurm_metrics })
      slurm_metrics
    end

    # Stores the fairshare data with the current timestamp.
    #
    # @param fairshare [Hash] the fairshare entry to store
    # @return [Hash] the stored entry
    def set_fairshare(fairshare)
      slurm_fairshare = {
        timestamp: Time.now.strftime(TIMESTAMP_FORMAT),
        fairshare: fairshare
      }
      update_user_settings({ slurm_fairshare: slurm_fairshare })
      slurm_fairshare
    end

    # Tells whether a cached entry must be refreshed.
    #
    # @param date_string [String, nil] timestamp stored with the cached entry
    # @return [Boolean] true when the timestamp is blank, cannot be parsed, or is
    #   older than CACHE_TTL_SECONDS
    def expired?(date_string)
      return true if date_string.blank?

      Time.now - Time.parse(date_string) > CACHE_TTL_SECONDS
    rescue ArgumentError, TypeError
      # A corrupt or non-string timestamp is treated as expired so the cache is
      # rebuilt instead of raising while rendering the widget.
      true
    end
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,19 @@
module SlurmMetrics
# Class that holds all the metrics data
class MetricsSummary
attr_accessor :from, :to
attr_accessor :num_jobs, :num_jgpu
attr_accessor :tot_cpu_walltime, :tot_gpu_hours, :ave_cpu_use, :ave_cpu_req, :ave_cpu_eff, :ave_gpu_req
attr_accessor :tot_mem_use, :ave_mem_use, :ave_mem_req, :ave_mem_eff
attr_accessor :tot_time_use, :ave_time_use, :ave_time_req, :ave_time_eff, :ave_wait_time
attr_accessor :ntotal_cpu, :nca_cpu, :ncd_cpu, :nf_cpu, :noom_cpu, :nto_cpu, :ntotal_gpu, :nca_gpu, :ncd_gpu, :nf_gpu, :noom_gpu, :nto_gpu

def initialize(data = {})
@from = data.fetch(:from, nil)
@to = data.fetch(:to, nil)
@from = Time.at(@from) unless @from.blank?
@to = Time.at(@to) unless @to.blank?

@num_jobs = data.fetch(:num_jobs, 0)
@num_jgpu = data.fetch(:num_jgpu, 0)

Expand Down Expand Up @@ -55,6 +61,8 @@ def to_hash
instance_variables.each do |var|
hash[var.to_s.delete('@').to_sym] = instance_variable_get(var)
end
hash[:from] = hash[:from].to_i
hash[:to] = hash[:to].to_i
hash
end
end
Expand Down
Original file line number Diff line number Diff line change
@@ -1,43 +1,44 @@
<%
metrics_cache = SlurmMetrics::MetricsCache.new
timestamp, fairshare = metrics_cache.get_fairshare
if true || fairshare.blank?
cluster = Configuration.job_clusters.select(&:slurm?).first
fairshare = cluster.job_adapter.fairshare
timestamp = Time.now
metrics_cache.set_fairshare(timestamp, fairshare)
end
metrics_service = SlurmMetrics::MetricsService.new
slurm_faishare = metrics_service.read_fairshare
timestamp = slurm_faishare.from
fairshare = slurm_faishare.data

metrics_helper = SlurmMetrics::MetricsHelper.new
%>
<div class="card mt-3">
<div class="card-header" title="Fairshare calculated at <%= timestamp.strftime('%Y-%m-%dT%H:%M:%S') %>">
<small class="float-end float-right"><%= timestamp.strftime('%Y-%m-%d') %></small>
<h3>Fairshare</h3>
</div>
<div class="card-body">
<% unless fairshare.empty? %>
<table class="table table-sm table-striped table-condensed metrics">
<thead>
<tr>
<th>Account <span class="float-end float-right">Fairshare</span></th>
</tr>
</thead>
<tbody>
<% fairshare.each do |fairshare_data| %>
<div class="metrics-widget-component">
<div class="card mt-3">
<div class="card-header" title="Fairshare calculated at <%= timestamp.strftime('%Y-%m-%dT%H:%M:%S') %>">
<small class="float-end float-right"><%= timestamp.strftime('%Y-%m-%d') %></small>
<h3>Fairshare</h3>
</div>
<div class="card-body">
<% unless fairshare.blank? %>
<table class="table table-sm table-striped table-condensed metrics">
<thead>
<tr>
<td>
<%= fairshare_data[:account] %>
<span class="badge <%= metrics_helper.fairshare_class(fairshare_data[:fairshare]) %> float-end float-right"><%= fairshare_data[:fairshare].to_s.to_f.round(4) %></span>
</td>
<th>Account <span class="float-end float-right">Fairshare</span></th>
</tr>
<% end %>
</tbody>
</table>
<% else %>
<div class="no-fairshare-data">
No fairshare data available for user <span><%= @user.name %></span>
</div>
<% end %>
</thead>
<tbody>
<% fairshare.each do |fairshare_data| %>
<tr>
<td title="<%= fairshare_data[:fairshare].to_s.to_f.round(4) %> fairshare for account <%= fairshare_data[:account] %>">
<%= fairshare_data[:account] %>
<span class="badge <%= metrics_helper.fairshare_class(fairshare_data[:fairshare]) %> float-end float-right"><%= fairshare_data[:fairshare].to_s.to_f.round(4) %></span>
</td>
</tr>
<% end %>
</tbody>
</table>
<% else %>
<p class="no-fairshare-data">
No fairshare data available for user <span><%= @user.name %></span>
</p>
<% end %>
<small class="float-end float-right metrics-note" title="FASRC information on how fairshare is calculated and it affects your jobs.">
<a target="_blank" href="https://docs.rc.fas.harvard.edu/kb/fairshare/">FASRC Fairshare info</a>
</small>
</div>
</div>
</div>
Original file line number Diff line number Diff line change
Expand Up @@ -28,41 +28,43 @@
metrics_helper.metrics_ceil(total, failed),
]
%>
<div class="card mt-3">
<div class="card-header">
<small class="float-end float-right">Total Jobs: <%= total %></small>
<h3><%= title %></small></h3>
<% if subtitle %>
<small><%= subtitle %></small>
<% end %>
</div>
<div class="card-body">
<table class="table table-sm table-striped table-condensed metrics">
<thead>
<tr>
<th title="Number of jobs that completed within the time allocated">Completed</th>
<th title="Number of jobs that timeout">Timed Out</th>
<th title="Number of jobs that were canceled">Canceled</th>
<th title="Number of jobs that run out of memory">OOM</th>
<th title="Number of jobs that failed during execution">Failed</th>
</tr>
</thead>
<tbody>
<tr>
<td title="Number of jobs that completed within the time allocated"><%= completed %></td>
<td title="Number of jobs that timeout"><%= timeout %></td>
<td title="Number of jobs that were canceled"><%= canceled %></td>
<td title="Number of jobs that run out of memory"><%= memory %></td>
<td title="Number of jobs that failed during execution"><%= failed %></td>
</tr>
</tbody>
</table>
<div class="progress metrics">
<div class="progress-bar metrics-completed" role="progressbar" style="width: <%= bar_values[0] %>%" aria-valuenow="<%= bar_values[0] %>" aria-valuemin="0" aria-valuemax="100" title="Jobs completed <%= percentages[0] %>%"><%= percentages[0] %>%</div>
<div class="progress-bar metrics-timeout" role="progressbar" style="width: <%= bar_values[1] %>%" aria-valuenow="<%= bar_values[1] %>" aria-valuemin="0" aria-valuemax="100" title="Jobs timeout <%= percentages[1] %>%"><%=percentages[1] %>%</div>
<div class="progress-bar metrics-canceled" role="progressbar" style="width: <%= bar_values[2] %>%" aria-valuenow="<%= bar_values[2] %>" aria-valuemin="0" aria-valuemax="100" title="Jobs canceled <%= percentages[2] %>%"><%= percentages[2] %>%</div>
<div class="progress-bar metrics-memory" role="progressbar" style="width: <%= bar_values[3] %>%" aria-valuenow="<%= bar_values[3 ] %>" aria-valuemin="0" aria-valuemax="100" title="Jobs out of memory <%= percentages[3] %>%"><%= percentages[3] %>%</div>
<div class="progress-bar metrics-failed" role="progressbar" style="width: <%= bar_values[4] %>%" aria-valuenow="<%= bar_values[4] %>" aria-valuemin="0" aria-valuemax="100" title="Jobs failed <%= percentages[4] %>%"><%= percentages[4] %>%</div>
<div class="metrics-widget-component">
<div class="card mt-3">
<div class="card-header">
<small class="float-end float-right">Total Jobs: <%= total %></small>
<h3><%= title %></h3>
<% if subtitle %>
<small><%= subtitle %></small>
<% end %>
</div>
<div class="card-body">
<table class="table table-sm table-striped table-condensed metrics">
<thead>
<tr>
<th title="Number of jobs that completed within the time allocated">Completed</th>
<th title="Number of jobs that timeout">Timed Out</th>
<th title="Number of jobs that were canceled">Canceled</th>
<th title="Number of jobs that run out of memory">OOM</th>
<th title="Number of jobs that failed during execution">Failed</th>
</tr>
</thead>
<tbody>
<tr>
<td title="Number of jobs that completed within the time allocated"><%= completed %></td>
<td title="Number of jobs that timeout"><%= timeout %></td>
<td title="Number of jobs that were canceled"><%= canceled %></td>
<td title="Number of jobs that run out of memory"><%= memory %></td>
<td title="Number of jobs that failed during execution"><%= failed %></td>
</tr>
</tbody>
</table>
<div class="progress metrics">
<div class="progress-bar metrics-completed" role="progressbar" style="width: <%= bar_values[0] %>%" aria-valuenow="<%= bar_values[0] %>" aria-valuemin="0" aria-valuemax="100" title="Jobs completed <%= percentages[0] %>%"><%= percentages[0] %>%</div>
<div class="progress-bar metrics-timeout" role="progressbar" style="width: <%= bar_values[1] %>%" aria-valuenow="<%= bar_values[1] %>" aria-valuemin="0" aria-valuemax="100" title="Jobs timeout <%= percentages[1] %>%"><%=percentages[1] %>%</div>
<div class="progress-bar metrics-canceled" role="progressbar" style="width: <%= bar_values[2] %>%" aria-valuenow="<%= bar_values[2] %>" aria-valuemin="0" aria-valuemax="100" title="Jobs canceled <%= percentages[2] %>%"><%= percentages[2] %>%</div>
<div class="progress-bar metrics-memory" role="progressbar" style="width: <%= bar_values[3] %>%" aria-valuenow="<%= bar_values[3 ] %>" aria-valuemin="0" aria-valuemax="100" title="Jobs out of memory <%= percentages[3] %>%"><%= percentages[3] %>%</div>
<div class="progress-bar metrics-failed" role="progressbar" style="width: <%= bar_values[4] %>%" aria-valuenow="<%= bar_values[4] %>" aria-valuemin="0" aria-valuemax="100" title="Jobs failed <%= percentages[4] %>%"><%= percentages[4] %>%</div>
</div>
</div>
</div>
</div>
Loading

0 comments on commit 7d5ced8

Please sign in to comment.