From 51fdb1224ee6737a238dba8bf549457175523b6e Mon Sep 17 00:00:00 2001
From: Aday Bujeda
Date: Wed, 13 Nov 2024 19:24:51 +0000
Subject: [PATCH] Added metrics information to session card

---
 Makefile                                      |   1 +
 .../lib/slurm_metrics/metrics_service.rb      |  51 ++++++++
 .../sessions/card/_card_body.html.erb         |  13 ++
 .../card/_session_job_metrics.html.erb        | 117 ++++++++++++++++++
 4 files changed, 182 insertions(+)
 create mode 100644 dev/metrics/views/batch_connect/sessions/card/_card_body.html.erb
 create mode 100644 dev/metrics/views/batch_connect/sessions/card/_session_job_metrics.html.erb

diff --git a/Makefile b/Makefile
index 0bbe513f..f594675c 100644
--- a/Makefile
+++ b/Makefile
@@ -33,6 +33,7 @@ clean:
 	rm -rf ./ondemand/apps/dashboard/node_modules
 	rm -rf ./ondemand/apps/dashboard/vendor/bundle
 	rm -rf ./ondemand/apps/dashboard/app_overrides
+	rm -rf ./ondemand/apps/dashboard/plugins
 	rm -rf ./ondemand/apps/dashboard/.env*
 
 build_latest_ood:
diff --git a/dev/metrics/lib/slurm_metrics/metrics_service.rb b/dev/metrics/lib/slurm_metrics/metrics_service.rb
index 172b93df..338bc449 100644
--- a/dev/metrics/lib/slurm_metrics/metrics_service.rb
+++ b/dev/metrics/lib/slurm_metrics/metrics_service.rb
@@ -20,6 +20,29 @@ def read_metrics
       SlurmMetrics::MetricsSummary.new(slurm_metrics[:metrics])
     end
 
+    # Returns the metrics summary for the job behind the given session.
+    #
+    # Summaries are cached as YAML at job_metrics_path(session.id). A missing,
+    # unreadable or unparsable cache file triggers refresh_job_metrics, so the
+    # caller never receives a bare Hash in place of a summary object.
+    #
+    # @param session [#id, #job_id] the batch connect session
+    # @return [Object] a MetricsSummary, or whatever refresh_job_metrics returns
+    def read_job_metrics(session)
+      file_path = job_metrics_path(session.id)
+      return refresh_job_metrics(session) unless file_path.exist?
+
+      begin
+        yml = YAML.safe_load(file_path.read) || {}
+        SlurmMetrics::MetricsSummary.new(yml.symbolize_keys)
+      rescue StandardError => e
+        Rails.logger.error("Can't read or parse job metrics: #{file_path} because of error #{e}")
+        # Fall back to recomputing: the session card calls summary methods on
+        # the result, which an empty Hash does not provide.
+        refresh_job_metrics(session)
+      end
+    end
+
     def read_fairshare
       slurm_fairshare = user_settings.fetch(:slurm_fairshare, {})
       slurm_fairshare = refresh_fairshare if expired?(slurm_fairshare[:timestamp])
@@ -41,6 +64,23 @@ def refresh_metrics
       set_metrics(metrics_summary)
     end
 
+    # Recomputes the metrics for the session's job and caches them as YAML.
+    #
+    # @param session [#id, #job_id] the batch connect session
+    # @return [Object] the result of MetricsProcessor#calculate_metrics
+    def refresh_job_metrics(session)
+      job_data = cluster.job_adapter.metrics(job_ids: [session.job_id])
+      Rails.logger.debug { "Job metrics data for session #{session.id}: #{job_data}" }
+      # One timestamp for both time arguments, so they cannot differ by the
+      # time elapsed between two Time.now calls.
+      now = Time.now
+      processor = SlurmMetrics::MetricsProcessor.new
+      job_metrics = processor.calculate_metrics(now, now, job_data, ignore_cancelled: false)
+      Rails.logger.debug { "Job metrics for session #{session.id}: #{job_metrics.to_hash}" }
+      job_metrics_path(session.id).write(job_metrics.to_hash.stringify_keys.to_yaml)
+      job_metrics
+    end
+
     def refresh_fairshare
       data = cluster.job_adapter.fairshare
       fair_share = {
@@ -75,5 +115,16 @@ def expired?(date_string)
       Time.now - Time.parse(date_string) > 24 * 60 * 60
     end
 
+    # Location of the cached metrics file for a session.
+    # Creates the metrics directory under the session data root when needed.
+    #
+    # @param session_id [String] the batch connect session id
+    # @return [Pathname] presumably; the result of joining onto Session.dataroot
+    def job_metrics_path(session_id)
+      metrics_dir = BatchConnect::Session.dataroot.join('metrics')
+      metrics_dir.mkpath # no-op when the directory already exists
+      metrics_dir.join(session_id)
+    end
+
   end
 end
diff --git a/dev/metrics/views/batch_connect/sessions/card/_card_body.html.erb b/dev/metrics/views/batch_connect/sessions/card/_card_body.html.erb
new file mode 100644
index 00000000..6ef2c8f9
--- /dev/null
+++ b/dev/metrics/views/batch_connect/sessions/card/_card_body.html.erb
@@ -0,0 +1,13 @@
+
+
+
<%= cancel_or_delete(session) %>
+ <%= render_card_partial('host', session) %> + <%= render_card_partial('created', session) %> + <%= render_card_partial('session_time', session) %> + <%= render_card_partial('id', session) %> + <%= render_card_partial('support_ticket', session) if Configuration.support_ticket_enabled? %> + <%= render_card_partial('display_choices', session) %> + <%= render_card_partial('session_job_metrics', session) if session.completed?%> + <%= render_connection(session) %> +
+
diff --git a/dev/metrics/views/batch_connect/sessions/card/_session_job_metrics.html.erb b/dev/metrics/views/batch_connect/sessions/card/_session_job_metrics.html.erb new file mode 100644 index 00000000..895a0d88 --- /dev/null +++ b/dev/metrics/views/batch_connect/sessions/card/_session_job_metrics.html.erb @@ -0,0 +1,117 @@ +<% + + metrics_service = SlurmMetrics::MetricsService.new + job_metrics = metrics_service.read_job_metrics(session) + metrics_helper = SlurmMetrics::MetricsHelper.new + metric_details_id = "#{session.id}_metric" +%> + + +
+
+ + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
MetricsEfficiencyAllocatedUsedOther
CPU<%= "#{job_metrics.ave_cpu_eff.ceil(2)}%" %><%= "#{job_metrics.ave_cpu_req.ceil(2)} CPU Hrs" %><%= "#{job_metrics.ave_cpu_use.ceil(2)} CPU Hrs" %>Walltime <%= "#{job_metrics.tot_cpu_walltime.ceil(2)} CPU Hrs" %>
Memory<%= "#{job_metrics.ave_mem_eff.ceil(2)}%" %><%= "#{job_metrics.ave_mem_req.ceil(2)}G" %><%= "#{job_metrics.ave_mem_use.ceil(2)}G" %>N/A
Time<%= "#{job_metrics.ave_time_eff.ceil(2)}%" %><%= metrics_helper.format_duration(job_metrics.ave_time_req) %><%= metrics_helper.format_duration(job_metrics.ave_time_use) %>Waiting Time <%= metrics_helper.format_duration(job_metrics.ave_wait_time) %>
GPUN/A<%= "#{job_metrics.ave_gpu_req.ceil(2)} GPU Hrs" %>N/AWalltime <%= "#{job_metrics.tot_gpu_hours.ceil(2)} GPU Hrs" %>
+ +
+
+ +
\ No newline at end of file