Merge pull request JeffersonLab#181 from JeffersonLab/fit_benchmark
Scripts for testing AmpTools fit speed with many CPUs or GPUs
jrstevenjlab authored Jan 24, 2022
2 parents 36dcdc6 + b4b26d3 commit f4a652f
Showing 4 changed files with 287 additions and 0 deletions.
14 changes: 14 additions & 0 deletions PWA_scripts/benchmark/README.md
@@ -0,0 +1,14 @@
# Introduction
This directory contains scripts for submitting batch jobs with MPI (and optionally GPUs). They are meant to determine how your fit speed improves (or not) as you add additional resources.
* submit.py -- submits MPI jobs with various numbers of CPU cores
* submitGPU.py -- submits MPI+GPU jobs with various numbers of GPUs
* plot_benchmark.C -- plots fit speed for the benchmark jobs

# Required user modifications
* In submit.py and submitGPU.py, replace the MyEnv, MyConfig and MyOutDir variables with your own environment setup script, AmpTools fit configuration and output directory location (see the sketch after this list)
* You can change the MyCPUs or MyGPUs list to test your benchmark with more or fewer CPU cores or GPUs
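
A minimal sketch of the user settings near the top of submit.py (the paths below are placeholders, not real locations):

```python
# Placeholder values -- point these at your own setup script, fit configuration and output area
MyEnv    = "/work/halld2/home/<user>/builds/setup_gluex.csh"   # environment setup script
MyConfig = "/work/halld2/home/<user>/benchmark/benchmark.cfg"  # AmpTools fit configuration
MyOutDir = "/volatile/halld/home/<user>/benchmark/"            # output directory for job logs and results
MyCPUs   = [1, 2, 4, 8, 16, 32, 64]                            # CPU core counts to benchmark
```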

# Notes:
* These fits require the MPI-compiled versions of AmpTools and halld_sim; see https://halldweb.jlab.org/wiki/index.php/HOWTO_use_AmpTools_on_the_JLab_farm_with_MPI for more details
* Only run the GPU version of the fitter if your fit uses a GPU-accelerated amplitude and you have compiled that amplitude against the GPU (CUDA) libraries on one of the sciml nodes; see https://halldweb.jlab.org/wiki/index.php/HOWTO_use_AmpTools_on_the_JLab_farm_GPUs for more details
* Some of the default benchmarks request many CPUs or GPUs, and it may take some time for those resources to become available on the ifarm/sciml nodes, so be patient.
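
The fit speed that plot_benchmark.C reports is taken from the timing summaries the fitter writes to fit.out; a minimal Python sketch of the same conversion, assuming a log line of the form parsed by the macro:

```python
def call_rate_hz(log_line):
    """Convert 'average time per function call: <t> ms.' into a call rate in Hz."""
    ms = float(log_line.replace("average time per function call: ", "").replace(" ms.", ""))
    return 1.0 / (ms / 1000.0)

# e.g. an average call time of 12.5 ms corresponds to an 80 Hz likelihood call rate
print(call_rate_hz("average time per function call: 12.5 ms."))
```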
123 changes: 123 additions & 0 deletions PWA_scripts/benchmark/plot_benchmark.C
@@ -0,0 +1,123 @@
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cstdlib>
#include <cmath>

void plot_benchmark(TString dir = "./") {

gStyle->SetOptStat(0);

// initialize list of nCores to plot
vector<int> numThreadsCPU = {1,2,4,8,16,32,64,96,128};
int numTestCPU = numThreadsCPU.size();

// for GPU fits, only add if desired
vector<int> numThreadsGPUT4 = {1,2,3,4,6,8,10,12};
vector<int> numThreadsGPURTX = {};

// names of directories containing benchmark results
vector<TString> types = {"cpu"};
vector<TGraphErrors*> grBenchmarkScan;
if(numThreadsGPUT4.size() > 0) types.push_back("gpuT4");
if(numThreadsGPURTX.size() > 0) types.push_back("gpuTitanRTX");

TH1F *hBenchmarkScan = new TH1F("hBenchmarkScan","; Number of GPUs or CPUs; Fit speed (Likelihood function call rate [Hz])", 200, 0, 200);
double maxRate = 0;

// loop over different architecture types to plot results
for(int itype=0; itype<types.size(); itype++) {
vector<int> numThreads = numThreadsCPU;
if(types[itype] == "gpuT4") numThreads = numThreadsGPUT4;
if(types[itype] == "gpuTitanRTX") numThreads = numThreadsGPURTX;
grBenchmarkScan.push_back(new TGraphErrors(numThreads.size()));

// loop over number of threads in test
for(int ithread=0; ithread<numThreads.size(); ithread++) {

int nThreads = numThreads[ithread];
string spath = Form("%s/%s%03d/log/fit.out", dir.Data(), types[itype].Data(), nThreads);
//cout << spath << endl;

std::string read_line;
ifstream file(spath);
double parValue = 0;
double parAvg = 0;
vector<double> parSq;
int nValues = 0;
while (std::getline(file, read_line)) {

TString line = read_line;
if(line.Contains("average time per function call:")) {
line.ReplaceAll("average time per function call: ","");
line.ReplaceAll(" ms.","");
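// convert the average time per call (ms) into a likelihood call rate in Hz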
parValue = 1./(atof(line)/1000);
parAvg += parValue;
parSq.push_back(parValue*parValue);
nValues++;
}
else continue;

}

if(nValues > 0) {
parAvg /= float(nValues);
double parRms = 0;
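// accumulate (rate - mean)^2 using the expansion x^2 + mean^2 - 2*x*mean, where x = sqrt(parSq)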
for(uint ip=0; ip<parSq.size(); ip++)
parRms += (parSq.at(ip) + parAvg*parAvg - 2*sqrt(parSq.at(ip))*parAvg);
parRms /= float(nValues);
parRms = sqrt(parRms);
if(parAvg > maxRate) maxRate = parAvg;
//cout<<parAvg<<" "<<parRms<<endl;
if(parRms < 1e-9) parRms = 0.01;
grBenchmarkScan[itype]->SetPoint(ithread, nThreads, parAvg);
grBenchmarkScan[itype]->SetPointError(ithread, 0, parRms);
}
}
}

TCanvas *cc = new TCanvas("cc","cc",800,400);
auto legend = new TLegend(0.47,0.17,0.9,0.42);

hBenchmarkScan->SetMaximum(maxRate*2.5);
hBenchmarkScan->SetMinimum(0.1);
hBenchmarkScan->Draw();
vector<TF1*> fit;
for(int itype=0; itype<types.size(); itype++) {
grBenchmarkScan[itype]->SetMarkerStyle(20);
grBenchmarkScan[itype]->SetMarkerColor(kBlack+itype);
grBenchmarkScan[itype]->Draw("same pl");

// ideal linear-scaling reference: line through the origin with slope fit to the lowest-thread point(s)
fit.push_back(new TF1(types[itype],"pol1",1,200));
fit[itype]->FixParameter(0,0);
grBenchmarkScan[itype]->Fit(fit[itype],"N","",0.5,2);
fit[itype]->SetLineColor(kBlack+itype); fit[itype]->SetLineStyle(kDashed);
fit[itype]->Draw("same");

if(itype==0)
legend->AddEntry(grBenchmarkScan[0],"ifarm19 CPU (2 thread/core)","pl");
if(types[itype] == "gpuT4")
legend->AddEntry(grBenchmarkScan[itype],"sciml21 T4 GPU","pl");
if(types[itype] == "gpuTitanRTX")
legend->AddEntry(grBenchmarkScan[itype],"sciml19 Titan RTX GPU","pl");
}

gPad->SetLeftMargin(0.09);
gPad->SetBottomMargin(0.15);
gPad->SetTopMargin(0.05);
gPad->SetRightMargin(0.05);
gPad->SetLogx(); gPad->SetLogy();
gPad->SetGridy(); gPad->SetGridx();
hBenchmarkScan->GetXaxis()->SetTitleSize(0.05);
hBenchmarkScan->GetYaxis()->SetTitleSize(0.05);
hBenchmarkScan->GetXaxis()->SetTitleOffset(1.3);
hBenchmarkScan->GetYaxis()->SetTitleOffset(0.8);

legend->SetFillColor(0);
legend->Draw();

cc->Print("benchmark.png");

return;
}
72 changes: 72 additions & 0 deletions PWA_scripts/benchmark/submit.py
@@ -0,0 +1,72 @@
#!/usr/bin/env python

import sys
import os
import subprocess
import math
import pwd
from optparse import OptionParser

########################################################## MAIN ##########################################################
def main(argv):

# SLURM INFO (see options at https://scicomp.jlab.org/scicomp/slurmJob/slurmInfo)
PARTITION = "ifarm"
CONSTRAINT = "farm19"
TIMELIMIT = "24:00:00" # Max walltime
MyCPUs = [1,2,4,8,16,32,64,96,128,192] # List of CPU cores to use in benchmark fits

# User provided environment, fit configuration and options
MyEnv = "/work/halld2/home/jrsteven/analysisGluexI/builds/setup_gluex_scanParam.csh"
MyConfig = "/work/halld2/home/jrsteven/forBenchmark/benchmark.cfg"
MyMPIOpt = "--mca btl_openib_allow_ib 1"
MyFitOpt = "-m 100000 -r 5"
MyOutDir = "/volatile/halld/home/" + pwd.getpwuid( os.getuid() )[0] + "/benchmark/"

# LOOP OVER # OF CORES FOR BENCHMARK
for nCores in MyCPUs:
# nodes used in fit (for every 64 CPUs allow an additional node)
nNodes = nCores // 64 + 1
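# e.g. 32 cores -> 1 node, 64 cores -> 2 nodes, 128 cores -> 3 nodes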

# create output directories
MyRunningDir = MyOutDir + "cpu%03d" % nCores
MyLogOutDir = MyRunningDir + "/log"
if not os.path.exists(MyOutDir):
os.makedirs(MyOutDir)
if not os.path.exists(MyRunningDir):
os.makedirs(MyRunningDir)
if not os.path.exists(MyLogOutDir):
os.makedirs(MyLogOutDir)

# create slurm submission script
slurmOut = open("tempSlurm.txt",'w')
slurmOut.write("#!/bin/csh \n")
slurmOut.write("#SBATCH --nodes=%d \n" % nNodes)
slurmOut.write("#SBATCH --partition=%s \n" % PARTITION)
slurmOut.write("#SBATCH --constraint=%s \n" % CONSTRAINT)
slurmOut.write("#SBATCH --cpus-per-task=1 \n")
slurmOut.write("#SBATCH --ntasks-per-core=1 \n")
slurmOut.write("#SBATCH --threads-per-core=1 \n")
slurmOut.write("#SBATCH --mem=%dGB \n" % nCores) # 1 GB per core
slurmOut.write("#SBATCH --time=%s \n" % TIMELIMIT)
slurmOut.write("#SBATCH --ntasks=%d \n" % (nCores+1))
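# one extra task beyond the fit cores (the additional rank coordinates the MPI fit)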

slurmOut.write("#SBATCH --chdir=%s \n" % MyRunningDir)
slurmOut.write("#SBATCH --error=%s/fit.err \n" % (MyLogOutDir))
slurmOut.write("#SBATCH --output=%s/fit.out \n" % (MyLogOutDir))
slurmOut.write("#SBATCH --job-name=benchfit_%03d \n\n\n" % nCores)

# commands to execute during job
slurmOut.write("pwd \n")
slurmOut.write("source %s \n" % MyEnv)
slurmOut.write("mpirun %s fitMPI -c %s %s \n" % (MyMPIOpt, MyConfig, MyFitOpt))
slurmOut.close()

# submit individual job
print("Submitting %d core job on %d nodes" % (nCores, nNodes))
subprocess.call(["sbatch", "tempSlurm.txt"])
os.remove("tempSlurm.txt")


if __name__ == "__main__":
main(sys.argv[1:])
78 changes: 78 additions & 0 deletions PWA_scripts/benchmark/submitGPU.py
@@ -0,0 +1,78 @@
#!/usr/bin/env python

import sys
import os
import subprocess
import math
import pwd
from optparse import OptionParser

########################################################## MAIN ##########################################################
def main(argv):

# SLURM INFO (see options at https://scicomp.jlab.org/scicomp/slurmJob/slurmInfo)
PARTITION = "gpu"
GPUTYPE = "T4"
TIMELIMIT = "24:00:00" # Max walltime
MyGPUs = [1,2,3,4] # List of GPU cards to use in benchmark fits

# User provided environment, fit configuration and options
MyEnv = "/work/halld2/home/jrsteven/2021-amptools/builds_gpu/setup_gluex_dev.csh"
MyConfig = "/work/halld2/home/jrsteven/forBenchmark/benchmark.cfg"
MyMPIOpt = "--mca btl_openib_allow_ib 1"
MyFitOpt = "-m 100000 -r 5"
MyOutDir = "/volatile/halld/home/" + pwd.getpwuid( os.getuid() )[0] + "/benchmark/"

# LOOP OVER # OF GPUs FOR BENCHMARK
for nGPUs in MyGPUs:

# Two types of GPU nodes (sciml19 Titan RTX and sciml21 T4), with 3 nodes of each
nNodes = 1
if GPUTYPE=="T4": # 16 allowed in a single job
if nGPUs > 8: nNodes=2
if GPUTYPE=="TitanRTX": # 4 allowed in a single job
if nGPUs > 4: nNodes=2
if nGPUs > 8: nNodes=3
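# e.g. GPUTYPE='TitanRTX' with nGPUs=6 gives nNodes=2; GPUTYPE='T4' with nGPUs=12 gives nNodes=2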

# create output directories
MyRunningDir = MyOutDir + "gpu%s%03d" % (GPUTYPE,nGPUs)
MyLogOutDir = MyRunningDir + "/log"
if not os.path.exists(MyOutDir):
os.makedirs(MyOutDir)
if not os.path.exists(MyRunningDir):
os.makedirs(MyRunningDir)
if not os.path.exists(MyLogOutDir):
os.makedirs(MyLogOutDir)

# create slurm submission script
slurmOut = open("tempSlurm.txt",'w')
slurmOut.write("#!/bin/csh \n")
slurmOut.write("#SBATCH --nodes=%d \n" % nNodes)
slurmOut.write("#SBATCH --partition=%s \n" % PARTITION)
slurmOut.write("#SBATCH --gres=gpu:%s:%d \n" % (GPUTYPE,nGPUs))
slurmOut.write("#SBATCH --cpus-per-task=1 \n")
slurmOut.write("#SBATCH --ntasks-per-core=1 \n")
slurmOut.write("#SBATCH --threads-per-core=1 \n")
slurmOut.write("#SBATCH --mem=20GB \n") # multiplied by nGPUs in slurm?
slurmOut.write("#SBATCH --time=%s \n" % TIMELIMIT)
slurmOut.write("#SBATCH --ntasks=%d \n" % (nGPUs+1))

slurmOut.write("#SBATCH --chdir=%s \n" % MyRunningDir)
slurmOut.write("#SBATCH --error=%s/fit.err \n" % (MyLogOutDir))
slurmOut.write("#SBATCH --output=%s/fit.out \n" % (MyLogOutDir))
slurmOut.write("#SBATCH --job-name=benchfitgpu_%03d \n\n\n" % nGPUs)

# commands to execute during job
slurmOut.write("pwd \n")
slurmOut.write("source %s \n" % MyEnv)
slurmOut.write("mpirun %s fitMPI -c %s %s \n" % (MyMPIOpt, MyConfig, MyFitOpt))
slurmOut.close()

# submit individual job
print("Submitting %d GPU job on %d %s nodes" % (nGPUs, nNodes, GPUTYPE))
subprocess.call(["sbatch", "tempSlurm.txt"])
os.remove("tempSlurm.txt")


if __name__ == "__main__":
main(sys.argv[1:])
