forked from JeffersonLab/hd_utilities
Merge pull request JeffersonLab#181 from JeffersonLab/fit_benchmark
Scripts for testing AmpTools fit speed with many CPUs or GPUs
Showing 4 changed files with 287 additions and 0 deletions.
**README** (new file, 14 lines)
# Introduction
This directory contains scripts for submitting batch jobs with MPI (and GPU). They are meant to determine how your fit speed improves (or not) as you add additional resources; a typical workflow is sketched after this list.
* submit.py -- submits MPI jobs with varying numbers of CPU cores
* submitGPU.py -- submits MPI+GPU jobs with varying numbers of GPUs
* plotBenchmark.C -- plots fit speed for the benchmark jobs
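A typical session might look like the following (the output path is a placeholder for the MyOutDir you configure below; note that the plotting function is named plot_benchmark inside plotBenchmark.C, so load the macro first):
```
python submit.py                # submit the CPU benchmark jobs
# ...once the batch jobs finish, plot the results:
root -l
root [0] .L plotBenchmark.C
root [1] plot_benchmark("/volatile/halld/home/<user>/benchmark/")
```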
# Required user modifications
* In submit.py and submitGPU.py, replace the MyEnv, MyConfig and MyOutDir variables with your own environment setup script, AmpTools fit configuration and output directory location (a sketch of these edits follows this list)
* You can change the MyCPUs or MyGPUs list to test with more or fewer cores/GPUs in your benchmark
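For example, the block you would edit near the top of submit.py looks like this (the paths shown here are placeholders, not real locations):
```python
# Placeholder values -- substitute your own paths
MyEnv    = "/work/halld2/home/<user>/builds/my_setup.csh"    # environment setup script
MyConfig = "/work/halld2/home/<user>/fits/my_benchmark.cfg"  # AmpTools fit configuration
MyOutDir = "/volatile/halld/home/<user>/benchmark/"          # output directory location
MyCPUs   = [1, 2, 4, 8, 16, 32, 64]                          # core counts to benchmark
```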
# Notes:
* These fits require the MPI-compiled versions of AmpTools and halld_sim; see https://halldweb.jlab.org/wiki/index.php/HOWTO_use_AmpTools_on_the_JLab_farm_with_MPI for more details.
* Only run the GPU version of the fitter if your fit uses a GPU-accelerated amplitude and you've compiled that amplitude with the GPU (CUDA) libraries on one of the sciml nodes; see https://halldweb.jlab.org/wiki/index.php/HOWTO_use_AmpTools_on_the_JLab_farm_GPUs for more details.
* Some of the default benchmarks request many CPUs or GPUs, and it may take some time for such nodes to become available on the ifarm/sciml partitions, so be patient.
**plotBenchmark.C** (new file, 123 lines)
```cpp
#include <fstream>
#include <string>
#include <vector>
#include <cmath>
#include <cstdlib>

#include "TString.h"
#include "TGraphErrors.h"
#include "TH1F.h"
#include "TF1.h"
#include "TCanvas.h"
#include "TLegend.h"
#include "TStyle.h"

using namespace std;

void plot_benchmark(TString dir = "./") {

	gStyle->SetOptStat(0);

	// initialize list of nCores to plot
	vector<int> numThreadsCPU = {1,2,4,8,16,32,64,96,128};

	// for GPU fits, only add if desired
	vector<int> numThreadsGPUT4 = {1,2,3,4,6,8,10,12};
	vector<int> numThreadsGPURTX = {};

	// names of directories containing benchmark results
	vector<TString> types = {"cpu"};
	vector<TGraphErrors*> grBenchmarkScan;
	if(numThreadsGPUT4.size() > 0) types.push_back("gpuT4");
	if(numThreadsGPURTX.size() > 0) types.push_back("gpuTitanRTX");

	TH1F *hBenchmarkScan = new TH1F("hBenchmarkScan","; Number of GPUs or CPUs; Fit speed (Likelihood function call rate [Hz])", 200, 0, 200);
	double maxRate = 0;

	// loop over different architecture types to plot results
	for(size_t itype=0; itype<types.size(); itype++) {
		vector<int> numThreads = numThreadsCPU;
		if(types[itype] == "gpuT4") numThreads = numThreadsGPUT4;
		if(types[itype] == "gpuTitanRTX") numThreads = numThreadsGPURTX;
		grBenchmarkScan.push_back(new TGraphErrors(numThreads.size()));

		// loop over number of threads in test
		for(size_t ithread=0; ithread<numThreads.size(); ithread++) {

			int nThreads = numThreads[ithread];
			string spath = Form("%s/%s%03d/log/fit.out", dir.Data(), types[itype].Data(), nThreads);

			// parse the fit log: each "average time per function call: X ms."
			// line becomes one likelihood-call rate in Hz
			std::string read_line;
			ifstream file(spath);
			double parValue = 0;
			double parAvg = 0;
			vector<double> parSq;
			int nValues = 0;
			while (std::getline(file, read_line)) {
				TString line = read_line;
				if(line.Contains("time ")) {
					line.ReplaceAll("average time per function call: ","");
					line.ReplaceAll(" ms.","");
					parValue = 1./(atof(line)/1000.); // ms per call -> calls per second
					parAvg += parValue;
					parSq.push_back(parValue*parValue);
					nValues++;
				}
			}

			if(nValues > 0) {
				parAvg /= float(nValues);
				// RMS: accumulate (x - avg)^2 = x^2 + avg^2 - 2*x*avg,
				// recovering x = sqrt(x^2) since the rates are positive
				double parRms = 0;
				for(size_t ip=0; ip<parSq.size(); ip++)
					parRms += (parSq.at(ip) + parAvg*parAvg - 2*sqrt(parSq.at(ip))*parAvg);
				parRms /= float(nValues);
				parRms = sqrt(parRms);
				if(parAvg > maxRate) maxRate = parAvg;
				if(parRms < 1e-9) parRms = 0.01; // avoid vanishing error bars when all values agree
				grBenchmarkScan[itype]->SetPoint(ithread, nThreads, parAvg);
				grBenchmarkScan[itype]->SetPointError(ithread, 0, parRms);
			}
		}
	}

	TCanvas *cc = new TCanvas("cc","cc",800,400);
	auto legend = new TLegend(0.47,0.17,0.9,0.42);

	hBenchmarkScan->SetMaximum(maxRate*2.5);
	hBenchmarkScan->SetMinimum(0.1);
	hBenchmarkScan->Draw();
	vector<TF1*> fit;
	for(size_t itype=0; itype<types.size(); itype++) {
		grBenchmarkScan[itype]->SetMarkerStyle(20);
		grBenchmarkScan[itype]->SetMarkerColor(kBlack+itype);
		grBenchmarkScan[itype]->Draw("same pl");

		// line through the origin, fit to the low-count points only,
		// to show the ideal linear-scaling trend
		fit.push_back(new TF1(types[itype],"pol1",1,200));
		fit[itype]->FixParameter(0,0);
		grBenchmarkScan[itype]->Fit(fit[itype],"N","",0.5,2);
		fit[itype]->SetLineColor(kBlack+itype);
		fit[itype]->SetLineStyle(kDashed);
		fit[itype]->Draw("same");

		if(itype==0)
			legend->AddEntry(grBenchmarkScan[0],"ifarm19 CPU (2 thread/core)","pl");
		if(types[itype] == "gpuT4")
			legend->AddEntry(grBenchmarkScan[itype],"sciml21 T4 GPU","pl");
		if(types[itype] == "gpuTitanRTX")
			legend->AddEntry(grBenchmarkScan[itype],"sciml19 Titan RTX GPU","pl");
	}

	gPad->SetLeftMargin(0.09);
	gPad->SetBottomMargin(0.15);
	gPad->SetTopMargin(0.05);
	gPad->SetRightMargin(0.05);
	gPad->SetLogx(); gPad->SetLogy();
	gPad->SetGridy(); gPad->SetGridx();
	hBenchmarkScan->GetXaxis()->SetTitleSize(0.05);
	hBenchmarkScan->GetYaxis()->SetTitleSize(0.05);
	hBenchmarkScan->GetXaxis()->SetTitleOffset(1.3);
	hBenchmarkScan->GetYaxis()->SetTitleOffset(0.8);

	legend->SetFillColor(0);
	legend->Draw();

	cc->Print("benchmark.png");
}
```
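For reference, a minimal Python sketch of the same extraction the macro performs, assuming the `average time per function call: X ms.` log-line format it strips above:
```python
# Minimal sketch of the rate extraction done in plotBenchmark.C
# (the log-line format is assumed from the strings the macro strips).
def call_rates(logfile):
    """Return likelihood-call rates [Hz] parsed from a fit.out log."""
    rates = []
    with open(logfile) as f:
        for line in f:
            if "average time per function call:" in line:
                ms = float(line.split("call:")[1].replace("ms.", "").strip())
                rates.append(1000.0 / ms)  # ms per call -> calls per second
    return rates
```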
**submit.py** (new file, 72 lines)
```python
#!/usr/bin/env python

import sys
import os
import subprocess
import pwd

########################################################## MAIN ##########################################################
def main(argv):

    # SLURM INFO (see options at https://scicomp.jlab.org/scicomp/slurmJob/slurmInfo)
    PARTITION = "ifarm"
    CONSTRAINT = "farm19"
    TIMELIMIT = "24:00:00"                  # max walltime
    MyCPUs = [1,2,4,8,16,32,64,96,128,192]  # list of CPU core counts to use in benchmark fits

    # User-provided environment, fit configuration and options
    MyEnv = "/work/halld2/home/jrsteven/analysisGluexI/builds/setup_gluex_scanParam.csh"
    MyConfig = "/work/halld2/home/jrsteven/forBenchmark/benchmark.cfg"
    MyMPIOpt = "--mca btl_openib_allow_ib 1"
    MyFitOpt = "-m 100000 -r 5"
    MyOutDir = "/volatile/halld/home/" + pwd.getpwuid(os.getuid())[0] + "/benchmark/"

    # LOOP OVER # OF CORES FOR BENCHMARK
    for nCores in MyCPUs:
        # nodes used in fit (for every 64 CPUs allow an additional node);
        # integer division so the count stays an int under Python 3
        nNodes = nCores // 64 + 1

        # create output directories (makedirs creates the parents as needed)
        MyRunningDir = MyOutDir + "cpu%03d" % nCores
        MyLogOutDir = MyRunningDir + "/log"
        if not os.path.exists(MyLogOutDir):
            os.makedirs(MyLogOutDir)

        # create slurm submission script
        slurmOut = open("tempSlurm.txt", 'w')
        slurmOut.write("#!/bin/csh \n")
        slurmOut.write("#SBATCH --nodes=%d \n" % nNodes)
        slurmOut.write("#SBATCH --partition=%s \n" % PARTITION)
        slurmOut.write("#SBATCH --constraint=%s \n" % CONSTRAINT)
        slurmOut.write("#SBATCH --cpus-per-task=1 \n")
        slurmOut.write("#SBATCH --ntasks-per-core=1 \n")
        slurmOut.write("#SBATCH --threads-per-core=1 \n")
        slurmOut.write("#SBATCH --mem=%dGB \n" % nCores)  # 1 GB per core
        slurmOut.write("#SBATCH --time=%s \n" % TIMELIMIT)
        slurmOut.write("#SBATCH --ntasks=%d \n" % (nCores+1))  # one task per core plus the MPI leader rank

        slurmOut.write("#SBATCH --chdir=%s \n" % MyRunningDir)
        slurmOut.write("#SBATCH --error=%s/fit.err \n" % MyLogOutDir)
        slurmOut.write("#SBATCH --output=%s/fit.out \n" % MyLogOutDir)
        slurmOut.write("#SBATCH --job-name=benchfit_%03d \n\n\n" % nCores)

        # commands to execute during job
        slurmOut.write("pwd \n")
        slurmOut.write("source %s \n" % MyEnv)
        slurmOut.write("mpirun %s fitMPI -c %s %s \n" % (MyMPIOpt, MyConfig, MyFitOpt))
        slurmOut.close()

        # submit individual job
        print("Submitting %d core job on %d nodes" % (nCores, nNodes))
        subprocess.call(["sbatch", "tempSlurm.txt"])
        os.remove("tempSlurm.txt")


if __name__ == "__main__":
    main(sys.argv[1:])
```
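To illustrate what submit.py generates: for nCores = 32 (one node, since 32 // 64 + 1 = 1) the temporary tempSlurm.txt would read roughly as follows, with `<user>` standing in for the username returned by pwd:
```csh
#!/bin/csh
#SBATCH --nodes=1
#SBATCH --partition=ifarm
#SBATCH --constraint=farm19
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-core=1
#SBATCH --threads-per-core=1
#SBATCH --mem=32GB
#SBATCH --time=24:00:00
#SBATCH --ntasks=33
#SBATCH --chdir=/volatile/halld/home/<user>/benchmark/cpu032
#SBATCH --error=/volatile/halld/home/<user>/benchmark/cpu032/log/fit.err
#SBATCH --output=/volatile/halld/home/<user>/benchmark/cpu032/log/fit.out
#SBATCH --job-name=benchfit_032

pwd
source /work/halld2/home/jrsteven/analysisGluexI/builds/setup_gluex_scanParam.csh
mpirun --mca btl_openib_allow_ib 1 fitMPI -c /work/halld2/home/jrsteven/forBenchmark/benchmark.cfg -m 100000 -r 5
```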
**submitGPU.py** (new file, 78 lines)
```python
#!/usr/bin/env python

import sys
import os
import subprocess
import pwd

########################################################## MAIN ##########################################################
def main(argv):

    # SLURM INFO (see options at https://scicomp.jlab.org/scicomp/slurmJob/slurmInfo)
    PARTITION = "gpu"
    GPUTYPE = "T4"
    TIMELIMIT = "24:00:00"  # max walltime
    MyGPUs = [1,2,3,4]      # list of GPU counts to use in benchmark fits

    # User-provided environment, fit configuration and options
    MyEnv = "/work/halld2/home/jrsteven/2021-amptools/builds_gpu/setup_gluex_dev.csh"
    MyConfig = "/work/halld2/home/jrsteven/forBenchmark/benchmark.cfg"
    MyMPIOpt = "--mca btl_openib_allow_ib 1"
    MyFitOpt = "-m 100000 -r 5"
    MyOutDir = "/volatile/halld/home/" + pwd.getpwuid(os.getuid())[0] + "/benchmark/"

    # LOOP OVER # OF GPUs FOR BENCHMARK
    for nGPUs in MyGPUs:

        # Two types of nodes/GPUs (sciml19 and sciml21), both with 3 each;
        # spill onto additional nodes when a request exceeds one node
        nNodes = 1
        if GPUTYPE == "T4":        # 16 allowed in a single job
            if nGPUs > 8: nNodes = 2
        if GPUTYPE == "TitanRTX":  # 4 allowed in a single job
            if nGPUs > 4: nNodes = 2
            if nGPUs > 8: nNodes = 3

        # create output directories (makedirs creates the parents as needed)
        MyRunningDir = MyOutDir + "gpu%s%03d" % (GPUTYPE, nGPUs)
        MyLogOutDir = MyRunningDir + "/log"
        if not os.path.exists(MyLogOutDir):
            os.makedirs(MyLogOutDir)

        # create slurm submission script
        slurmOut = open("tempSlurm.txt", 'w')
        slurmOut.write("#!/bin/csh \n")
        slurmOut.write("#SBATCH --nodes=%d \n" % nNodes)
        slurmOut.write("#SBATCH --partition=%s \n" % PARTITION)
        slurmOut.write("#SBATCH --gres=gpu:%s:%d \n" % (GPUTYPE, nGPUs))
        slurmOut.write("#SBATCH --cpus-per-task=1 \n")
        slurmOut.write("#SBATCH --ntasks-per-core=1 \n")
        slurmOut.write("#SBATCH --threads-per-core=1 \n")
        slurmOut.write("#SBATCH --mem=20GB \n")  # multiplied by nGPUs in slurm?
        slurmOut.write("#SBATCH --time=%s \n" % TIMELIMIT)
        slurmOut.write("#SBATCH --ntasks=%d \n" % (nGPUs+1))  # one task per GPU plus the MPI leader rank

        slurmOut.write("#SBATCH --chdir=%s \n" % MyRunningDir)
        slurmOut.write("#SBATCH --error=%s/fit.err \n" % MyLogOutDir)
        slurmOut.write("#SBATCH --output=%s/fit.out \n" % MyLogOutDir)
        slurmOut.write("#SBATCH --job-name=benchfitgpu_%03d \n\n\n" % nGPUs)

        # commands to execute during job
        slurmOut.write("pwd \n")
        slurmOut.write("source %s \n" % MyEnv)
        slurmOut.write("mpirun %s fitMPI -c %s %s \n" % (MyMPIOpt, MyConfig, MyFitOpt))
        slurmOut.close()

        # submit individual job
        print("Submitting %d GPU job on %d %s nodes" % (nGPUs, nNodes, GPUTYPE))
        subprocess.call(["sbatch", "tempSlurm.txt"])
        os.remove("tempSlurm.txt")


if __name__ == "__main__":
    main(sys.argv[1:])
```
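As a worked example of the node logic above: with GPUTYPE = "T4", every entry in the default MyGPUs list fits on a single node (only counts above 8 would request two), so for nGPUs = 4 the key generated SBATCH lines would be:
```csh
#SBATCH --nodes=1
#SBATCH --partition=gpu
#SBATCH --gres=gpu:T4:4
#SBATCH --ntasks=5
```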