Merge pull request JeffersonLab#181 from JeffersonLab/fit_benchmark
Scripts for testing AmpTools fit speed with many CPUs or GPUs
jrstevenjlab authored Jan 24, 2022
2 parents 36dcdc6 + b4b26d3 commit f4a652f
Showing 4 changed files with 287 additions and 0 deletions.
14 changes: 14 additions & 0 deletions PWA_scripts/benchmark/README.md
@@ -0,0 +1,14 @@
# Introduction
This directory contains scripts for submitting batch jobs with MPI (and optionally GPUs). They are meant to determine how your fit speed improves (or not) as you add additional resources.
* submit.py -- submits MPI jobs with various numbers of CPU cores
* submitGPU.py -- submits MPI+GPU jobs with various numbers of GPUs
* plot_benchmark.C -- plots fit speed for the benchmark jobs

# Required user modifications
* In submit.py and submitGPU.py, replace the MyEnv, MyConfig and MyOutDir variables with your own environment setup script, AmpTools fit configuration and output directory location (see the sketch after this list)
* You can change the MyCPUs or MyGPUs list to test your benchmark with more or fewer CPU cores or GPUs
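
A minimal sketch of the user settings near the top of submit.py (the paths below are placeholders, not real locations):

```python
# Placeholder values -- point these at your own setup script, fit configuration and output area
MyEnv    = "/work/halld2/home/<user>/builds/setup_gluex.csh"   # environment setup script
MyConfig = "/work/halld2/home/<user>/benchmark/benchmark.cfg"  # AmpTools fit configuration
MyOutDir = "/volatile/halld/home/<user>/benchmark/"            # output directory for job logs and results
MyCPUs   = [1, 2, 4, 8, 16, 32, 64]                            # CPU core counts to benchmark
```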

# Notes:
* These fits require the MPI-compiled versions of AmpTools and halld_sim; see https://halldweb.jlab.org/wiki/index.php/HOWTO_use_AmpTools_on_the_JLab_farm_with_MPI for more details
* Only run the GPU version of the fitter if your fit uses a GPU-accelerated amplitude and you have compiled that amplitude against the GPU (CUDA) libraries on one of the sciml nodes; see https://halldweb.jlab.org/wiki/index.php/HOWTO_use_AmpTools_on_the_JLab_farm_GPUs for more details
* Some of the default benchmarks request many CPUs or GPUs, and it may take some time for those resources to become available on the ifarm/sciml nodes, so be patient.
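
The fit speed that plot_benchmark.C reports is taken from the timing summaries the fitter writes to fit.out; a minimal Python sketch of the same conversion, assuming a log line of the form parsed by the macro:

```python
def call_rate_hz(log_line):
    """Convert 'average time per function call: <t> ms.' into a call rate in Hz."""
    ms = float(log_line.replace("average time per function call: ", "").replace(" ms.", ""))
    return 1.0 / (ms / 1000.0)

# e.g. an average call time of 12.5 ms corresponds to an 80 Hz likelihood call rate
print(call_rate_hz("average time per function call: 12.5 ms."))
```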
123 changes: 123 additions & 0 deletions PWA_scripts/benchmark/plot_benchmark.C
@@ -0,0 +1,123 @@
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cstdlib>
#include <cmath>

void plot_benchmark(TString dir = "./") {

gStyle->SetOptStat(0);

// initialize list of nCores to plot
vector<int> numThreadsCPU = {1,2,4,8,16,32,64,96,128};
int numTestCPU = numThreadsCPU.size();

// for GPU fits, only add if desired
vector<int> numThreadsGPUT4 = {1,2,3,4,6,8,10,12};
vector<int> numThreadsGPURTX = {};

// names of directories containing benchmark results
vector<TString> types = {"cpu"};
vector<TGraphErrors*> grBenchmarkScan;
if(numThreadsGPUT4.size() > 0) types.push_back("gpuT4");
if(numThreadsGPURTX.size() > 0) types.push_back("gpuTitanRTX");

TH1F *hBenchmarkScan = new TH1F("hBenchmarkScan","; Number of GPUs or CPUs; Fit speed (Likelihood function call rate [Hz])", 200, 0, 200);
double maxRate = 0;

// loop over different architecture types to plot results
for(int itype=0; itype<types.size(); itype++) {
vector<int> numThreads = numThreadsCPU;
if(types[itype] == "gpuT4") numThreads = numThreadsGPUT4;
if(types[itype] == "gpuTitanRTX") numThreads = numThreadsGPURTX;
grBenchmarkScan.push_back(new TGraphErrors(numThreads.size()));

// loop over number of threads in test
for(int ithread=0; ithread<numThreads.size(); ithread++) {

int nThreads = numThreads[ithread];
string spath = Form("%s/%s%03d/log/fit.out", dir.Data(), types[itype].Data(), nThreads);
//cout << spath << endl;

std::string read_line;
ifstream file(spath);
double parValue = 0;
double parAvg = 0;
vector<double> parSq;
int nValues = 0;
while (std::getline(file, read_line)) {

TString line = read_line;
if(line.Contains("average time per function call:")) {
line.ReplaceAll("average time per function call: ","");
line.ReplaceAll(" ms.","");
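// convert the average time per call (ms) into a likelihood call rate in Hz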
parValue = 1./(atof(line)/1000);
parAvg += parValue;
parSq.push_back(parValue*parValue);
nValues++;
}
else continue;

}

if(nValues > 0) {
parAvg /= float(nValues);
double parRms = 0;
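// accumulate (rate - mean)^2 using the expansion x^2 + mean^2 - 2*x*mean, where x = sqrt(parSq)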
for(uint ip=0; ip<parSq.size(); ip++)
parRms += (parSq.at(ip) + parAvg*parAvg - 2*sqrt(parSq.at(ip))*parAvg);
parRms /= float(nValues);
parRms = sqrt(parRms);
if(parAvg > maxRate) maxRate = parAvg;
//cout<<parAvg<<" "<<parRms<<endl;
if(parRms < 1e-9) parRms = 0.01;
grBenchmarkScan[itype]->SetPoint(ithread, nThreads, parAvg);
grBenchmarkScan[itype]->SetPointError(ithread, 0, parRms);
}
}
}

TCanvas *cc = new TCanvas("cc","cc",800,400);
auto legend = new TLegend(0.47,0.17,0.9,0.42);

hBenchmarkScan->SetMaximum(maxRate*2.5);
hBenchmarkScan->SetMinimum(0.1);
hBenchmarkScan->Draw();
vector<TF1*> fit;
for(int itype=0; itype<types.size(); itype++) {
grBenchmarkScan[itype]->SetMarkerStyle(20);
grBenchmarkScan[itype]->SetMarkerColor(kBlack+itype);
grBenchmarkScan[itype]->Draw("same pl");

// ideal linear-scaling reference: line through the origin with slope fit to the lowest-thread point(s)
fit.push_back(new TF1(types[itype],"pol1",1,200));
fit[itype]->FixParameter(0,0);
grBenchmarkScan[itype]->Fit(fit[itype],"N","",0.5,2);
fit[itype]->SetLineColor(kBlack+itype); fit[itype]->SetLineStyle(kDashed);
fit[itype]->Draw("same");

if(itype==0)
legend->AddEntry(grBenchmarkScan[0],"ifarm19 CPU (2 thread/core)","pl");
if(types[itype] == "gpuT4")
legend->AddEntry(grBenchmarkScan[itype],"sciml21 T4 GPU","pl");
if(types[itype] == "gpuTitanRTX")
legend->AddEntry(grBenchmarkScan[itype],"sciml19 Titan RTX GPU","pl");
}

gPad->SetLeftMargin(0.09);
gPad->SetBottomMargin(0.15);
gPad->SetTopMargin(0.05);
gPad->SetRightMargin(0.05);
gPad->SetLogx(); gPad->SetLogy();
gPad->SetGridy(); gPad->SetGridx();
hBenchmarkScan->GetXaxis()->SetTitleSize(0.05);
hBenchmarkScan->GetYaxis()->SetTitleSize(0.05);
hBenchmarkScan->GetXaxis()->SetTitleOffset(1.3);
hBenchmarkScan->GetYaxis()->SetTitleOffset(0.8);

legend->SetFillColor(0);
legend->Draw();

cc->Print("benchmark.png");

return;
}
72 changes: 72 additions & 0 deletions PWA_scripts/benchmark/submit.py
@@ -0,0 +1,72 @@
#!/usr/bin/env python

import sys
import os
import subprocess
import math
import pwd
from optparse import OptionParser

########################################################## MAIN ##########################################################
def main(argv):

# SLURM INFO (see options at https://scicomp.jlab.org/scicomp/slurmJob/slurmInfo)
PARTITION = "ifarm"
CONSTRAINT = "farm19"
TIMELIMIT = "24:00:00" # Max walltime
MyCPUs = [1,2,4,8,16,32,64,96,128,192] # List of CPU cores to use in benchmark fits

# User provided environment, fit configuration and options
MyEnv = "/work/halld2/home/jrsteven/analysisGluexI/builds/setup_gluex_scanParam.csh"
MyConfig = "/work/halld2/home/jrsteven/forBenchmark/benchmark.cfg"
MyMPIOpt = "--mca btl_openib_allow_ib 1"
MyFitOpt = "-m 100000 -r 5"
MyOutDir = "/volatile/halld/home/" + pwd.getpwuid( os.getuid() )[0] + "/benchmark/"

# LOOP OVER # OF CORES FOR BENCHMARK
for nCores in MyCPUs:
# nodes used in fit (for every 64 CPUs allow an additional node)
nNodes = nCores // 64 + 1
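# e.g. 32 cores -> 1 node, 64 cores -> 2 nodes, 128 cores -> 3 nodes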

# create output directories
MyRunningDir = MyOutDir + "cpu%03d" % nCores
MyLogOutDir = MyRunningDir + "/log"
if not os.path.exists(MyOutDir):
os.makedirs(MyOutDir)
if not os.path.exists(MyRunningDir):
os.makedirs(MyRunningDir)
if not os.path.exists(MyLogOutDir):
os.makedirs(MyLogOutDir)

# create slurm submission script
slurmOut = open("tempSlurm.txt",'w')
slurmOut.write("#!/bin/csh \n")
slurmOut.write("#SBATCH --nodes=%d \n" % nNodes)
slurmOut.write("#SBATCH --partition=%s \n" % PARTITION)
slurmOut.write("#SBATCH --constraint=%s \n" % CONSTRAINT)
slurmOut.write("#SBATCH --cpus-per-task=1 \n")
slurmOut.write("#SBATCH --ntasks-per-core=1 \n")
slurmOut.write("#SBATCH --threads-per-core=1 \n")
slurmOut.write("#SBATCH --mem=%dGB \n" % nCores) # 1 GB per core
slurmOut.write("#SBATCH --time=%s \n" % TIMELIMIT)
slurmOut.write("#SBATCH --ntasks=%d \n" % (nCores+1))
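# one extra task beyond the fit cores (the additional rank coordinates the MPI fit)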

slurmOut.write("#SBATCH --chdir=%s \n" % MyRunningDir)
slurmOut.write("#SBATCH --error=%s/fit.err \n" % (MyLogOutDir))
slurmOut.write("#SBATCH --output=%s/fit.out \n" % (MyLogOutDir))
slurmOut.write("#SBATCH --job-name=benchfit_%03d \n\n\n" % nCores)

# commands to execute during job
slurmOut.write("pwd \n")
slurmOut.write("source %s \n" % MyEnv)
slurmOut.write("mpirun %s fitMPI -c %s %s \n" % (MyMPIOpt, MyConfig, MyFitOpt))
slurmOut.close()

# submit individual job
print("Submitting %d core job on %d nodes" % (nCores, nNodes))
subprocess.call(["sbatch", "tempSlurm.txt"])
os.remove("tempSlurm.txt")


if __name__ == "__main__":
main(sys.argv[1:])
78 changes: 78 additions & 0 deletions PWA_scripts/benchmark/submitGPU.py
@@ -0,0 +1,78 @@
#!/usr/bin/env python

import sys
import os
import subprocess
import math
import pwd
from optparse import OptionParser

########################################################## MAIN ##########################################################
def main(argv):

# SLURM INFO (see options at https://scicomp.jlab.org/scicomp/slurmJob/slurmInfo)
PARTITION = "gpu"
GPUTYPE = "T4"
TIMELIMIT = "24:00:00" # Max walltime
MyGPUs = [1,2,3,4] # List of GPU cards to use in benchmark fits

# User provided environment, fit configuration and options
MyEnv = "/work/halld2/home/jrsteven/2021-amptools/builds_gpu/setup_gluex_dev.csh"
MyConfig = "/work/halld2/home/jrsteven/forBenchmark/benchmark.cfg"
MyMPIOpt = "--mca btl_openib_allow_ib 1"
MyFitOpt = "-m 100000 -r 5"
MyOutDir = "/volatile/halld/home/" + pwd.getpwuid( os.getuid() )[0] + "/benchmark/"

# LOOP OVER # OF GPUs FOR BENCHMARK
for nGPUs in MyGPUs:

# Two types of GPU nodes (sciml19 Titan RTX and sciml21 T4), with 3 nodes of each
nNodes = 1
if GPUTYPE=="T4": # 16 allowed in a single job
if nGPUs > 8: nNodes=2
if GPUTYPE=="TitanRTX": # 4 allowed in a single job
if nGPUs > 4: nNodes=2
if nGPUs > 8: nNodes=3
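# e.g. GPUTYPE='TitanRTX' with nGPUs=6 gives nNodes=2; GPUTYPE='T4' with nGPUs=12 gives nNodes=2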

# create output directories
MyRunningDir = MyOutDir + "gpu%s%03d" % (GPUTYPE,nGPUs)
MyLogOutDir = MyRunningDir + "/log"
if not os.path.exists(MyOutDir):
os.makedirs(MyOutDir)
if not os.path.exists(MyRunningDir):
os.makedirs(MyRunningDir)
if not os.path.exists(MyLogOutDir):
os.makedirs(MyLogOutDir)

# create slurm submission script
slurmOut = open("tempSlurm.txt",'w')
slurmOut.write("#!/bin/csh \n")
slurmOut.write("#SBATCH --nodes=%d \n" % nNodes)
slurmOut.write("#SBATCH --partition=%s \n" % PARTITION)
slurmOut.write("#SBATCH --gres=gpu:%s:%d \n" % (GPUTYPE,nGPUs))
slurmOut.write("#SBATCH --cpus-per-task=1 \n")
slurmOut.write("#SBATCH --ntasks-per-core=1 \n")
slurmOut.write("#SBATCH --threads-per-core=1 \n")
slurmOut.write("#SBATCH --mem=20GB \n") # multiplied by nGPUs in slurm?
slurmOut.write("#SBATCH --time=%s \n" % TIMELIMIT)
slurmOut.write("#SBATCH --ntasks=%d \n" % (nGPUs+1))

slurmOut.write("#SBATCH --chdir=%s \n" % MyRunningDir)
slurmOut.write("#SBATCH --error=%s/fit.err \n" % (MyLogOutDir))
slurmOut.write("#SBATCH --output=%s/fit.out \n" % (MyLogOutDir))
slurmOut.write("#SBATCH --job-name=benchfitgpu_%03d \n\n\n" % nGPUs)

# commands to execute during job
slurmOut.write("pwd \n")
slurmOut.write("source %s \n" % MyEnv)
slurmOut.write("mpirun %s fitMPI -c %s %s \n" % (MyMPIOpt, MyConfig, MyFitOpt))
slurmOut.close()

# submit individual job
print("Submitting %d GPU job on %d %s nodes" % (nGPUs, nNodes, GPUTYPE))
subprocess.call(["sbatch", "tempSlurm.txt"])
os.remove("tempSlurm.txt")


if __name__ == "__main__":
main(sys.argv[1:])
