From f99f0b2ce88cd74137dd92450a53eb97e7622b3a Mon Sep 17 00:00:00 2001 From: Farah <49493059+salhanyf@users.noreply.github.com> Date: Mon, 17 Jun 2024 17:27:23 -0400 Subject: [PATCH 01/10] pytorch folder and files --- src/pytorch/README.md | 0 src/pytorch/firsttime.sh | 13 +++++ src/pytorch/pytorch-multicpu.sh | 18 +++++++ src/pytorch/torch-multicpu.py | 91 +++++++++++++++++++++++++++++++++ 4 files changed, 122 insertions(+) create mode 100644 src/pytorch/README.md create mode 100644 src/pytorch/firsttime.sh create mode 100644 src/pytorch/pytorch-multicpu.sh create mode 100644 src/pytorch/torch-multicpu.py diff --git a/src/pytorch/README.md b/src/pytorch/README.md new file mode 100644 index 0000000..e69de29 diff --git a/src/pytorch/firsttime.sh b/src/pytorch/firsttime.sh new file mode 100644 index 0000000..490a3f9 --- /dev/null +++ b/src/pytorch/firsttime.sh @@ -0,0 +1,13 @@ +#### Virtual ENV creation. One time only: +#!/encs/bin/tcsh +cd /speed-scratch/$USER +salloc -p ps --mem=20Gb -A Your_group_id(if you have one) +module load python/3.9.18/default +setenv TMP /speed-scratch/$USER/tmp +setenv TMPDIR /speed-scratch/$USER/tmp +python -m venv $TMPDIR/pytorchcpu +source $TMPDIR/pytorchcpu/bin/activate.csh +pip install torch torchvision +pip install urllib3==1.26.6 +deactivate +### END of VEnv creation \ No newline at end of file diff --git a/src/pytorch/pytorch-multicpu.sh b/src/pytorch/pytorch-multicpu.sh new file mode 100644 index 0000000..0d9bc3a --- /dev/null +++ b/src/pytorch/pytorch-multicpu.sh @@ -0,0 +1,18 @@ +#!/encs/bin/tcsh +#SBATCH --nodes 1 +#SBATCH --tasks-per-node=1 +#SBATCH --cpus-per-task=4 # change this parameter to 2,4,6,... to see the effect on performance + +#SBATCH --mem=24G +#SBATCH --time=0:05:00 +#SBATCH --output=%N-%j.out + +source /speed-scratch/$USER/tmp/pytorchcpu/bin/activate.csh +echo "starting training..." 
+ +time python torch-multicpu.py +######## END of Job Script + +### Slurm Script execution +sbatch -p ps pytorch-multicpu.sh -A Your_group_id(if you have one) +### End of slurm script \ No newline at end of file diff --git a/src/pytorch/torch-multicpu.py b/src/pytorch/torch-multicpu.py new file mode 100644 index 0000000..27721f3 --- /dev/null +++ b/src/pytorch/torch-multicpu.py @@ -0,0 +1,91 @@ +### Python script torch-multicpu.py + +import numpy as np +import time + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.distributed as distri + +import torchvision +import torchvision.transforms as transforms +from torchvision.datasets import CIFAR10 +from torch.utils.data import DataLoader + +import argparse +import os + +parser = argparse.ArgumentParser(description='cifar10 classification models, cpu performance test') +parser.add_argument('--lr', default=0.1, help='') +parser.add_argument('--batch_size', type=int, default=512, help='') +parser.add_argument('--num_workers', type=int, default=0, help='') + +def main(): + + args = parser.parse_args() + torch.set_num_threads(int(os.environ['SLURM_CPUS_PER_TASK'])) + class Net(nn.Module): + + def __init__(self): + super(Net, self).__init__() + + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + net = Net() + + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(net.parameters(), lr=args.lr) + + transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + + ### This next line will attempt to download the CIFAR10 dataset from the 
internet if you don't already have it stored in ./data + ### Run this line on a login node with "download=True" prior to submitting your job, or manually download the data from + ### https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and place it under ./data + + dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train) + + train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers) + + perf = [] + + total_start = time.time() + + for batch_idx, (inputs, targets) in enumerate(train_loader): + + start = time.time() + + outputs = net(inputs) + loss = criterion(outputs, targets) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + batch_time = time.time() - start + + images_per_sec = args.batch_size/batch_time + + perf.append(images_per_sec) + + total_time = time.time() - total_start + +if __name__=='__main__': + main() + +### END of Python script \ No newline at end of file From be0883c469f5154a578037b729b3ee010f0c9e88 Mon Sep 17 00:00:00 2001 From: Farah <49493059+salhanyf@users.noreply.github.com> Date: Mon, 17 Jun 2024 17:30:21 -0400 Subject: [PATCH 02/10] readme files --- src/pytorch/README.md | 4 ++++ src/pytorch/pytorch-multicpu.sh | 7 +------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/pytorch/README.md b/src/pytorch/README.md index e69de29..3c8fe17 100644 --- a/src/pytorch/README.md +++ b/src/pytorch/README.md @@ -0,0 +1,4 @@ + +# README + +This directory has example scripts to using Pytorch with Python virtual environment to run on CPUs \ No newline at end of file diff --git a/src/pytorch/pytorch-multicpu.sh b/src/pytorch/pytorch-multicpu.sh index 0d9bc3a..0b71ee5 100644 --- a/src/pytorch/pytorch-multicpu.sh +++ b/src/pytorch/pytorch-multicpu.sh @@ -10,9 +10,4 @@ source /speed-scratch/$USER/tmp/pytorchcpu/bin/activate.csh echo "starting training..." 
-time python torch-multicpu.py -######## END of Job Script - -### Slurm Script execution -sbatch -p ps pytorch-multicpu.sh -A Your_group_id(if you have one) -### End of slurm script \ No newline at end of file +time python torch-multicpu.py \ No newline at end of file From 88d822bbad6f3aad927a0fe48719434805d28577 Mon Sep 17 00:00:00 2001 From: Serguei Mokhov Date: Mon, 17 Jun 2024 19:23:29 -0400 Subject: [PATCH 03/10] [example][pytorch-multicpu] for now name the dir per example name --- src/{pytorch => pytorch-multicpu}/README.md | 0 src/{pytorch => pytorch-multicpu}/firsttime.sh | 0 src/{pytorch => pytorch-multicpu}/pytorch-multicpu.sh | 0 src/{pytorch => pytorch-multicpu}/torch-multicpu.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename src/{pytorch => pytorch-multicpu}/README.md (100%) rename src/{pytorch => pytorch-multicpu}/firsttime.sh (100%) rename src/{pytorch => pytorch-multicpu}/pytorch-multicpu.sh (100%) rename src/{pytorch => pytorch-multicpu}/torch-multicpu.py (100%) diff --git a/src/pytorch/README.md b/src/pytorch-multicpu/README.md similarity index 100% rename from src/pytorch/README.md rename to src/pytorch-multicpu/README.md diff --git a/src/pytorch/firsttime.sh b/src/pytorch-multicpu/firsttime.sh similarity index 100% rename from src/pytorch/firsttime.sh rename to src/pytorch-multicpu/firsttime.sh diff --git a/src/pytorch/pytorch-multicpu.sh b/src/pytorch-multicpu/pytorch-multicpu.sh similarity index 100% rename from src/pytorch/pytorch-multicpu.sh rename to src/pytorch-multicpu/pytorch-multicpu.sh diff --git a/src/pytorch/torch-multicpu.py b/src/pytorch-multicpu/torch-multicpu.py similarity index 100% rename from src/pytorch/torch-multicpu.py rename to src/pytorch-multicpu/torch-multicpu.py From 97ce6ae729f5a15a42b9aa660392ae7c213341f8 Mon Sep 17 00:00:00 2001 From: Serguei Mokhov Date: Mon, 17 Jun 2024 19:25:20 -0400 Subject: [PATCH 04/10] [examples][pytorch-multicpu] clean up original .txt --- src/pytorch-multicpu.txt | 134 
--------------------------------------- 1 file changed, 134 deletions(-) delete mode 100644 src/pytorch-multicpu.txt diff --git a/src/pytorch-multicpu.txt b/src/pytorch-multicpu.txt deleted file mode 100644 index 4b10bf8..0000000 --- a/src/pytorch-multicpu.txt +++ /dev/null @@ -1,134 +0,0 @@ -## Adapted to SPEED from: -## https://docs.alliancecan.ca/wiki/PyTorch - -# Virtual Environment creation, python script at the end of slurm script execution section -#### Virtual ENV creation. One time only: -cd /speed-scratch/$USER -salloc -p ps --mem=20Gb -A Your_group_id(if you have one) -module load python/3.9.18/default -setenv TMP /speed-scratch/$USER/tmp -setenv TMPDIR /speed-scratch/$USER/tmp -python -m venv $TMPDIR/pytorchcpu -source $TMPDIR/pytorchcpu/bin/activate.csh -pip install torch torchvision -pip install urllib3==1.26.6 -deactivate -### END of VEnv creation - - -# job script: pytorch-multicpu.sh - -#!/encs/bin/tcsh -#SBATCH --nodes 1 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=4 # change this parameter to 2,4,6,... to see the effect on performance - -#SBATCH --mem=24G -#SBATCH --time=0:05:00 -#SBATCH --output=%N-%j.out - -source /speed-scratch/$USER/tmp/pytorchcpu/bin/activate.csh -echo "starting training..." 
- -time python torch-multicpu.py -######## END of Job Script - -### Slurm Script execution -sbatch -p ps pytorch-multicpu.sh -A Your_group_id(if you have one) -### End of slurm script - - - - -### Python script torch-multicpu.py - -import numpy as np -import time - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torch.distributed as distri - -import torchvision -import torchvision.transforms as transforms -from torchvision.datasets import CIFAR10 -from torch.utils.data import DataLoader - -import argparse -import os - -parser = argparse.ArgumentParser(description='cifar10 classification models, cpu performance test') -parser.add_argument('--lr', default=0.1, help='') -parser.add_argument('--batch_size', type=int, default=512, help='') -parser.add_argument('--num_workers', type=int, default=0, help='') - -def main(): - - args = parser.parse_args() - torch.set_num_threads(int(os.environ['SLURM_CPUS_PER_TASK'])) - class Net(nn.Module): - - def __init__(self): - super(Net, self).__init__() - - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - net = Net() - - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(net.parameters(), lr=args.lr) - - transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) - - ### This next line will attempt to download the CIFAR10 dataset from the internet if you don't already have it stored in ./data - ### Run this line on a login node with "download=True" prior to submitting your job, or manually download the data from - ### 
https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and place it under ./data - - dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train) - - train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers) - - perf = [] - - total_start = time.time() - - for batch_idx, (inputs, targets) in enumerate(train_loader): - - start = time.time() - - outputs = net(inputs) - loss = criterion(outputs, targets) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - batch_time = time.time() - start - - images_per_sec = args.batch_size/batch_time - - perf.append(images_per_sec) - - total_time = time.time() - total_start - -if __name__=='__main__': - main() - -### END of Python script - From ebcdd506fbc115b4b2895e5c44c7f7be876a9972 Mon Sep 17 00:00:00 2001 From: Serguei Mokhov Date: Mon, 17 Jun 2024 19:26:24 -0400 Subject: [PATCH 05/10] [examples][pytorch-multicpu] chmod u+x --- src/pytorch-multicpu/firsttime.sh | 0 src/pytorch-multicpu/pytorch-multicpu.sh | 0 src/pytorch-multicpu/torch-multicpu.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 src/pytorch-multicpu/firsttime.sh mode change 100644 => 100755 src/pytorch-multicpu/pytorch-multicpu.sh mode change 100644 => 100755 src/pytorch-multicpu/torch-multicpu.py diff --git a/src/pytorch-multicpu/firsttime.sh b/src/pytorch-multicpu/firsttime.sh old mode 100644 new mode 100755 diff --git a/src/pytorch-multicpu/pytorch-multicpu.sh b/src/pytorch-multicpu/pytorch-multicpu.sh old mode 100644 new mode 100755 diff --git a/src/pytorch-multicpu/torch-multicpu.py b/src/pytorch-multicpu/torch-multicpu.py old mode 100644 new mode 100755 From 0d5d9f97f61e6065a36a04e02e6037fc2eee4297 Mon Sep 17 00:00:00 2001 From: Serguei Mokhov Date: Mon, 17 Jun 2024 19:32:08 -0400 Subject: [PATCH 06/10] [examples][pytorch-multicpu] add commands to README --- src/pytorch-multicpu/README.md | 21 +++++++++++++++++---- 1 file changed, 17 
insertions(+), 4 deletions(-) diff --git a/src/pytorch-multicpu/README.md b/src/pytorch-multicpu/README.md index 3c8fe17..2db83be 100644 --- a/src/pytorch-multicpu/README.md +++ b/src/pytorch-multicpu/README.md @@ -1,4 +1,17 @@ - -# README - -This directory has example scripts to using Pytorch with Python virtual environment to run on CPUs \ No newline at end of file + +# README + +This directory has example scripts for using PyTorch with a Python virtual environment to run on multiple CPUs (most universal). +Adapted to SPEED from https://docs.alliancecan.ca/wiki/PyTorch + +``` +(speed-submit) +cd /speed-scratch/$USER +salloc -p ps --mem=20G --mail-type=ALL +(compute node) +./firsttime.sh +exit +(speed-submit) +sbatch -p ps pytorch-multicpu.sh +tail -f slurm-JOBID.out +``` From 24f7cf0790e3ad5d4c0ebf29ccb1d7d79d3b9988 Mon Sep 17 00:00:00 2001 From: Serguei Mokhov Date: Mon, 17 Jun 2024 20:20:46 -0400 Subject: [PATCH 07/10] [examples][pytorch-multicpu] adjust scripts per tests --- src/pytorch-multicpu/firsttime.sh | 9 ++++++--- src/pytorch-multicpu/pytorch-multicpu.sh | 7 ++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/pytorch-multicpu/firsttime.sh b/src/pytorch-multicpu/firsttime.sh index 490a3f9..6cae1b8 100755 --- a/src/pytorch-multicpu/firsttime.sh +++ b/src/pytorch-multicpu/firsttime.sh @@ -1,7 +1,9 @@ -#### Virtual ENV creation. One time only: #!/encs/bin/tcsh + +#### Virtual ENV creation. One time only. +#### Run after salloc to a compute node! 
+ cd /speed-scratch/$USER -salloc -p ps --mem=20Gb -A Your_group_id(if you have one) module load python/3.9.18/default setenv TMP /speed-scratch/$USER/tmp setenv TMPDIR /speed-scratch/$USER/tmp @@ -10,4 +12,5 @@ source $TMPDIR/pytorchcpu/bin/activate.csh pip install torch torchvision pip install urllib3==1.26.6 deactivate -### END of VEnv creation \ No newline at end of file + +#### END of VEnv creation diff --git a/src/pytorch-multicpu/pytorch-multicpu.sh b/src/pytorch-multicpu/pytorch-multicpu.sh index 0b71ee5..6246bd0 100755 --- a/src/pytorch-multicpu/pytorch-multicpu.sh +++ b/src/pytorch-multicpu/pytorch-multicpu.sh @@ -1,4 +1,5 @@ #!/encs/bin/tcsh + #SBATCH --nodes 1 #SBATCH --tasks-per-node=1 #SBATCH --cpus-per-task=4 # change this parameter to 2,4,6,... to see the effect on performance @@ -7,7 +8,11 @@ #SBATCH --time=0:05:00 #SBATCH --output=%N-%j.out +#SBATCH --mail-type=ALL + source /speed-scratch/$USER/tmp/pytorchcpu/bin/activate.csh echo "starting training..." -time python torch-multicpu.py \ No newline at end of file +time srun python torch-multicpu.py + +# EOF From 6b0934762cb8ea110cd0662e0104654e73a107d5 Mon Sep 17 00:00:00 2001 From: Serguei Mokhov Date: Mon, 17 Jun 2024 20:21:25 -0400 Subject: [PATCH 08/10] [examples][torch-multicpu] adjust Python script for training. This includes a default batch size of 4 for now instead of 512 (and 4 DataLoader workers instead of 0), some printouts, and sets dataset download to True. Also dos2unix it. 
--- src/pytorch-multicpu/torch-multicpu.py | 189 +++++++++++++------------ 1 file changed, 98 insertions(+), 91 deletions(-) diff --git a/src/pytorch-multicpu/torch-multicpu.py b/src/pytorch-multicpu/torch-multicpu.py index 27721f3..87c52d7 100755 --- a/src/pytorch-multicpu/torch-multicpu.py +++ b/src/pytorch-multicpu/torch-multicpu.py @@ -1,91 +1,98 @@ -### Python script torch-multicpu.py - -import numpy as np -import time - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torch.distributed as distri - -import torchvision -import torchvision.transforms as transforms -from torchvision.datasets import CIFAR10 -from torch.utils.data import DataLoader - -import argparse -import os - -parser = argparse.ArgumentParser(description='cifar10 classification models, cpu performance test') -parser.add_argument('--lr', default=0.1, help='') -parser.add_argument('--batch_size', type=int, default=512, help='') -parser.add_argument('--num_workers', type=int, default=0, help='') - -def main(): - - args = parser.parse_args() - torch.set_num_threads(int(os.environ['SLURM_CPUS_PER_TASK'])) - class Net(nn.Module): - - def __init__(self): - super(Net, self).__init__() - - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - net = Net() - - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(net.parameters(), lr=args.lr) - - transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) - - ### This next line will attempt to download the CIFAR10 dataset from the internet if you don't already have it stored in 
./data - ### Run this line on a login node with "download=True" prior to submitting your job, or manually download the data from - ### https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and place it under ./data - - dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train) - - train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers) - - perf = [] - - total_start = time.time() - - for batch_idx, (inputs, targets) in enumerate(train_loader): - - start = time.time() - - outputs = net(inputs) - loss = criterion(outputs, targets) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - batch_time = time.time() - start - - images_per_sec = args.batch_size/batch_time - - perf.append(images_per_sec) - - total_time = time.time() - total_start - -if __name__=='__main__': - main() - -### END of Python script \ No newline at end of file +### Python script torch-multicpu.py + +import numpy as np +import time + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.distributed as distri + +import torchvision +import torchvision.transforms as transforms +from torchvision.datasets import CIFAR10 +from torch.utils.data import DataLoader + +import argparse +import os + +parser = argparse.ArgumentParser(description='cifar10 classification models, cpu performance test') +parser.add_argument('--lr', default=0.1, help='') +#parser.add_argument('--batch_size', type=int, default=512, help='') +parser.add_argument('--batch_size', type=int, default=4, help='') +#parser.add_argument('--num_workers', type=int, default=0, help='') +parser.add_argument('--num_workers', type=int, default=4, help='') + +def main(): + + args = parser.parse_args() + torch.set_num_threads(int(os.environ['SLURM_CPUS_PER_TASK'])) + class Net(nn.Module): + + def __init__(self): + super(Net, self).__init__() + + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = 
nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + net = Net() + + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(net.parameters(), lr=args.lr) + + transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + + ### This next line will attempt to download the CIFAR10 dataset from the internet if you don't already have it stored in ./data + ### Run this line on a login node with "download=True" prior to submitting your job, or manually download the data from + ### https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and place it under ./data + + #dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train) + dataset_train = CIFAR10(root='./data', train=True, download=True, transform=transform_train) + + train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers) + + perf = [] + + total_start = time.time() + + for batch_idx, (inputs, targets) in enumerate(train_loader): + + start = time.time() + + outputs = net(inputs) + loss = criterion(outputs, targets) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + batch_time = time.time() - start + + images_per_sec = args.batch_size/batch_time + + perf.append(images_per_sec) + + total_time = time.time() - total_start + + print("Total time: ", total_time) + print("Perf : ", perf) + print("CPUs : ", os.environ['SLURM_CPUS_PER_TASK']) + +if __name__=='__main__': + main() + +### END of Python script From e5b0cfba969c1d43dacb0395a0a31c01dfd47d57 Mon Sep 17 00:00:00 2001 From: Serguei Mokhov Date: Mon, 17 Jun 2024 20:22:50 -0400 Subject: [PATCH 09/10] 
[examples][pytorch-multicpu][README] adjust a bit with timing --- src/pytorch-multicpu/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pytorch-multicpu/README.md b/src/pytorch-multicpu/README.md index 2db83be..fa5d80b 100644 --- a/src/pytorch-multicpu/README.md +++ b/src/pytorch-multicpu/README.md @@ -9,7 +9,8 @@ Adapted to SPEED from https://docs.alliancecan.ca/wiki/PyTorch cd /speed-scratch/$USER salloc -p ps --mem=20G --mail-type=ALL (compute node) -./firsttime.sh +time ./firsttime.sh +(takes about 5 mins) exit (speed-submit) sbatch -p ps pytorch-multicpu.sh From c81b2d45c29809e8fde5918a1ba3fec01f009fea Mon Sep 17 00:00:00 2001 From: Serguei Mokhov Date: Mon, 17 Jun 2024 20:26:30 -0400 Subject: [PATCH 10/10] [examples][pytorch-multicpu][README] update out filename --- src/pytorch-multicpu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch-multicpu/README.md b/src/pytorch-multicpu/README.md index fa5d80b..83d3b94 100644 --- a/src/pytorch-multicpu/README.md +++ b/src/pytorch-multicpu/README.md @@ -14,5 +14,5 @@ time ./firsttime.sh exit (speed-submit) sbatch -p ps pytorch-multicpu.sh -tail -f slurm-JOBID.out +tail -f EXECUTIONHOST-JOBID.out ```