From f99f0b2ce88cd74137dd92450a53eb97e7622b3a Mon Sep 17 00:00:00 2001
From: Farah <49493059+salhanyf@users.noreply.github.com>
Date: Mon, 17 Jun 2024 17:27:23 -0400
Subject: [PATCH 01/10] pytorch folder and files
---
src/pytorch/README.md | 0
src/pytorch/firsttime.sh | 13 +++++
src/pytorch/pytorch-multicpu.sh | 18 +++++++
src/pytorch/torch-multicpu.py | 91 +++++++++++++++++++++++++++++++++
4 files changed, 122 insertions(+)
create mode 100644 src/pytorch/README.md
create mode 100644 src/pytorch/firsttime.sh
create mode 100644 src/pytorch/pytorch-multicpu.sh
create mode 100644 src/pytorch/torch-multicpu.py
diff --git a/src/pytorch/README.md b/src/pytorch/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/src/pytorch/firsttime.sh b/src/pytorch/firsttime.sh
new file mode 100644
index 0000000..490a3f9
--- /dev/null
+++ b/src/pytorch/firsttime.sh
@@ -0,0 +1,13 @@
+#### Virtual ENV creation. One time only:
+#!/encs/bin/tcsh
+cd /speed-scratch/$USER
+salloc -p ps --mem=20Gb -A Your_group_id(if you have one)
+module load python/3.9.18/default
+setenv TMP /speed-scratch/$USER/tmp
+setenv TMPDIR /speed-scratch/$USER/tmp
+python -m venv $TMPDIR/pytorchcpu
+source $TMPDIR/pytorchcpu/bin/activate.csh
+pip install torch torchvision
+pip install urllib3==1.26.6
+deactivate
+### END of VEnv creation
\ No newline at end of file
diff --git a/src/pytorch/pytorch-multicpu.sh b/src/pytorch/pytorch-multicpu.sh
new file mode 100644
index 0000000..0d9bc3a
--- /dev/null
+++ b/src/pytorch/pytorch-multicpu.sh
@@ -0,0 +1,18 @@
+#!/encs/bin/tcsh
+#SBATCH --nodes 1
+#SBATCH --tasks-per-node=1
+#SBATCH --cpus-per-task=4 # change this parameter to 2,4,6,... to see the effect on performance
+
+#SBATCH --mem=24G
+#SBATCH --time=0:05:00
+#SBATCH --output=%N-%j.out
+
+source /speed-scratch/$USER/tmp/pytorchcpu/bin/activate.csh
+echo "starting training..."
+
+time python torch-multicpu.py
+######## END of Job Script
+
+### Slurm Script execution
+sbatch -p ps pytorch-multicpu.sh -A Your_group_id(if you have one)
+### End of slurm script
\ No newline at end of file
diff --git a/src/pytorch/torch-multicpu.py b/src/pytorch/torch-multicpu.py
new file mode 100644
index 0000000..27721f3
--- /dev/null
+++ b/src/pytorch/torch-multicpu.py
@@ -0,0 +1,91 @@
+### Python script torch-multicpu.py
+
+import numpy as np
+import time
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.distributed as distri
+
+import torchvision
+import torchvision.transforms as transforms
+from torchvision.datasets import CIFAR10
+from torch.utils.data import DataLoader
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description='cifar10 classification models, cpu performance test')
+parser.add_argument('--lr', default=0.1, help='')
+parser.add_argument('--batch_size', type=int, default=512, help='')
+parser.add_argument('--num_workers', type=int, default=0, help='')
+
+def main():
+
+ args = parser.parse_args()
+ torch.set_num_threads(int(os.environ['SLURM_CPUS_PER_TASK']))
+ class Net(nn.Module):
+
+ def __init__(self):
+ super(Net, self).__init__()
+
+ self.conv1 = nn.Conv2d(3, 6, 5)
+ self.pool = nn.MaxPool2d(2, 2)
+ self.conv2 = nn.Conv2d(6, 16, 5)
+ self.fc1 = nn.Linear(16 * 5 * 5, 120)
+ self.fc2 = nn.Linear(120, 84)
+ self.fc3 = nn.Linear(84, 10)
+
+ def forward(self, x):
+ x = self.pool(F.relu(self.conv1(x)))
+ x = self.pool(F.relu(self.conv2(x)))
+ x = x.view(-1, 16 * 5 * 5)
+ x = F.relu(self.fc1(x))
+ x = F.relu(self.fc2(x))
+ x = self.fc3(x)
+ return x
+
+ net = Net()
+
+ criterion = nn.CrossEntropyLoss()
+ optimizer = optim.SGD(net.parameters(), lr=args.lr)
+
+ transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+ ### This next line will attempt to download the CIFAR10 dataset from the internet if you don't already have it stored in ./data
+ ### Run this line on a login node with "download=True" prior to submitting your job, or manually download the data from
+ ### https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and place it under ./data
+
+ dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)
+
+ train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)
+
+ perf = []
+
+ total_start = time.time()
+
+ for batch_idx, (inputs, targets) in enumerate(train_loader):
+
+ start = time.time()
+
+ outputs = net(inputs)
+ loss = criterion(outputs, targets)
+
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
+
+ batch_time = time.time() - start
+
+ images_per_sec = args.batch_size/batch_time
+
+ perf.append(images_per_sec)
+
+ total_time = time.time() - total_start
+
+if __name__=='__main__':
+ main()
+
+### END of Python script
\ No newline at end of file
From be0883c469f5154a578037b729b3ee010f0c9e88 Mon Sep 17 00:00:00 2001
From: Farah <49493059+salhanyf@users.noreply.github.com>
Date: Mon, 17 Jun 2024 17:30:21 -0400
Subject: [PATCH 02/10] readme files
---
src/pytorch/README.md | 4 ++++
src/pytorch/pytorch-multicpu.sh | 7 +------
2 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/src/pytorch/README.md b/src/pytorch/README.md
index e69de29..3c8fe17 100644
--- a/src/pytorch/README.md
+++ b/src/pytorch/README.md
@@ -0,0 +1,4 @@
+
+# README
+
+This directory has example scripts to using Pytorch with Python virtual environment to run on CPUs
\ No newline at end of file
diff --git a/src/pytorch/pytorch-multicpu.sh b/src/pytorch/pytorch-multicpu.sh
index 0d9bc3a..0b71ee5 100644
--- a/src/pytorch/pytorch-multicpu.sh
+++ b/src/pytorch/pytorch-multicpu.sh
@@ -10,9 +10,4 @@
source /speed-scratch/$USER/tmp/pytorchcpu/bin/activate.csh
echo "starting training..."
-time python torch-multicpu.py
-######## END of Job Script
-
-### Slurm Script execution
-sbatch -p ps pytorch-multicpu.sh -A Your_group_id(if you have one)
-### End of slurm script
\ No newline at end of file
+time python torch-multicpu.py
\ No newline at end of file
From 88d822bbad6f3aad927a0fe48719434805d28577 Mon Sep 17 00:00:00 2001
From: Serguei Mokhov
Date: Mon, 17 Jun 2024 19:23:29 -0400
Subject: [PATCH 03/10] [example][pytorch-multicpu] for now name the dir per
example name
---
src/{pytorch => pytorch-multicpu}/README.md | 0
src/{pytorch => pytorch-multicpu}/firsttime.sh | 0
src/{pytorch => pytorch-multicpu}/pytorch-multicpu.sh | 0
src/{pytorch => pytorch-multicpu}/torch-multicpu.py | 0
4 files changed, 0 insertions(+), 0 deletions(-)
rename src/{pytorch => pytorch-multicpu}/README.md (100%)
rename src/{pytorch => pytorch-multicpu}/firsttime.sh (100%)
rename src/{pytorch => pytorch-multicpu}/pytorch-multicpu.sh (100%)
rename src/{pytorch => pytorch-multicpu}/torch-multicpu.py (100%)
diff --git a/src/pytorch/README.md b/src/pytorch-multicpu/README.md
similarity index 100%
rename from src/pytorch/README.md
rename to src/pytorch-multicpu/README.md
diff --git a/src/pytorch/firsttime.sh b/src/pytorch-multicpu/firsttime.sh
similarity index 100%
rename from src/pytorch/firsttime.sh
rename to src/pytorch-multicpu/firsttime.sh
diff --git a/src/pytorch/pytorch-multicpu.sh b/src/pytorch-multicpu/pytorch-multicpu.sh
similarity index 100%
rename from src/pytorch/pytorch-multicpu.sh
rename to src/pytorch-multicpu/pytorch-multicpu.sh
diff --git a/src/pytorch/torch-multicpu.py b/src/pytorch-multicpu/torch-multicpu.py
similarity index 100%
rename from src/pytorch/torch-multicpu.py
rename to src/pytorch-multicpu/torch-multicpu.py
From 97ce6ae729f5a15a42b9aa660392ae7c213341f8 Mon Sep 17 00:00:00 2001
From: Serguei Mokhov
Date: Mon, 17 Jun 2024 19:25:20 -0400
Subject: [PATCH 04/10] [examples][pytorch-multicpu] clean up original .txt
---
src/pytorch-multicpu.txt | 134 ---------------------------------------
1 file changed, 134 deletions(-)
delete mode 100644 src/pytorch-multicpu.txt
diff --git a/src/pytorch-multicpu.txt b/src/pytorch-multicpu.txt
deleted file mode 100644
index 4b10bf8..0000000
--- a/src/pytorch-multicpu.txt
+++ /dev/null
@@ -1,134 +0,0 @@
-## Adapted to SPEED from:
-## https://docs.alliancecan.ca/wiki/PyTorch
-
-# Virtual Environment creation, python script at the end of slurm script execution section
-#### Virtual ENV creation. One time only:
-cd /speed-scratch/$USER
-salloc -p ps --mem=20Gb -A Your_group_id(if you have one)
-module load python/3.9.18/default
-setenv TMP /speed-scratch/$USER/tmp
-setenv TMPDIR /speed-scratch/$USER/tmp
-python -m venv $TMPDIR/pytorchcpu
-source $TMPDIR/pytorchcpu/bin/activate.csh
-pip install torch torchvision
-pip install urllib3==1.26.6
-deactivate
-### END of VEnv creation
-
-
-# job script: pytorch-multicpu.sh
-
-#!/encs/bin/tcsh
-#SBATCH --nodes 1
-#SBATCH --tasks-per-node=1
-#SBATCH --cpus-per-task=4 # change this parameter to 2,4,6,... to see the effect on performance
-
-#SBATCH --mem=24G
-#SBATCH --time=0:05:00
-#SBATCH --output=%N-%j.out
-
-source /speed-scratch/$USER/tmp/pytorchcpu/bin/activate.csh
-echo "starting training..."
-
-time python torch-multicpu.py
-######## END of Job Script
-
-### Slurm Script execution
-sbatch -p ps pytorch-multicpu.sh -A Your_group_id(if you have one)
-### End of slurm script
-
-
-
-
-### Python script torch-multicpu.py
-
-import numpy as np
-import time
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-import torch.distributed as distri
-
-import torchvision
-import torchvision.transforms as transforms
-from torchvision.datasets import CIFAR10
-from torch.utils.data import DataLoader
-
-import argparse
-import os
-
-parser = argparse.ArgumentParser(description='cifar10 classification models, cpu performance test')
-parser.add_argument('--lr', default=0.1, help='')
-parser.add_argument('--batch_size', type=int, default=512, help='')
-parser.add_argument('--num_workers', type=int, default=0, help='')
-
-def main():
-
- args = parser.parse_args()
- torch.set_num_threads(int(os.environ['SLURM_CPUS_PER_TASK']))
- class Net(nn.Module):
-
- def __init__(self):
- super(Net, self).__init__()
-
- self.conv1 = nn.Conv2d(3, 6, 5)
- self.pool = nn.MaxPool2d(2, 2)
- self.conv2 = nn.Conv2d(6, 16, 5)
- self.fc1 = nn.Linear(16 * 5 * 5, 120)
- self.fc2 = nn.Linear(120, 84)
- self.fc3 = nn.Linear(84, 10)
-
- def forward(self, x):
- x = self.pool(F.relu(self.conv1(x)))
- x = self.pool(F.relu(self.conv2(x)))
- x = x.view(-1, 16 * 5 * 5)
- x = F.relu(self.fc1(x))
- x = F.relu(self.fc2(x))
- x = self.fc3(x)
- return x
-
- net = Net()
-
- criterion = nn.CrossEntropyLoss()
- optimizer = optim.SGD(net.parameters(), lr=args.lr)
-
- transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
-
- ### This next line will attempt to download the CIFAR10 dataset from the internet if you don't already have it stored in ./data
- ### Run this line on a login node with "download=True" prior to submitting your job, or manually download the data from
- ### https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and place it under ./data
-
- dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)
-
- train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)
-
- perf = []
-
- total_start = time.time()
-
- for batch_idx, (inputs, targets) in enumerate(train_loader):
-
- start = time.time()
-
- outputs = net(inputs)
- loss = criterion(outputs, targets)
-
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
-
- batch_time = time.time() - start
-
- images_per_sec = args.batch_size/batch_time
-
- perf.append(images_per_sec)
-
- total_time = time.time() - total_start
-
-if __name__=='__main__':
- main()
-
-### END of Python script
-
From ebcdd506fbc115b4b2895e5c44c7f7be876a9972 Mon Sep 17 00:00:00 2001
From: Serguei Mokhov
Date: Mon, 17 Jun 2024 19:26:24 -0400
Subject: [PATCH 05/10] [examples][pytorch-multicpu] chmod u+x
---
src/pytorch-multicpu/firsttime.sh | 0
src/pytorch-multicpu/pytorch-multicpu.sh | 0
src/pytorch-multicpu/torch-multicpu.py | 0
3 files changed, 0 insertions(+), 0 deletions(-)
mode change 100644 => 100755 src/pytorch-multicpu/firsttime.sh
mode change 100644 => 100755 src/pytorch-multicpu/pytorch-multicpu.sh
mode change 100644 => 100755 src/pytorch-multicpu/torch-multicpu.py
diff --git a/src/pytorch-multicpu/firsttime.sh b/src/pytorch-multicpu/firsttime.sh
old mode 100644
new mode 100755
diff --git a/src/pytorch-multicpu/pytorch-multicpu.sh b/src/pytorch-multicpu/pytorch-multicpu.sh
old mode 100644
new mode 100755
diff --git a/src/pytorch-multicpu/torch-multicpu.py b/src/pytorch-multicpu/torch-multicpu.py
old mode 100644
new mode 100755
From 0d5d9f97f61e6065a36a04e02e6037fc2eee4297 Mon Sep 17 00:00:00 2001
From: Serguei Mokhov
Date: Mon, 17 Jun 2024 19:32:08 -0400
Subject: [PATCH 06/10] [examples][pytorch-multicpu] add commands to README
---
src/pytorch-multicpu/README.md | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/src/pytorch-multicpu/README.md b/src/pytorch-multicpu/README.md
index 3c8fe17..2db83be 100644
--- a/src/pytorch-multicpu/README.md
+++ b/src/pytorch-multicpu/README.md
@@ -1,4 +1,17 @@
-
-# README
-
-This directory has example scripts to using Pytorch with Python virtual environment to run on CPUs
\ No newline at end of file
+
+# README
+
+This directory has example scripts for using PyTorch with a Python virtual environment to run on multiple CPUs (most universal).
+Adapted to SPEED from https://docs.alliancecan.ca/wiki/PyTorch
+
+```
+(speed-submit)
+cd /speed-scratch/$USER
+salloc -p ps --mem=20G --mail-type=ALL
+(compute node)
+./firsttime.sh
+exit
+(speed-submit)
+sbatch -p ps pytorch-multicpu.sh
+tail -f slurm-JOBID.out
+```
From 24f7cf0790e3ad5d4c0ebf29ccb1d7d79d3b9988 Mon Sep 17 00:00:00 2001
From: Serguei Mokhov
Date: Mon, 17 Jun 2024 20:20:46 -0400
Subject: [PATCH 07/10] [examples][pytorch-multicpu] adjust scripts per tests
---
src/pytorch-multicpu/firsttime.sh | 9 ++++++---
src/pytorch-multicpu/pytorch-multicpu.sh | 7 ++++++-
2 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/src/pytorch-multicpu/firsttime.sh b/src/pytorch-multicpu/firsttime.sh
index 490a3f9..6cae1b8 100755
--- a/src/pytorch-multicpu/firsttime.sh
+++ b/src/pytorch-multicpu/firsttime.sh
@@ -1,7 +1,9 @@
-#### Virtual ENV creation. One time only:
#!/encs/bin/tcsh
+
+#### Virtual ENV creation. One time only.
+#### Run after salloc to a compute node!
+
cd /speed-scratch/$USER
-salloc -p ps --mem=20Gb -A Your_group_id(if you have one)
module load python/3.9.18/default
setenv TMP /speed-scratch/$USER/tmp
setenv TMPDIR /speed-scratch/$USER/tmp
@@ -10,4 +12,5 @@ source $TMPDIR/pytorchcpu/bin/activate.csh
pip install torch torchvision
pip install urllib3==1.26.6
deactivate
-### END of VEnv creation
\ No newline at end of file
+
+#### END of VEnv creation
diff --git a/src/pytorch-multicpu/pytorch-multicpu.sh b/src/pytorch-multicpu/pytorch-multicpu.sh
index 0b71ee5..6246bd0 100755
--- a/src/pytorch-multicpu/pytorch-multicpu.sh
+++ b/src/pytorch-multicpu/pytorch-multicpu.sh
@@ -1,4 +1,5 @@
#!/encs/bin/tcsh
+
#SBATCH --nodes 1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=4 # change this parameter to 2,4,6,... to see the effect on performance
@@ -7,7 +8,11 @@
#SBATCH --time=0:05:00
#SBATCH --output=%N-%j.out
+#SBATCH --mail-type=ALL
+
source /speed-scratch/$USER/tmp/pytorchcpu/bin/activate.csh
echo "starting training..."
-time python torch-multicpu.py
\ No newline at end of file
+time srun python torch-multicpu.py
+
+# EOF
From 6b0934762cb8ea110cd0662e0104654e73a107d5 Mon Sep 17 00:00:00 2001
From: Serguei Mokhov
Date: Mon, 17 Jun 2024 20:21:25 -0400
Subject: [PATCH 08/10] [examples][torch-multicpu] adjust Python script for
training.
This includes a default batch size of 4 for now instead of 512, some printouts,
and set dataset download to True. Also dos2unix it.
---
src/pytorch-multicpu/torch-multicpu.py | 189 +++++++++++++------------
1 file changed, 98 insertions(+), 91 deletions(-)
diff --git a/src/pytorch-multicpu/torch-multicpu.py b/src/pytorch-multicpu/torch-multicpu.py
index 27721f3..87c52d7 100755
--- a/src/pytorch-multicpu/torch-multicpu.py
+++ b/src/pytorch-multicpu/torch-multicpu.py
@@ -1,91 +1,98 @@
-### Python script torch-multicpu.py
-
-import numpy as np
-import time
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-import torch.distributed as distri
-
-import torchvision
-import torchvision.transforms as transforms
-from torchvision.datasets import CIFAR10
-from torch.utils.data import DataLoader
-
-import argparse
-import os
-
-parser = argparse.ArgumentParser(description='cifar10 classification models, cpu performance test')
-parser.add_argument('--lr', default=0.1, help='')
-parser.add_argument('--batch_size', type=int, default=512, help='')
-parser.add_argument('--num_workers', type=int, default=0, help='')
-
-def main():
-
- args = parser.parse_args()
- torch.set_num_threads(int(os.environ['SLURM_CPUS_PER_TASK']))
- class Net(nn.Module):
-
- def __init__(self):
- super(Net, self).__init__()
-
- self.conv1 = nn.Conv2d(3, 6, 5)
- self.pool = nn.MaxPool2d(2, 2)
- self.conv2 = nn.Conv2d(6, 16, 5)
- self.fc1 = nn.Linear(16 * 5 * 5, 120)
- self.fc2 = nn.Linear(120, 84)
- self.fc3 = nn.Linear(84, 10)
-
- def forward(self, x):
- x = self.pool(F.relu(self.conv1(x)))
- x = self.pool(F.relu(self.conv2(x)))
- x = x.view(-1, 16 * 5 * 5)
- x = F.relu(self.fc1(x))
- x = F.relu(self.fc2(x))
- x = self.fc3(x)
- return x
-
- net = Net()
-
- criterion = nn.CrossEntropyLoss()
- optimizer = optim.SGD(net.parameters(), lr=args.lr)
-
- transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
-
- ### This next line will attempt to download the CIFAR10 dataset from the internet if you don't already have it stored in ./data
- ### Run this line on a login node with "download=True" prior to submitting your job, or manually download the data from
- ### https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and place it under ./data
-
- dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)
-
- train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)
-
- perf = []
-
- total_start = time.time()
-
- for batch_idx, (inputs, targets) in enumerate(train_loader):
-
- start = time.time()
-
- outputs = net(inputs)
- loss = criterion(outputs, targets)
-
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
-
- batch_time = time.time() - start
-
- images_per_sec = args.batch_size/batch_time
-
- perf.append(images_per_sec)
-
- total_time = time.time() - total_start
-
-if __name__=='__main__':
- main()
-
-### END of Python script
\ No newline at end of file
+### Python script torch-multicpu.py
+
+import numpy as np
+import time
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.distributed as distri
+
+import torchvision
+import torchvision.transforms as transforms
+from torchvision.datasets import CIFAR10
+from torch.utils.data import DataLoader
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description='cifar10 classification models, cpu performance test')
+# type=float so that a value given on the command line is parsed as a number
+# (without it argparse hands the optimizer a str). Original defaults: batch_size=512, num_workers=0.
+parser.add_argument('--lr', type=float, default=0.1, help='learning rate passed to the SGD optimizer')
+parser.add_argument('--batch_size', type=int, default=4, help='batch size passed to the DataLoader')
+parser.add_argument('--num_workers', type=int, default=4, help='num_workers passed to the DataLoader')
+
+def main():
+    """Train a small CNN on CIFAR10 for one pass and print timing statistics."""
+    args = parser.parse_args()
+    # Outside a Slurm job (e.g. a dry run on a login node) this variable is
+    # unset; fall back to one thread instead of dying with a KeyError.
+    num_cpus = int(os.environ.get('SLURM_CPUS_PER_TASK', 1))
+    torch.set_num_threads(num_cpus)
+
+    class Net(nn.Module):
+        """Two conv/max-pool stages followed by three linear layers."""
+
+        def __init__(self):
+            super(Net, self).__init__()
+            self.conv1 = nn.Conv2d(3, 6, 5)
+            self.pool = nn.MaxPool2d(2, 2)
+            self.conv2 = nn.Conv2d(6, 16, 5)
+            self.fc1 = nn.Linear(16 * 5 * 5, 120)
+            self.fc2 = nn.Linear(120, 84)
+            self.fc3 = nn.Linear(84, 10)
+
+        def forward(self, x):
+            x = self.pool(F.relu(self.conv1(x)))
+            x = self.pool(F.relu(self.conv2(x)))
+            # Flatten each sample to 16 * 5 * 5 features for the linear layers.
+            x = x.view(-1, 16 * 5 * 5)
+            x = F.relu(self.fc1(x))
+            x = F.relu(self.fc2(x))
+            x = self.fc3(x)
+            return x
+
+    net = Net()
+
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.SGD(net.parameters(), lr=args.lr)
+
+    transform_train = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+    ### This next line will attempt to download the CIFAR10 dataset from the internet if you don't already have it stored in ./data
+    ### If the download cannot run inside your job, run this line once beforehand, or manually download the data from
+    ### https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and place it under ./data
+
+    dataset_train = CIFAR10(root='./data', train=True, download=True, transform=transform_train)
+    train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)
+
+    perf = []  # images per second, one entry per batch
+    total_start = time.time()
+
+    for inputs, targets in train_loader:
+        start = time.time()
+
+        outputs = net(inputs)
+        loss = criterion(outputs, targets)
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        batch_time = time.time() - start
+        # The final batch can be smaller than args.batch_size, so count the
+        # images actually processed rather than the nominal batch size.
+        perf.append(len(inputs) / batch_time)
+
+    total_time = time.time() - total_start
+
+    print("Total time: ", total_time)
+    print("Perf : ", perf)
+    print("CPUs : ", num_cpus)
+
+if __name__=='__main__':
+    main()
+
+### END of Python script
From e5b0cfba969c1d43dacb0395a0a31c01dfd47d57 Mon Sep 17 00:00:00 2001
From: Serguei Mokhov
Date: Mon, 17 Jun 2024 20:22:50 -0400
Subject: [PATCH 09/10] [examples][pytorch-multicpu][README] adjust a bit with
timing
---
src/pytorch-multicpu/README.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/pytorch-multicpu/README.md b/src/pytorch-multicpu/README.md
index 2db83be..fa5d80b 100644
--- a/src/pytorch-multicpu/README.md
+++ b/src/pytorch-multicpu/README.md
@@ -9,7 +9,8 @@ Adapted to SPEED from https://docs.alliancecan.ca/wiki/PyTorch
cd /speed-scratch/$USER
salloc -p ps --mem=20G --mail-type=ALL
(compute node)
-./firsttime.sh
+time ./firsttime.sh
+(takes about 5 mins)
exit
(speed-submit)
sbatch -p ps pytorch-multicpu.sh
From c81b2d45c29809e8fde5918a1ba3fec01f009fea Mon Sep 17 00:00:00 2001
From: Serguei Mokhov
Date: Mon, 17 Jun 2024 20:26:30 -0400
Subject: [PATCH 10/10] [examples][pytorch-multicpu][README] update out filename
---
src/pytorch-multicpu/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/pytorch-multicpu/README.md b/src/pytorch-multicpu/README.md
index fa5d80b..83d3b94 100644
--- a/src/pytorch-multicpu/README.md
+++ b/src/pytorch-multicpu/README.md
@@ -14,5 +14,5 @@ time ./firsttime.sh
exit
(speed-submit)
sbatch -p ps pytorch-multicpu.sh
-tail -f slurm-JOBID.out
+tail -f EXECUTIONHOST-JOBID.out
```