Merge pull request #51 from NAG-DevOps/python-example
Refactor pytorch-multicpu to be use-ready in a subdirectory
smokhov authored Jun 18, 2024
2 parents 6ac4580 + c81b2d4 commit 3cadb16
Showing 4 changed files with 62 additions and 46 deletions.
18 changes: 18 additions & 0 deletions src/pytorch-multicpu/README.md
@@ -0,0 +1,18 @@
<!-- TOC --><a name="README"></a>
# README

This directory contains example scripts for using PyTorch with a Python virtual environment to run on multiple CPUs (the most universal setup).
Adapted to SPEED from https://docs.alliancecan.ca/wiki/PyTorch

```
(speed-submit)
cd /speed-scratch/$USER
salloc -p ps --mem=20G --mail-type=ALL
(compute node)
time ./firsttime.sh
(takes about 5 mins)
exit
(speed-submit)
sbatch -p ps pytorch-multicpu.sh
tail -f EXECUTIONHOST-JOBID.out
```
16 changes: 16 additions & 0 deletions src/pytorch-multicpu/firsttime.sh
@@ -0,0 +1,16 @@
#!/encs/bin/tcsh

#### Virtual ENV creation. One time only.
#### Run after salloc to a compute node!

cd /speed-scratch/$USER
module load python/3.9.18/default
setenv TMP /speed-scratch/$USER/tmp
setenv TMPDIR /speed-scratch/$USER/tmp
python -m venv $TMPDIR/pytorchcpu
source $TMPDIR/pytorchcpu/bin/activate.csh
pip install torch torchvision
pip install urllib3==1.26.6
deactivate

#### END of VEnv creation
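
A quick sanity check after the build (a minimal sketch, not part of this PR; assumes the venv created above is still activated on the compute node):

```
# sanity_check.py -- hypothetical helper; run inside the activated pytorchcpu venv
import torch
import torchvision

print(torch.__version__)        # confirms torch installed and imports cleanly
print(torchvision.__version__)  # confirms torchvision installed
print(torch.get_num_threads())  # intra-op CPU threads torch will use by default
```
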
18 changes: 18 additions & 0 deletions src/pytorch-multicpu/pytorch-multicpu.sh
@@ -0,0 +1,18 @@
#!/encs/bin/tcsh

#SBATCH --nodes 1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=4 # change this parameter to 2,4,6,... to see the effect on performance

#SBATCH --mem=24G
#SBATCH --time=0:05:00
#SBATCH --output=%N-%j.out

#SBATCH --mail-type=ALL

source /speed-scratch/$USER/tmp/pytorchcpu/bin/activate.csh
echo "starting training..."

time srun python torch-multicpu.py

# EOF
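
The `--cpus-per-task` comment above invites experimentation, and the Python script reads `SLURM_CPUS_PER_TASK` (see the `print` near the end of torch-multicpu.py below). How the script maps that allocation onto PyTorch's thread pool is not fully visible in this diff; a minimal illustrative sketch, assuming the usual pattern, would be:

```
# illustrative sketch -- assumes torch threads are pinned to the Slurm allocation
import os
import torch

ncpus = int(os.environ.get('SLURM_CPUS_PER_TASK', '1'))
torch.set_num_threads(ncpus)  # intra-op parallelism for matmuls, convolutions, etc.
```

With that in place, resubmitting with `--cpus-per-task=2,4,6,...` should produce the timing differences the comment refers to.
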
56 changes: 10 additions & 46 deletions src/pytorch-multicpu.txt → src/pytorch-multicpu/torch-multicpu.py
100644 → 100755
@@ -1,45 +1,3 @@
## Adapted to SPEED from:
## https://docs.alliancecan.ca/wiki/PyTorch

# Virtual Environment creation, python script at the end of slurm script execution section
#### Virtual ENV creation. One time only:
cd /speed-scratch/$USER
salloc -p ps --mem=20Gb -A Your_group_id(if you have one)
module load python/3.9.18/default
setenv TMP /speed-scratch/$USER/tmp
setenv TMPDIR /speed-scratch/$USER/tmp
python -m venv $TMPDIR/pytorchcpu
source $TMPDIR/pytorchcpu/bin/activate.csh
pip install torch torchvision
pip install urllib3==1.26.6
deactivate
### END of VEnv creation


# job script: pytorch-multicpu.sh

#!/encs/bin/tcsh
#SBATCH --nodes 1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=4 # change this parameter to 2,4,6,... to see the effect on performance

#SBATCH --mem=24G
#SBATCH --time=0:05:00
#SBATCH --output=%N-%j.out

source /speed-scratch/$USER/tmp/pytorchcpu/bin/activate.csh
echo "starting training..."

time python torch-multicpu.py
######## END of Job Script

### Slurm Script execution
sbatch -p ps pytorch-multicpu.sh -A Your_group_id(if you have one)
### End of slurm script




### Python script torch-multicpu.py

import numpy as np
@@ -61,8 +19,10 @@

parser = argparse.ArgumentParser(description='cifar10 classification models, cpu performance test')
parser.add_argument('--lr', default=0.1, help='')
parser.add_argument('--batch_size', type=int, default=512, help='')
parser.add_argument('--num_workers', type=int, default=0, help='')
#parser.add_argument('--batch_size', type=int, default=512, help='')
parser.add_argument('--batch_size', type=int, default=4, help='')
#parser.add_argument('--num_workers', type=int, default=0, help='')
parser.add_argument('--num_workers', type=int, default=4, help='')

def main():

@@ -100,7 +60,8 @@ def forward(self, x):
### Run this line on a login node with "download=True" prior to submitting your job, or manually download the data from
### https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and place it under ./data

dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)
#dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)
dataset_train = CIFAR10(root='./data', train=True, download=True, transform=transform_train)

train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)

@@ -127,8 +88,11 @@ def forward(self, x):

total_time = time.time() - total_start

print("Total time: ", total_time)
print("Perf : ", perf)
print("CPUs : ", os.environ['SLURM_CPUS_PER_TASK'])

if __name__=='__main__':
main()

### END of Python script
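
Note on the `download=True` flip above: the script's own comments recommend downloading CIFAR-10 once on a login node, since compute nodes may lack internet access. A one-time pre-download could look like this sketch (hypothetical; run from the job's working directory so the data lands under ./data, after which jobs can switch back to download=False):

```
# predownload.py -- hypothetical one-time helper for a login node
from torchvision.datasets import CIFAR10

CIFAR10(root='./data', train=True, download=True)   # training split
CIFAR10(root='./data', train=False, download=True)  # test split
```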
