From 5e0be10c9b893cae8844b707219e334625e6ea62 Mon Sep 17 00:00:00 2001 From: Farah <49493059+salhanyf@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:43:39 -0500 Subject: [PATCH 1/9] Intro section updated --- doc/01-introduction/introduction-main.tex | 0 doc/speed-manual.tex | 280 +++------------------- 2 files changed, 27 insertions(+), 253 deletions(-) create mode 100644 doc/01-introduction/introduction-main.tex diff --git a/doc/01-introduction/introduction-main.tex b/doc/01-introduction/introduction-main.tex new file mode 100644 index 0000000..e69de29 diff --git a/doc/speed-manual.tex b/doc/speed-manual.tex index ca8a243..a359bcb 100644 --- a/doc/speed-manual.tex +++ b/doc/speed-manual.tex @@ -11,6 +11,9 @@ % For multicolumn itemized lists \usepackage{multicol} +% For newline, simply insert an empty line +\usepackage{parskip} + % Down to the level of the paragraph (4) \setcounter{secnumdepth}{4} \setcounter{tocdepth}{4} @@ -25,12 +28,11 @@ \input{commands} %% Document -%% +% ------------------------------------------------------------------------------ \begin{document} -% ------------------------------------------------------------------------------ %% Front Matter -%% +% ------------------------------------------------------------------------------ % Regular title as in the article class. % \title{Speed: The GCS ENCS Cluster} @@ -38,7 +40,7 @@ % \titlerunning{} has to be set to either the main title or its shorter % version for the running heads. Use {\sf} for highlighting your system % name, application, or a tool. -% + \titlerunning{Speed: The GCS ENCS Cluster} % Previously VI @@ -48,12 +50,11 @@ %\date{\textbf{Version 7.0-dev-01}} %\date{\textbf{Version 7.0}} %\date{\textbf{Version 7.1}} -\date{\textbf{Version 7.2}} +%\date{\textbf{Version 7.2}} +\date{\textbf{Version 7.3}} % Authors are joined by \and and their affiliations are on the -% subsequent lines separated by \\ just like the article class -% allows. -% +% subsequent lines separated by \\ just like the article class allows. \author{ Serguei A. Mokhov \and @@ -81,17 +82,20 @@ \indexedauthor{Salhany, Farah} \indexedauthor{NAG/HPC} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \maketitle -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - +%% +% ------------------------------------------------------------------------------ +% Abstract % ------------------------------------------------------------------------------ \begin{abstract} -This document serves as a quick start guide to using the Gina Cody School of Engineering and Computer Science (GCS ENCS) -compute server farm, known as ``Speed.'' Managed by the HPC/NAG group of the -Academic Information Technology Services (AITS) at GCS, Concordia University, Montreal, Canada. +This document serves as a quick start guide to using the Gina Cody School of +Engineering and Computer Science (GCS ENCS) compute server farm, known as ``Speed.'' +Managed by the HPC/NAG group of the Academic Information Technology Services (AITS) at GCS, +Concordia University, Montreal, Canada. 
\end{abstract} +% ------------------------------------------------------------------------------ +% Table of Contents % ------------------------------------------------------------------------------ \tableofcontents \clearpage @@ -101,245 +105,15 @@ % ------------------------------------------------------------------------------ \section{Introduction} \label{sect:introduction} - -This document contains basic information required to use ``Speed'', along with tips, -tricks, examples, and references to projects and papers that have used Speed. -User contributions of sample jobs and/or references are welcome.\\ - -\noindent -\textbf{Note:} On October 20, 2023, we completed the migration to SLURM -from Grid Engine (UGE/AGE) as our job scheduler. -This manual has been updated to use SLURM's syntax and commands. -If you are a long-time GE user, refer to \xa{appdx:uge-to-slurm} for key highlights needed to -translate your GE jobs to SLURM as well as environment changes. -These changes are also elaborated throughout this document and our examples. - -% ------------------------------------------------------------------------------ -\subsection{Citing Us} -\label{sect:citing-speed-hpc} - -If you wish to cite this work in your acknowledgements, you can use our general DOI found on our GitHub page -\url{https://dx.doi.org/10.5281/zenodo.5683642} or a specific version of the manual and scripts from that link individually. -You can also use the ``cite this repository'' feature of GitHub. - -% ----------------------------- 1.1 Resources ---------------------------------- -% ------------------------------------------------------------------------------ -\subsection{Resources} -\label{sect:resources} - -\begin{itemize} - \item - Public GitHub page where the manual and sample job scripts are maintained at\\ - \url{https://github.com/NAG-DevOps/speed-hpc} - \begin{itemize} - \item Pull requests (PRs) are subject to review and are welcome:\\ - \url{https://github.com/NAG-DevOps/speed-hpc/pulls} - \end{itemize} - - \item - Speed Manual: - \begin{itemize} - \item PDF version of the manual:\\ - \url{https://github.com/NAG-DevOps/speed-hpc/blob/master/doc/speed-manual.pdf} - - \item HTML version of the manual:\\ - \url{https://nag-devops.github.io/speed-hpc/} - \end{itemize} - - \item - Concordia official page for ``Speed'' cluster, which includes access request instructions. - \url{https://www.concordia.ca/ginacody/aits/speed.html} - - \item - All Speed users are subscribed to the \texttt{hpc-ml} mailing list. - -\end{itemize} - -% TODO: for now comment out for 7.0; if when we update that -% preso, we will re-link it here. However, keep the citation. -\nocite{speed-intro-preso} -%\item -%\href -% {https://docs.google.com/presentation/d/1zu4OQBU7mbj0e34Wr3ILXLPWomkhBgqGZ8j8xYrLf44} -% {Speed Server Farm Presentation 2022}~\cite{speed-intro-preso}. 
- -% ----------------------------- 1.2 Team --------------------------------------- -% ------------------------------------------------------------------------------ -\subsection{Team} -\label{sect:speed-team} - -Speed is supported by: -\begin{itemize} - \item - Serguei Mokhov, PhD, Manager, Networks, Security and HPC, AITS - \item - Gillian Roper, Senior Systems Administrator, HPC, AITS - \item - Carlos Alarcón Meza, Systems Administrator, HPC and Networking, AITS - \item - Farah Salhany, IT Instructional Specialist, AITS -\end{itemize} - -\noindent We receive support from the rest of AITS teams, such as NAG, SAG, FIS, and DOG.\\ -\url{https://www.concordia.ca/ginacody/aits.html} - - -% ----------------------------- 1.3 What Speed Consists of --------------------- -% ------------------------------------------------------------------------------ -\subsection{What Speed Consists of} -\label{sect:speed-arch} - -\begin{itemize} - \item - Twenty four (24) 32-core compute nodes, each with 512~GB of memory and - approximately 1~TB of local volatile-scratch disk space (pictured in \xf{fig:speed-pics}). - - \item - Twelve (12) NVIDIA Tesla P6 GPUs, with 16~GB of GPU memory (compatible with the - CUDA, OpenGL, OpenCL, and Vulkan APIs). - - \item - 4 VIDPRO nodes (ECE. Dr.~Amer), with 6 P6 cards, and 6 V100 cards (32GB), and - 256GB of RAM. - - \item - 7 new SPEED2 servers with 256 CPU cores each 4x~A100 80~GB GPUs, partitioned - into 4x~20GB MIGs each; larger local storage for TMPDIR (see \xf{fig:speed-architecture-full}). - - \item - One AMD FirePro S7150 GPU, with 8~GB of memory (compatible with the - Direct~X, OpenGL, OpenCL, and Vulkan APIs). - - \item - Salus compute node (CSSE CLAC, Drs.~Bergler and Kosseim), 56 cores and 728GB of RAM, see \xf{fig:speed-architecture-full}. - - \item - Magic subcluster partition (ECE, Dr.~Khendek, 11 nodes, see \xf{fig:speed-architecture-full}). - - \item - Nebular subcluster partition (CIISE, Drs.~Yan, Assi, Ghafouri, et al., Nebulae GPU node with 2x RTX 6000 Ada 48GB cards, - Stellar compute node, and Matrix 177TB storage/compute node, see \xf{fig:speed-architecture-full}). -\end{itemize} - -\begin{figure}[htpb] - \centering - \includegraphics[width=\columnwidth]{images/speed-pics} - \caption{Speed} - \label{fig:speed-pics} -\end{figure} - -\begin{figure}[htpb] - \centering - \includegraphics[width=\columnwidth]{images/speed-architecture-full} - \caption{Speed Cluster Hardware Architecture} - \label{fig:speed-architecture-full} -\end{figure} - -\begin{figure}[htpb] - \centering - \includegraphics[width=\columnwidth]{images/slurm-arch} - \caption{Speed SLURM Architecture} - \label{fig:slurm-arch} -\end{figure} - -% ----------------------------- 1.4 What Speed Is Ideal For -------------------- -% ------------------------------------------------------------------------------ -\subsection{What Speed Is Ideal For} -\label{sect:speed-is-for} - -\begin{itemize} - \item - Design, develop, test, and run parallel, batch, and other algorithms and scripts with partial data sets. - ``Speed'' has been optimized for compute jobs that are multi-core aware, - require a large memory space, or are iteration intensive. - - \item - Prepare jobs for large clusters such as: - \begin{itemize} - \item Digital Research Alliance of Canada (Calcul Quebec and Compute Canada) - \item Cloud platforms - \end{itemize} - \item - Jobs that are too demanding for a desktop. - \item - Single-core batch jobs; multithreaded jobs typically up to 32 cores (i.e., a single machine). 
- \item - Multi-node multi-core jobs (MPI). - \item - Anything that can fit into a 500-GB memory space and a \textbf{speed scratch} space of approximately 10~TB. - \item - CPU-based jobs. - \item - CUDA GPU jobs. - \item - Non-CUDA GPU jobs using OpenCL. -\end{itemize} - -% ----------------------------- 1.5 What Speed Is Not -------------------------- -% ------------------------------------------------------------------------------ -\subsection{What Speed Is Not} -\label{sect:speed-is-not} - -\begin{itemize} - \item Speed is not a web host and does not host websites. - \item Speed is not meant for Continuous Integration (CI) automation deployments for Ansible or similar tools. - \item Does not run Kubernetes or other container orchestration software. - \item Does not run Docker. (\textbf{Note:} Speed does run Singularity and many Docker containers can be converted to Singularity - containers with a single command. See \xs{sect:singularity-containers}.) - \item Speed is not for jobs executed outside of the scheduler. (Jobs running outside of the scheduler will be killed and all data lost.) -\end{itemize} - -% ----------------------------- 1.6 Available Software ------------------------- -% ------------------------------------------------------------------------------ -\subsection{Available Software} -\label{sect:available-software} - -There are a wide range of open-source and commercial software available and installed on ``Speed.'' -This includes Abaqus~\cite{abaqus}, AllenNLP, Anaconda, ANSYS, Bazel, -COMSOL, CPLEX, CUDA, Eclipse, Fluent~\cite{fluent}, Gurobi, MATLAB~\cite{matlab,scholarpedia-matlab}, -OMNeT++, OpenCV, OpenFOAM, OpenMPI, OpenPMIx, ParaView, PyTorch, QEMU, R, Rust, and Singularity among others. -Programming environments include various versions of Python, C++/Java compilers, TensorFlow, OpenGL, OpenISS, and {\marf}~\cite{marf}.\\ - -In particular, there are over 2200 programs available in \texttt{/encs/bin} and \texttt{/encs/pkg} under Scientific Linux 7 (EL7). -We are building an equivalent array of programs for the EL9 SPEED2 nodes. To see the packages available, run \texttt{ls -al /encs/pkg/} on \texttt{speed.encs}. -See a complete list in \xa{sect:software-details}.\\ - -\noindent -\textbf{Note:} We do our best to accommodate custom software requests. Python environments can use user-custom installs -from within the scratch directory. - -% ----------------------------- 1.7 Requesting Access -------------------------- -% ------------------------------------------------------------------------------ -\subsection{Requesting Access} -\label{sect:access-requests} - -After reviewing the ``What Speed is'' (\xs{sect:speed-is-for}) and -``What Speed is Not'' (\xs{sect:speed-is-not}), request access to the ``Speed'' -cluster by emailing: \texttt{rt-ex-hpc AT encs.concordia.ca}. - -\begin{itemize} - \item GCS ENCS faculty and staff may request access directly. - \item GCS students must include the following in their request message: - \begin{itemize} - \item GCS ENCS username - \item Name and email (CC) of the approver -- either a supervisor, course instructor, - or a department representative (e.g., in the case of undergraduate or M.Eng.\ students it - can be the Chair, associate chair, a technical officer, or a department administrator) for approval. 
- \item Written request from the - %supervisor or instructor - approver - for the GCS ENCS username to be granted access to ``Speed.'' - \end{itemize} - \item Non-GCS students taking a GCS course will have their GCS ENCS account created automatically, but still need the course instructor's approval to use the service. - \item Non-GCS faculty and students need to get a ``sponsor'' within GCS, so that a guest GCS ENCS account is created first. A sponsor can be any GCS Faculty member - you collaborate with. Failing that, request the approval from our Dean's Office; - via our Associate Deans Drs.~Eddie Hoi Ng or Emad Shihab. - \item External entities collaborating with GCS Concordia researchers should also go through the Dean's Office for approvals. -\end{itemize} - -% The web page is currently less detailed than the above. -%For detailed instructions, refer to the Concordia -%\href{https://www.concordia.ca/ginacody/aits/speed.html}{Computing (HPC) Facility: Speed} webpage. +\input{01-introduction/introduction-main.tex} +% Includes: +% 1.1 Citing US +% 1.2 Resources +% 1.3 Team +% 1.4 What Speed Consists of +% 1.5 What Speed Is Ideal For +% 1.6 What Speed Is Not +% 1.7 Available Software % ------------------------------------------------------------------------------ % 2 Job Management From 0d32b438e5ee7715a10642715154c8c7b51e6164 Mon Sep 17 00:00:00 2001 From: Farah <49493059+salhanyf@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:45:17 -0500 Subject: [PATCH 2/9] intro-main.tex content --- doc/01-introduction/introduction-main.tex | 220 ++++++++++++++++++++++ 1 file changed, 220 insertions(+) diff --git a/doc/01-introduction/introduction-main.tex b/doc/01-introduction/introduction-main.tex index e69de29..c84aa1d 100644 --- a/doc/01-introduction/introduction-main.tex +++ b/doc/01-introduction/introduction-main.tex @@ -0,0 +1,220 @@ +% ------------------------------------------------------------------------------ +% 1 Introduction +% ------------------------------------------------------------------------------ + +This document contains basic information required to use ``Speed'', along with tips, +tricks, examples, and references to projects and papers that have used Speed. +User contributions of sample jobs and/or references are welcome. + +\noindent \textbf{Note:} +On October 20, 2023, we completed the migration to SLURM from Grid Engine (UGE/AGE) as our job scheduler. +This manual has been updated to use SLURM's syntax and commands. +If you are a long-time GE user, refer to \xa{appdx:uge-to-slurm} for key highlights needed to +translate your GE jobs to SLURM as well as environment changes. +These changes are also elaborated throughout this document and our examples. + +% 1.1 Citing US +% ------------------------------------------------------------- +\subsection{Citing Us} +\label{sect:citing-speed-hpc} + +If you wish to cite this work in your acknowledgements, you can use our general DOI found on our GitHub page +\url{https://dx.doi.org/10.5281/zenodo.5683642} or a specific version of the manual and scripts from that link individually. +You can also use the ``cite this repository'' feature of GitHub. 
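+
+\noindent For convenience, a \emph{sketch} of a possible BibTeX entry for this manual is
+shown below. Only the DOI and repository URL are taken from above; the author list,
+title, year, and version fields are placeholders that you should verify against the
+output of GitHub's ``cite this repository'' feature before using the entry.
+\begin{verbatim}
+  @misc{speed-hpc-manual,
+    author       = {Mokhov, Serguei A. and others},   % verify the full author list
+    title        = {Speed: The GCS ENCS Cluster},     % verify the exact title/version
+    howpublished = {\url{https://github.com/NAG-DevOps/speed-hpc}},
+    doi          = {10.5281/zenodo.5683642},
+    year         = {2024}                             % placeholder; use the release year
+  }
+\end{verbatim}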
+
+% 1.2 Resources
+% -------------------------------------------------------------
+\subsection{Resources}
+\label{sect:resources}
+
+\begin{itemize}
+  \item
+  Public GitHub page where the manual and sample job scripts are maintained at\\
+  \url{https://github.com/NAG-DevOps/speed-hpc}
+  \begin{itemize}
+    \item Pull requests (PRs) are subject to review and are welcome:\\
+    \url{https://github.com/NAG-DevOps/speed-hpc/pulls}
+  \end{itemize}
+
+  \item
+  Speed Manual:
+  \begin{itemize}
+    \item PDF version of the manual:
+    \url{https://github.com/NAG-DevOps/speed-hpc/blob/master/doc/speed-manual.pdf}
+    \item HTML version of the manual:
+    \url{https://nag-devops.github.io/speed-hpc/}
+  \end{itemize}
+
+  \item
+  Concordia's official page for the ``Speed'' cluster, which includes access request instructions:\\
+  \url{https://www.concordia.ca/ginacody/aits/speed.html}
+
+  \item
+  All Speed users are subscribed to the \texttt{hpc-ml} mailing list.
+\end{itemize}
+
+% 1.3 Team
+% -------------------------------------------------------------
+\subsection{Team}
+\label{sect:speed-team}
+
+Speed is supported by:
+\begin{itemize}
+  \item
+  Serguei Mokhov, PhD, Manager, Networks, Security and HPC, AITS
+  \item
+  Gillian Roper, Senior Systems Administrator, HPC, AITS
+  \item
+  Carlos Alarcón Meza, Systems Administrator, HPC and Networking, AITS
+  \item
+  Farah Salhany, IT Instructional Specialist, AITS
+\end{itemize}
+
+We receive support from the rest of the AITS teams, such as NAG, SAG, FIS, and DOG.
+
+\url{https://www.concordia.ca/ginacody/aits.html}
+
+% 1.4 What Speed Consists of
+% -------------------------------------------------------------
+\subsection{What Speed Consists of}
+\label{sect:speed-arch}
+
+\begin{itemize}
+  \item Twenty-four (24) 32-core compute nodes, each with 512~GB of memory and
+  approximately 1~TB of local volatile-scratch disk space (pictured in \xf{fig:speed-pics}).
+
+  \item Twelve (12) NVIDIA Tesla P6 GPUs, with 16~GB of GPU memory (compatible with the
+  CUDA, OpenGL, OpenCL, and Vulkan APIs).
+
+  \item Four (4) VIDPRO nodes (ECE, Dr.~Amer), with 6 P6 cards, 6 V100 cards (32GB), and
+  256GB of RAM.
+
+  \item Seven (7) new SPEED2 servers, each with 256 CPU cores and 4x~A100 80~GB GPUs,
+  each partitioned into 4x~20GB MIGs; larger local storage for TMPDIR (see \xf{fig:speed-architecture-full}).
+
+  \item One AMD FirePro S7150 GPU, with 8~GB of memory (compatible with the
+  Direct~X, OpenGL, OpenCL, and Vulkan APIs).
+
+  \item Salus compute node (CSSE CLAC, Drs.~Bergler and Kosseim), 56 cores and 728GB of RAM,
+  see \xf{fig:speed-architecture-full}.
+
+  \item Magic subcluster partition (ECE, Dr.~Khendek, 11 nodes, see \xf{fig:speed-architecture-full}).
+
+  \item Nebular subcluster partition (CIISE, Drs.~Yan, Assi, Ghafouri, et al., Nebulae GPU node with 2x RTX 6000 Ada 48GB cards,
+  Stellar compute node, and Matrix 177TB storage/compute node, see \xf{fig:speed-architecture-full}).
+\end{itemize} + +\begin{figure}[htpb] + \centering + \includegraphics[width=\columnwidth]{images/speed-pics} + \caption{Speed} + \label{fig:speed-pics} +\end{figure} + +\begin{figure}[htpb] + \centering + \includegraphics[width=\columnwidth]{images/speed-architecture-full} + \caption{Speed Cluster Hardware Architecture} + \label{fig:speed-architecture-full} +\end{figure} + +\begin{figure}[htpb] + \centering + \includegraphics[width=\columnwidth]{images/slurm-arch} + \caption{Speed SLURM Architecture} + \label{fig:slurm-arch} +\end{figure} + +% 1.5 What Speed Is Ideal For +% ------------------------------------------------------------- +\subsection{What Speed Is Ideal For} +\label{sect:speed-is-for} + +\begin{itemize} + \item + Design, develop, test, and run parallel, batch, and other algorithms and scripts with partial data sets. + ``Speed'' has been optimized for compute jobs that are multi-core aware, + require a large memory space, or are iteration intensive. + + \item + Prepare jobs for large clusters such as: + \begin{itemize} + \item Digital Research Alliance of Canada (Calcul Quebec and Compute Canada) + \item Cloud platforms + \end{itemize} + \item + Jobs that are too demanding for a desktop. + \item + Single-core batch jobs; multithreaded jobs typically up to 32 cores (i.e., a single machine). + \item + Multi-node multi-core jobs (MPI). + \item + Anything that can fit into a 500-GB memory space and a \textbf{speed scratch} space of approximately 10~TB. + \item + CPU-based jobs. + \item + CUDA GPU jobs. + \item + Non-CUDA GPU jobs using OpenCL. +\end{itemize} + +% 1.6 What Speed Is Not +% ------------------------------------------------------------- +\subsection{What Speed Is Not} +\label{sect:speed-is-not} + +\begin{itemize} + \item Speed is not a web host and does not host websites. + \item Speed is not meant for Continuous Integration (CI) automation deployments for Ansible or similar tools. + \item Does not run Kubernetes or other container orchestration software. + \item Does not run Docker. (\textbf{Note:} Speed does run Singularity and many Docker containers can be converted to + Singularity containers with a single command. See \xs{sect:singularity-containers}.) + \item Speed is not for jobs executed outside of the scheduler. (Jobs running outside of the scheduler will be killed and all data lost.) +\end{itemize} + +% 1.7 Available Software +% ------------------------------------------------------------- +\subsection{Available Software} +\label{sect:available-software} + +There are a wide range of open-source and commercial software available and installed on ``Speed.'' +This includes Abaqus~\cite{abaqus}, AllenNLP, Anaconda, ANSYS, Bazel, +COMSOL, CPLEX, CUDA, Eclipse, Fluent~\cite{fluent}, Gurobi, MATLAB~\cite{matlab,scholarpedia-matlab}, +OMNeT++, OpenCV, OpenFOAM, OpenMPI, OpenPMIx, ParaView, PyTorch, QEMU, R, Rust, and Singularity among others. +Programming environments include various versions of Python, C++/Java compilers, TensorFlow, OpenGL, OpenISS, and {\marf}~\cite{marf}. + +In particular, there are over 2200 programs available in \texttt{/encs/bin} and \texttt{/encs/pkg} under Scientific Linux 7 (EL7). +We are building an equivalent array of programs for the EL9 SPEED2 nodes. To see the packages available, run \texttt{ls -al /encs/pkg/} on \texttt{speed.encs}. +See a complete list in \xa{sect:software-list}. + +\noindent\textbf{Note:} We do our best to accommodate custom software requests. 
+Python environments can use user-custom installs from within scratch directory. + +% 1.8 Requesting Access +% ------------------------------------------------------------------------------ +\subsection{Requesting Access} +\label{sect:access-requests} + +After reviewing the ``What Speed is'' (\xs{sect:speed-is-for}) and +``What Speed is Not'' (\xs{sect:speed-is-not}), request access to the ``Speed'' +cluster by emailing: \texttt{rt-ex-hpc AT encs.concordia.ca}. + +\begin{itemize} + \item GCS ENCS faculty and staff may request access directly. + \item GCS students must include the following in their request message: + \begin{itemize} + \item GCS ENCS username + \item Name and email (CC) of the approver -- either a supervisor, course instructor, + or a department representative (e.g., in the case of undergraduate or M.Eng.\ students it + can be the Chair, associate chair, a technical officer, or a department administrator) for approval. + \item Written request from the approver for the GCS ENCS username to be granted access to ``Speed.'' + \end{itemize} + \item Non-GCS students taking a GCS course will have their GCS ENCS account created automatically, but still need the course instructor's approval to use the service. + \item Non-GCS faculty and students need to get a ``sponsor'' within GCS, so that a guest GCS ENCS account is created first. A sponsor can be any GCS Faculty member + you collaborate with. Failing that, request the approval from our Dean's Office; + via our Associate Deans Drs.~Eddie Hoi Ng or Emad Shihab. + \item External entities collaborating with GCS Concordia researchers should also go through the Dean's Office for approvals. +\end{itemize} + +For detailed instructions, refer to the Concordia +\href{https://www.concordia.ca/ginacody/aits/speed.html}{Computing (HPC) Facility: Speed} webpage. \ No newline at end of file From 0314146c196a891f0a0f19e198de6a64a78653d8 Mon Sep 17 00:00:00 2001 From: Farah <49493059+salhanyf@users.noreply.github.com> Date: Wed, 18 Dec 2024 15:47:02 -0500 Subject: [PATCH 3/9] Conclusion section updated --- doc/03-conclusion/conclusion-main.tex | 123 ++++++++++++++++++++++ doc/speed-manual.tex | 141 ++------------------------ 2 files changed, 131 insertions(+), 133 deletions(-) create mode 100644 doc/03-conclusion/conclusion-main.tex diff --git a/doc/03-conclusion/conclusion-main.tex b/doc/03-conclusion/conclusion-main.tex new file mode 100644 index 0000000..1431062 --- /dev/null +++ b/doc/03-conclusion/conclusion-main.tex @@ -0,0 +1,123 @@ +% ------------------------------------------------------------------------------ +% 3 Conclusion +% ------------------------------------------------------------------------------ + +The cluster operates on a ``first-come, first-served'' basis until it reaches full capacity. +After that, job positions in the queue are determined based on past usage. +The scheduler does attempt to fill gaps, so occasionally, a single-core job with lower priority +may be scheduled before a multi-core job with higher priority. + +% 3.1 Important Limitations +% ------------------------------------------------------------- +\subsection{Important Limitations} +\label{sect:limitations} + +While Speed is a powerful tool, it is essential to recognize its limitations to use it effectively: + +\begin{itemize} + \item New users are limited to a total of 32 cores and 4 GPUs. If you need more cores temporarily, + please contact \texttt{rt-ex-hpc AT encs.concordia.ca}. + + \item Batch job sessions can run for a maximum of one week. 
  Interactive jobs are limited to 24 hours; see \xs{sect:interactive-jobs}.
+
+  \item Scripts can live in your NFS-provided home directory, but substantial data should be stored in
+  your cluster-specific directory (located at \verb+/speed-scratch//+).
+  NFS is suitable for short-term activities but not for long-term operations.
+  Data that a job will read multiple times should be copied at the start to the scratch disk of a compute node using
+  \api{\$TMPDIR} (and possibly \api{\$SLURM\_SUBMIT\_DIR}).
+  Intermediate job data should be produced in \api{\$TMPDIR}, and once a job is near completion,
+  these data should be copied to your NFS-mounted home directory (or other NFS-mounted space).
+  \textbf{In other words}, IO-intensive operations should be performed locally whenever possible,
+  reserving network activity for the start and end of jobs.
+
+  \item Your current resource allocation is based on past usage, which considers approximately
+  one week's worth of past wall clock time (time spent on the node(s)) and compute activity (on the node(s)).
+
+  \item Jobs must always be run within the scheduler's system. Repeat offenders who
+  run jobs outside the scheduler risk losing cluster access.
+\end{itemize}
+
+% 3.2 Tips/Tricks
+% -------------------------------------------------------------
+\subsection{Tips/Tricks}
+\label{sect:tips}
+
+\begin{itemize}
+  \item
+  Ensure that files and scripts have Linux line breaks.
+  Use the \tool{file} command to verify and \tool{dos2unix} to convert if necessary.
+
+  \item
+  Use \tool{rsync} (preferred over \tool{scp}) for copying or moving large amounts of data.
+
+  \item
+  Before transferring a large number of files between NFS-mounted storage and
+  the cluster, compress the files into a \tool{tar} archive.
+
+  \item
+  If you plan to use a different shell (e.g., \tool{bash}~\cite{aosa-book-vol1-bash}),
+  change the shell declaration at the beginning of your script(s).
+
+  \item
+  Request resources (cores, memory, GPUs) that closely match the actual needs of your job.
+  Requesting significantly more than necessary can make your job harder to schedule when
+  resources are limited. Always check the efficiency of your job with \tool{seff}
+  and/or the \option{--mail-type=ALL} option, and adjust your job parameters accordingly.
+
+  \item
+  For any concerns or questions, email \texttt{rt-ex-hpc AT encs.concordia.ca}.
+\end{itemize}
+
+% 3.3 Use Cases
+% -------------------------------------------------------------
+\subsection{Use Cases}
+\label{sect:cases}
+
+\begin{itemize}
+  \item HPC Committee's initial batch of about 6 students (end of 2019):
+  \begin{itemize}
+    \item A 10000-iteration job in Fluent finished in $<26$ hours vs.
46 hours in Calcul Quebec + \end{itemize} + + \item NAG's MAC spoofer analyzer~\cite{mac-spoofer-analyzer-intro-c3s2e2014,mac-spoofer-analyzer-detail-fps2014}, + such as \url{https://github.com/smokhov/atsm/tree/master/examples/flucid} + \begin{itemize} + \item compilation of forensic computing reasoning cases about false or true positives of hardware address spoofing in the labs + \end{itemize} + + \item S4 LAB/GIPSY R\&D Group's: + \begin{itemize} + \item MARFCAT and MARFPCAT (OSS signal processing and machine learning tools for + vulnerable and weak code analysis and network packet capture + analysis)~\cite{marfcat-nlp-ai2014,marfcat-sate2010-nist,fingerprinting-mal-traffic} + \item Web service data conversion and analysis + \item {\flucid} encoders (translation of large log data into {\flucid}~\cite{mokhov-phd-thesis-2013} for forensic analysis) + \item Genomic alignment exercises + \end{itemize} + + \item \textbf{Best Paper award}, \bibentry{job-failure-prediction-compsysarch2024} + + % RT521027 + \item \bibentry{unsteady-wake-ouedraogo_essel_2023} + \item \bibentry{effects-reynolds-ouedraogo_essel_2024} + \item \bibentry{nozzle-effects-APS_2024} + \item \bibentry{effects-reynolds-APS-ouedraogo_essel_2024} + \item \bibentry{oi-containers-poster-siggraph2023} + \item \bibentry{Gopal2024Sep} + \item \bibentry{Gopal2023Mob} + % the next one is not visible (it produces an error) + %\item \bibentry{roof-mounted-vawt-2023} + \item \bibentry{root-mounted-vawt-corner-2023} + \item \bibentry{cfd-modeling-turbine-2023} + \item \bibentry{small-vaxis-turbine-corner-2022} + \item \bibentry{cfd-vaxis-turbine-wake-2022} + \item \bibentry{numerical-turbulence-vawt-2021} + \item \bibentry{niksirat2020} + \item The work ``\bibentry{lai-haotao-mcthesis19}'' using TensorFlow and Keras on OpenISS adjusted to run on + Speed based on the repositories, and theirs forks by the team: + \begin{itemize} + \item \bibentry{openiss-reid-tfk} and + \item \bibentry{openiss-yolov3} + \end{itemize} +\end{itemize} \ No newline at end of file diff --git a/doc/speed-manual.tex b/doc/speed-manual.tex index a359bcb..6567d55 100644 --- a/doc/speed-manual.tex +++ b/doc/speed-manual.tex @@ -560,145 +560,20 @@ \subsubsection{Python} % ------------------------------------------------------------------------------ \section{Conclusion} \label{sect:conclusion} +\input{03-conclusion/conclusion-main.tex} -The cluster operates on a ``first-come, first-served'' basis until it reaches full capacity. -After that, job positions in the queue are determined based on past usage. -The scheduler does attempt to fill gaps, so occasionally, a single-core job with lower priority -may be scheduled before a multi-core job with higher priority. - -% -------------- 3.1 Important Limitations -------------------- -% ------------------------------------------------------------- -\subsection{Important Limitations} -\label{sect:limitations} - -While Speed is a powerful tool, it is essential to recognize its limitations to use it effectively: - -\begin{itemize} - \item - New users are limited to a total of 32 cores and 4 GPUs. If you need more cores temporarily, - %(up to 192 cores or six jobs of 32 cores each), - please contact \texttt{rt-ex-hpc AT encs.concordia.ca}. - - \item - Batch job sessions can run for a maximum of one week. - Interactive jobs are limited to 24 hours see \xs{sect:interactive-jobs}. 
- - \item - Scripts can live in your NFS-provided home directory, but substantial data - should be stored in your cluster-specific directory (located at \verb+/speed-scratch//+). - - NFS is suitable for short-term activities but not for long-term operations. - \textbf{Data that a job will read multiple times} should be copied at the start to the scratch disk of a compute node using - \api{\$TMPDIR} (and possibly \api{\$SLURM\_SUBMIT\_DIR}). - Intermediate job data should be produced in \api{\$TMPDIR}, and once a job is near completion, - these data should be copied to your NFS-mounted home directory (or other NFS-mounted space). - \textbf{In other words, IO-intensive operations should be performed locally whenever possible, - reserving network activity for the start and end of jobs.} - - \item - Your current resource allocation is based on past usage, - which considers approximately one week's worth of past wall clock time - (time spent on the node(s)) and compute activity (on the node(s)). - - \item - Jobs must always be run within the scheduler's system. Repeat offenders who - run jobs outside the scheduler risk losing cluster access. -\end{itemize} - -% -------------- 3.2 Tips/Tricks ------------------------------ -% ------------------------------------------------------------- -\subsection{Tips/Tricks} -\label{sect:tips} - -\begin{itemize} - \item - Ensure that files and scripts have Linux line breaks. - Use the \tool{file} command to verify and \tool{dos2unix} to convert if necessary. - - \item - Use \tool{rsync} (preferred over \tool{scp}) for copying or moving large amounts of data. - - \item - Before transferring a large number of files between NFS-mounted storage and - the cluster, compress the files into a \tool{tar} archive. - - \item - If you plan to use a different shell (e.g., \tool{bash}~\cite{aosa-book-vol1-bash}), - change the shell declaration at the beginning of your script(s). - - \item - Request resources (cores, memory, GPUs) that closely match the actual needs of your job. - Requesting significantly more than necessary can make your job harder to schedule when - resources are limited. Always check the efficiency of your job with either \tool{seff} - and/or the \option{--mail-type=ALL}, to adjust your job parameters. - - \item - For any concerns or questions, email \texttt{rt-ex-hpc AT encs.concordia.ca} -\end{itemize} - -% -------------- 3.3 Use Cases -------------------------------- -% ------------------------------------------------------------- -\subsection{Use Cases} -\label{sect:cases} - -\begin{itemize} - \item - HPC Committee's initial batch about 6 students (end of 2019): - \begin{itemize} - \item 10000 iterations job in Fluent finished in $<26$ hours vs. 
46 hours in Calcul Quebec - \end{itemize} - - \item - NAG's MAC spoofer analyzer~\cite{mac-spoofer-analyzer-intro-c3s2e2014,mac-spoofer-analyzer-detail-fps2014}, - such as \url{https://github.com/smokhov/atsm/tree/master/examples/flucid} - \begin{itemize} - \item compilation of forensic computing reasoning cases about false or true positives of hardware address spoofing in the labs - \end{itemize} +% Includes: +% 3.1 Important Limitations +% 3.2 Tips/Tricks +% 3.3 Use Cases - \item - S4 LAB/GIPSY R\&D Group's: - \begin{itemize} - \item MARFCAT and MARFPCAT (OSS signal processing and machine learning tools for - vulnerable and weak code analysis and network packet capture - analysis)~\cite{marfcat-nlp-ai2014,marfcat-sate2010-nist,fingerprinting-mal-traffic} - \item Web service data conversion and analysis - \item {\flucid} encoders (translation of large log data into {\flucid}~\cite{mokhov-phd-thesis-2013} for forensic analysis) - \item Genomic alignment exercises - \end{itemize} - - \item \textbf{Best Paper award}, \bibentry{job-failure-prediction-compsysarch2024} - - % RT521027 - \item \bibentry{unsteady-wake-ouedraogo_essel_2023} - \item \bibentry{effects-reynolds-ouedraogo_essel_2024} - \item \bibentry{nozzle-effects-APS_2024} - \item \bibentry{effects-reynolds-APS-ouedraogo_essel_2024} - - \item \bibentry{oi-containers-poster-siggraph2023} - - \item \bibentry{Gopal2024Sep} - \item \bibentry{Gopal2023Mob} - % the next one is not visible (it produces an error) - %\item \bibentry{roof-mounted-vawt-2023} - \item \bibentry{root-mounted-vawt-corner-2023} - \item \bibentry{cfd-modeling-turbine-2023} - \item \bibentry{small-vaxis-turbine-corner-2022} - \item \bibentry{cfd-vaxis-turbine-wake-2022} - \item \bibentry{numerical-turbulence-vawt-2021} - \item \bibentry{niksirat2020} - - \item The work ``\bibentry{lai-haotao-mcthesis19}'' using TensorFlow and Keras on OpenISS - adjusted to run on Speed based on the repositories: - \begin{itemize} - \item \bibentry{openiss-reid-tfk} and - \item \bibentry{openiss-yolov3} - \end{itemize} - and theirs forks by the team. 
-\end{itemize} +% ------------------------------------------------------------------------------ +% Appendix % ------------------------------------------------------------------------------ \appendix + % ------------------------------------------------------------------------------ % A History % ------------------------------------------------------------------------------ From dc872dc60e65b314e783f172c7960ae677417df0 Mon Sep 17 00:00:00 2001 From: Farah <49493059+salhanyf@users.noreply.github.com> Date: Wed, 18 Dec 2024 16:04:09 -0500 Subject: [PATCH 4/9] appendix section updated --- doc/04-appendix/faq.tex | 225 +++++++++++++++++++++++ doc/04-appendix/history.tex | 114 ++++++++++++ doc/04-appendix/sister-facilities.tex | 68 +++++++ doc/{ => 04-appendix}/software-list.tex | 32 ++-- doc/speed-manual.tex | 235 ++++-------------------- 5 files changed, 454 insertions(+), 220 deletions(-) create mode 100644 doc/04-appendix/faq.tex create mode 100644 doc/04-appendix/history.tex create mode 100644 doc/04-appendix/sister-facilities.tex rename doc/{ => 04-appendix}/software-list.tex (97%) diff --git a/doc/04-appendix/faq.tex b/doc/04-appendix/faq.tex new file mode 100644 index 0000000..37b1b3a --- /dev/null +++ b/doc/04-appendix/faq.tex @@ -0,0 +1,225 @@ +% ----------------------------------------------------------------------------- +% B Frequently Asked Questions +% ----------------------------------------------------------------------------- + +% B.1 Where do I learn about Linux? +% ------------------------------------------------------------- +\subsection{Where do I learn about Linux?} +\label{sect:faqs-linux} + +All Speed users are expected to have a basic understanding of Linux and its commonly used commands. +Here are some recommended resources: + +\paragraph*{Software Carpentry}: +Software Carpentry provides free resources to learn software, including a workshop on the Unix shell. +Visit \href{https://software-carpentry.org/lessons/}{Software Carpentry Lessons} to learn more. + +\paragraph*{Udemy}: +There are numerous Udemy courses, including free ones, that will help you learn Linux. +Active Concordia faculty, staff and students have access to Udemy courses. +A recommended starting point for beginners is the course ``Linux Mastery: Master the Linux Command Line in 11.5 Hours''. +Visit \href{https://www.concordia.ca/it/services/udemy.html}{Concordia's Udemy page} to learn how Concordians can access Udemy. + +% B.2 How to bash shell on Speed? +% ------------------------------------------------------------- +\subsection{How to use bash shell on Speed?} +\label{sect:faqs-bash} + +This section provides comprehensive instructions on how to utilize the bash shell on the Speed cluster. + +\subsubsection{How do I set bash as my login shell?} +To set your default login shell to bash on Speed, your login shell on all GCS servers must be changed to bash. +To make this change, create a ticket with the Service Desk (or email \texttt{help at concordia.ca}) to +request that bash become your default login shell for your ENCS user account on all GCS servers. + +\subsubsection{How do I move into a bash shell on Speed?} +To move to the bash shell, type \textbf{bash} at the command prompt: +\begin{verbatim} + [speed-submit] [/home/a/a_user] > bash + bash-4.4$ echo $0 + bash +\end{verbatim} +\noindent\textbf{Note} how the command prompt changes from +``\verb![speed-submit] [/home/a/a_user] >!'' to ``\verb!bash-4.4$!'' after entering the bash shell. 
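+
+\noindent To return to your default login shell from bash, simply type \texttt{exit}
+(the prompts below follow the same example session as above):
+\begin{verbatim}
+ bash-4.4$ exit
+ [speed-submit] [/home/a/a_user] >
+\end{verbatim}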
+ +\subsubsection{How do I use the bash shell in an interactive session on Speed?} +Below are examples of how to use \tool{bash} as a shell in your interactive job sessions +with both the \tool{salloc} and \tool{srun} commands. +\begin{itemize} + \item \texttt{salloc -ppt --mem=100G -N 1 -n 10 /encs/bin/bash} + \item \texttt{srun --mem=50G -n 5 --pty /encs/bin/bash} +\end{itemize} +\noindent\textbf{Note:} Make sure the interactive job requests memory, cores, etc. + +\subsubsection{How do I run scripts written in bash on \tool{Speed}?} +To execute bash scripts on Speed: +\begin{enumerate} + \item Ensure that the shebang of your bash job script is \verb+#!/encs/bin/bash+ + \item Use the \tool{sbatch} command to submit your job script to the scheduler. +\end{enumerate} +\noindent Check Speed GitHub for a \href{https://github.com/NAG-DevOps/speed-hpc/blob/master/src/bash.sh}{sample bash job script}. + +% B.3 How to resolve “Disk quota exceeded” errors? +% ------------------------------------------------------------- +\subsection{How to resolve ``Disk quota exceeded'' errors?} +\label{sect:quota-exceeded} + +\subsubsection{Probable Cause} +The ``\texttt{Disk quota exceeded}'' error occurs when your application has +run out of disk space to write to. On \tool{Speed}, this error can be returned when: +\begin{enumerate} + \item The NFS-provided home is full and cannot be written to. + You can verify this using the \tool{quota} and \tool{bigfiles} commands. + \item The ``\texttt{/tmp}'' directory on the speed node where your application is running is full and cannot be written to. +\end{enumerate} + +\subsubsection{Possible Solutions} +\begin{enumerate} + \item Use the \option{--chdir} job script option to set the job working directory. + This is the directory where the job will write output files. + + \item Although local disk space is recommended for IO-intensive operations, the + `\texttt{/tmp}' directory on \tool{Speed} nodes is limited to 1TB, so it may be necessary + to store temporary data elsewhere. Review the documentation for each module + used in your script to determine how to set working directories. + The basic steps are: + \begin{itemize} + \item + Determine how to set working directories for each module used in your job script. + \item + Create a working directory in \tool{speed-scratch} for output files: + \begin{verbatim} + mkdir -m 750 /speed-scratch/$USER/output + \end{verbatim} + \item + Create a subdirectory for recovery files: + \begin{verbatim} + mkdir -m 750 /speed-scratch/$USER/recovery + \end{verbatim} + \item + Update the job script to write output to the directories created in your \tool{speed-scratch} directory, + e.g., \verb!/speed-scratch/$USER/output!. + \end{itemize} +\end{enumerate} +\noindent In the above example, \verb!$USER! is an environment variable containing your ENCS username. + +\subsubsection{Example of setting working directories for \tool{COMSOL}} +\begin{itemize} + \item Create directories for recovery, temporary, and configuration files. + \begin{verbatim} + mkdir -m 750 -p /speed-scratch/$USER/comsol/{recovery,tmp,config} + \end{verbatim} + \item Add the following command switches to the COMSOL command to use the directories created above: + \begin{verbatim} + -recoverydir /speed-scratch/$USER/comsol/recovery + -tmpdir /speed-scratch/$USER/comsol/tmp + -configuration/speed-scratch/$USER/comsol/config + \end{verbatim} +\end{itemize} +\noindent In the above example, \verb!$USER! is an environment variable containing your ENCS username. 
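+
+\noindent As an illustration only, these switches could be combined into a single COMSOL
+batch invocation inside a job script; the input file name and the use of
+\verb!$SLURM_CPUS_PER_TASK! for the core count are placeholders to adapt to your own job:
+\begin{verbatim}
+ comsol batch -np $SLURM_CPUS_PER_TASK -inputfile model.mph \
+   -recoverydir /speed-scratch/$USER/comsol/recovery \
+   -tmpdir /speed-scratch/$USER/comsol/tmp \
+   -configuration /speed-scratch/$USER/comsol/config
+\end{verbatim}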
+ +\subsubsection{Example of setting working directories for \tool{Python Modules}} +By default when adding a Python module, the \texttt{/tmp} directory is set as the temporary repository for files downloads. +The size of the \texttt{/tmp} directory on \verb!speed-submit! is too small for PyTorch. +To add a Python module +\begin{itemize} + \item Create your own tmp directory in your \verb!speed-scratch! directory: + \begin{verbatim} + mkdir /speed-scratch/$USER/tmp + \end{verbatim} + \item Use the temporary directory you created + \begin{verbatim} + setenv TMPDIR /speed-scratch/$USER/tmp + \end{verbatim} + \item Attempt the installation of PyTorch +\end{itemize} +\noindent In the above example, \verb!$USER! is an environment variable containing your ENCS username. + +% B.4 How do I check my job's status? +% ------------------------------------------------------------- +\subsection{How do I check my job's status?} +\label{sect:faq-job-status} + +When a job with a job ID of 1234 is running or terminated, you can track its status using the following commands to check its status: +\begin{itemize} + \item Use the ``sacct'' command to view the status of a job: + \begin{verbatim} + sacct -j 1234 + \end{verbatim} + \item Use the ``squeue'' command to see if the job is sitting in the queue: + \begin{verbatim} + squeue -j 1234 + \end{verbatim} + \item Use the ``sstat'' command to find long-term statistics on the job after it has terminated + and the \tool{slurmctld} has purged it from its tracking state into the database: + \begin{verbatim} + sstat -j 1234 + \end{verbatim} +\end{itemize} + +% B.5 Why is my job pending when nodes are empty? +% ------------------------------------------------------------- +\subsection{Why is my job pending when nodes are empty?} + +\subsubsection{Disabled nodes} +It is possible that one or more of the Speed nodes are disabled for maintenance. 
+To verify if Speed nodes are disabled, check if they are in a draining or drained state: + +\small +\begin{verbatim} +[serguei@speed-submit src] % sinfo --long --Node +Thu Oct 19 21:25:12 2023 +NODELIST NODES PARTITION STATE CPUS S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON +speed-01 1 pa idle 32 2:16:1 257458 0 1 gpu16 none +speed-03 1 pa idle 32 2:16:1 257458 0 1 gpu32 none +speed-05 1 pg idle 32 2:16:1 515490 0 1 gpu16 none +speed-07 1 ps* mixed 32 2:16:1 515490 0 1 cpu32 none +speed-08 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-09 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-10 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-11 1 ps* idle 32 2:16:1 515490 0 1 cpu32 none +speed-12 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-15 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-16 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-17 1 pg drained 32 2:16:1 515490 0 1 gpu16 UGE +speed-19 1 ps* idle 32 2:16:1 515490 0 1 cpu32 none +speed-20 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-21 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-22 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-23 1 ps* idle 32 2:16:1 515490 0 1 cpu32 none +speed-24 1 ps* idle 32 2:16:1 515490 0 1 cpu32 none +speed-25 1 pg idle 32 2:16:1 257458 0 1 gpu32 none +speed-25 1 pa idle 32 2:16:1 257458 0 1 gpu32 none +speed-27 1 pg idle 32 2:16:1 257458 0 1 gpu32 none +speed-27 1 pa idle 32 2:16:1 257458 0 1 gpu32 none +speed-29 1 ps* idle 32 2:16:1 515490 0 1 cpu32 none +speed-30 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-31 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-32 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-33 1 ps* idle 32 2:16:1 515490 0 1 cpu32 none +speed-34 1 ps* idle 32 2:16:1 515490 0 1 cpu32 none +speed-35 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-36 1 ps* drained 32 2:16:1 515490 0 1 cpu32 UGE +speed-37 1 pt idle 256 2:64:2 980275 0 1 gpu20,mi none +speed-38 1 pt idle 256 2:64:2 980275 0 1 gpu20,mi none +speed-39 1 pt idle 256 2:64:2 980275 0 1 gpu20,mi none +speed-40 1 pt idle 256 2:64:2 980275 0 1 gpu20,mi none +speed-41 1 pt idle 256 2:64:2 980275 0 1 gpu20,mi none +speed-42 1 pt idle 256 2:64:2 980275 0 1 gpu20,mi none +speed-43 1 pt idle 256 2:64:2 980275 0 1 gpu20,mi none +\end{verbatim} +\normalsize + +\noindent Note which nodes are in the state of \textbf{drained}. +The reason for the drained state can be found in the \textbf{reason} column. +Your job will run once an occupied node becomes availble or the maintenance is completed, +and the disabled nodes have a state of \textbf{idle}. + +\subsubsection{Error in job submit request.} +It is possible that your job is pending because it requested resources that are not available within Speed. +To verify why job ID 1234 is not running, execute: +\begin{verbatim} + sacct -j 1234 +\end{verbatim} + +\noindent A summary of the reasons can be obtained via the \tool{squeue} command. 
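+
+\noindent For example (illustrative output only; the job ID and values below are made up),
+a pending job's reason appears in the \texttt{NODELIST(REASON)} column of the default
+\tool{squeue} output:
+\begin{verbatim}
+ squeue -j 1234
+   JOBID PARTITION     NAME     USER ST   TIME  NODES NODELIST(REASON)
+    1234        ps    myjob   a_user PD   0:00      1 (Resources)
+\end{verbatim}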
\ No newline at end of file diff --git a/doc/04-appendix/history.tex b/doc/04-appendix/history.tex new file mode 100644 index 0000000..a45a5dd --- /dev/null +++ b/doc/04-appendix/history.tex @@ -0,0 +1,114 @@ +% ----------------------------------------------------------------------------- +% A History +% ----------------------------------------------------------------------------- + +% A.1 Acknowledgments +% ------------------------------------------------------------- +\subsection{Acknowledgments} +\label{sect:acks} + +\begin{itemize} + \item + The first 6 to 6.5 versions of this manual and early UGE job script samples, Singularity testing,and user support + were produced/done by Dr.~Scott Bunnell during his time at Concordia as a part of the NAG/HPC group. + We thank him for his contributions. + \item + The HTML version with devcontainer support was contributed by Anh H Nguyen. + \item + Dr.~Tariq Daradkeh, was our IT Instructional Specialist from August 2022 to September 2023; + working on the scheduler, scheduling research, end user support, and integration of + examples, such as YOLOv3 in \xs{sect:openiss-yolov3} and other tasks. We have a continued + collaboration on HPC/scheduling research (see~\cite{job-failure-prediction-compsysarch2024}). +\end{itemize} + +% A.2 Migration from UGE to SLURM +% ------------------------------------------------------------- +\subsection{Migration from UGE to SLURM} +\label{appdx:uge-to-slurm} + +For long term users who started off with Grid Engine here are some resources +to make a transition and mapping to the job submission process. + +\begin{itemize} +\item +Queues are called ``partitions'' in SLURM. Our mapping from the GE queues to SLURM partitions is as follows: +\begin{verbatim} + GE => SLURM + s.q ps + g.q pg + a.q pa +\end{verbatim} +We also have a new partition \texttt{pt} that covers SPEED2 nodes, which previously did not exist. + +\item +Commands and command options mappings are found in \xf{fig:rosetta-mappings} from:\\ +\url{https://slurm.schedmd.com/rosetta.pdf}\\ +\url{https://slurm.schedmd.com/pdfs/summary.pdf}\\ +Other related helpful resources from similar organizations who either used SLURM for a while or also transitioned to it:\\ +\url{https://docs.alliancecan.ca/wiki/Running_jobs}\\ +\url{https://www.depts.ttu.edu/hpcc/userguides/general_guides/Conversion_Table_1.pdf}\\ +\url{https://docs.mpcdf.mpg.de/doc/computing/clusters/aux/migration-from-sge-to-slurm} + +\begin{figure}[htpb] + \includegraphics[width=\columnwidth]{images/rosetta-mapping} + \caption{Rosetta Mappings of Scheduler Commands from SchedMD} + \label{fig:rosetta-mappings} +\end{figure} + +\item +\textbf{NOTE:} If you have used UGE commands in the past you probably still have these +lines there; \textbf{they should now be removed}, as they have no use in SLURM and +will start giving ``command not found'' errors on login when the software is removed: + +csh/\tool{tcsh}: sample \file{.tcshrc} file: +\begin{verbatim} + # Speed environment set up + if ($HOSTNAME == speed-submit.encs.concordia.ca) then + source /local/pkg/uge-8.6.3/root/default/common/settings.csh + endif +\end{verbatim} + +Bourne shell/\tool{bash}: sample \file{.bashrc} file: +\begin{verbatim} + # Speed environment set up + if [ $HOSTNAME = "speed-submit.encs.concordia.ca" ]; then + . /local/pkg/uge-8.6.3/root/default/common/settings.sh + printenv ORGANIZATION | grep -qw ENCS || . 
/encs/Share/bash/profile + fi +\end{verbatim} + +\textbf{IMPORTANT NOTE:} you will need to either log out and back in, or execute a new shell, +for the environment changes in the updated \file{.tcshrc} or \file{.bashrc} file to be applied. +\end{itemize} + +% A.3 Phases +% ------------------------------------------------------------- +\subsection{Phases} +\label{sect:phases} + +Brief summary of Speed evolution phases: + +\subsubsection{Phase 5} +Phase 5 saw incorporation of the Salus, Magic, and Nebular +subclusters (see \xf{fig:speed-architecture-full}). + +\subsubsection{Phase 4} +Phase 4 had 7 SuperMicro servers with 4x A100 80GB GPUs each added, +dubbed as ``SPEED2''. We also moved from Grid Engine to SLURM. + +\subsubsection{Phase 3} +Phase 3 had 4 vidpro nodes added from Dr.~Amer totalling 6x P6 and 6x V100 +GPUs added. + +\subsubsection{Phase 2} +Phase 2 saw 6x NVIDIA Tesla P6 added and 8x more compute nodes. +The P6s replaced 4x of FirePro S7150. + +\subsubsection{Phase 1} +Phase 1 of Speed was of the following configuration: +\begin{itemize} + \item + Sixteen, 32-core nodes, each with 512~GB of memory and approximately 1~TB of volatile-scratch disk space. + \item + Five AMD FirePro S7150 GPUs, with 8~GB of memory (compatible with the Direct X, OpenGL, OpenCL, and Vulkan APIs). +\end{itemize} \ No newline at end of file diff --git a/doc/04-appendix/sister-facilities.tex b/doc/04-appendix/sister-facilities.tex new file mode 100644 index 0000000..7ea8723 --- /dev/null +++ b/doc/04-appendix/sister-facilities.tex @@ -0,0 +1,68 @@ +% ----------------------------------------------------------------------------- +% C Sister Facilities +% ----------------------------------------------------------------------------- + +Below is a list of resources and facilities similar to Speed at various capacities. +Depending on your research group and needs, they might be available to you. They +are not managed by HPC/NAG of AITS, so contact their respective representatives. + +\begin{itemize} + \item + \texttt{computation.encs} is a CPU-only 3-machine cluster running longer jobs without + a scheduler at the moment. Shares the same EL7 software tree as Speed's EL7 nodes + as well as lab desktops. See \url{https://www.concordia.ca/ginacody/aits/public-servers.html}. + + \item + \texttt{apini.encs} cluster for teaching and MPI programming (see the corresponding + course in CSSE), managed by CSSE. + + \item + Computer Science and Software Engineering (CSSE) Virya GPU Cluster. For CSSE + members only. The cluster has 4 nodes with total of 32 NVIDIA GPUs (a mix of + V100s and A100s). To request access send email to \texttt{virya.help AT concordia.ca}. + This includes an Atlas Analytics partition of Dr.~Mahdi Husseini. + + \item + Dr.~Eugene Belilovsky hightower Exxact, and megatower graphcore clusters. + + \item + Dr.~Maria Amer's VidPro group's nodes in Speed (-01, -03, -25, -27) with additional V100 and P6 GPUs. + + \item + There are various Lambda Labs other GPU servers and like computers + acquired by individual researchers; if you are member of their + research group, contact them directly. These resources are not + managed by us. 
+ + \begin{itemize} + \item Dr.~Amin Hammad's \texttt{construction.encs} Lambda Labs station + \item Dr.~Hassan Rivaz's \texttt{impactlab.encs} Lambda Labs station + \item Dr.~Nizar Bouguila's \texttt{xailab.encs} Lambda Labs station + \item Dr.~Roch Glitho's \texttt{femto.encs} server + \item Dr.~Maria Amer's \texttt{venom.encs} Lambda Labs station + \item Dr.~Leon Wang's \texttt{guerrera.encs} DGX station + \end{itemize} + + \item + Dr.~Ivan Contreras' 4 Operations Research group servers (managed by AITS). + + \item + If you are a member of School of Health (formerly PERFORM Center), + you may have access to their local + \href{https://perform-wiki.concordia.ca/mediawiki/index.php/HPC_Cluster}{PERFORM's High Performance Computing (HPC) Cluster}. + Contact Thomas Beaudry for details and how to obtain access. + + \item + All Concordia students have access to the Library's small + \href{https://library.concordia.ca/technology/sandbox/}{Technology Sandbox} + testing cluster that also runs Slurm. Email \texttt{sean.cooney AT concordia.ca} for details. + + \item + Digital Research Alliance Canada (Compute Canada / Calcul Quebec),\\ + \url{https://alliancecan.ca/}. Follow + \href{https://alliancecan.ca/en/services/advanced-research-computing/account-management/apply-account}{this link} + on the information how to obtain access (students need to be sponsored + by their supervising faculty members, who should create accounts first). + Their SLURM examples are here: \url{https://docs.alliancecan.ca/wiki/Running_jobs} + +\end{itemize} \ No newline at end of file diff --git a/doc/software-list.tex b/doc/04-appendix/software-list.tex similarity index 97% rename from doc/software-list.tex rename to doc/04-appendix/software-list.tex index 889ffff..ef7d265 100644 --- a/doc/software-list.tex +++ b/doc/04-appendix/software-list.tex @@ -1,28 +1,25 @@ % ----------------------------------------------------------------------------- +% D Software List +% ----------------------------------------------------------------------------- % ./generate-software-list.sh -\section{Software Installed On Speed} -\label{sect:software-details} This is a generated section by a script; last updated on \textit{Tue Jul 23 10:48:52 PM EDT 2024}. -We have two major software trees: Scientific Linux 7 (EL7), which is -outgoing, and AlmaLinux 9 (EL9). After major synchronization of software -packages is complete, we will stop maintaining the EL7 tree and -will migrate the remaining nodes to EL9. +We have two major software trees: Scientific Linux 7 (EL7), which is outgoing, +and AlmaLinux 9 (EL9). After major synchronization of software packages is complete, +we will stop maintaining the EL7 tree and will migrate the remaining nodes to EL9. -Use \option{--constraint=el7} to select EL7-only installed nodes for their -software packages. Conversely, use \option{--constraint=el9} for the EL9-only -software. These options would be used as a part of your job parameters -in either \api{\#SBATCH} or on the command line. +Use \option{--constraint=el7} to select EL7-only installed nodes for their software packages. +Conversely, use \option{--constraint=el9} for the EL9-only software. +These options would be used as a part of your job parameters in either \api{\#SBATCH} or on the command line. -\noindent -\textbf{NOTE:} this list does not include packages installed directly on the OS (yet). +\noindent\textbf{NOTE:} this list does not include packages installed directly on the OS (yet). 
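+
+\noindent For example, a minimal job-script sketch (the job name, memory, and time values
+below are placeholders to adjust to your own needs) that restricts scheduling to EL9 nodes
+and lists the software tree visible there could look like:
+\begin{verbatim}
+ #!/encs/bin/bash
+ ## placeholder resource values -- adjust to your job
+ #SBATCH --job-name=el9-check
+ #SBATCH --mem=1G
+ #SBATCH --time=00:05:00
+ ## request EL9-imaged nodes only
+ #SBATCH --constraint=el9
+
+ ## list the packages available on the allocated node
+ ls -al /encs/pkg/
+\end{verbatim}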
-% ----------------------------------------------------------------------------- +% D.1 EL7 +% ------------------------------------------------------------- \subsection{EL7} \label{sect:software-el7} -Not all packages are intended for HPC, but the common tree is available -on Speed as well as teaching labs' desktops. +Not all packages are intended for HPC, but the common tree is available on Speed as well as teaching labs' desktops. \scriptsize \begin{multicols}{3} @@ -986,7 +983,8 @@ \subsection{EL7} \end{multicols} \normalsize -% ----------------------------------------------------------------------------- +% D.2 EL9 +% ------------------------------------------------------------- \subsection{EL9} \label{sect:software-el9} @@ -1184,4 +1182,4 @@ \subsection{EL9} \end{multicols} \normalsize -% EOF +% EOF \ No newline at end of file diff --git a/doc/speed-manual.tex b/doc/speed-manual.tex index 6567d55..50f77e1 100644 --- a/doc/speed-manual.tex +++ b/doc/speed-manual.tex @@ -574,227 +574,55 @@ \section{Conclusion} \appendix -% ------------------------------------------------------------------------------ -% A History -% ------------------------------------------------------------------------------ +% A History +% ------------------------------------------------------------- \section{History} +\label{sect:history} +\input{04-appendix/history.tex} +% Includes: % A.1 Acknowledgments -% ------------------------------------------------------------- -\subsection{Acknowledgments} -\label{sect:acks} - -\begin{itemize} - \item -The first 6 to 6.5 versions of this manual and early UGE job script samples, -Singularity testing and user support were produced/done by Dr.~Scott Bunnell -during his time at Concordia as a part of the NAG/HPC group. We thank -him for his contributions. - \item -The HTML version with devcontainer support was contributed by Anh H Nguyen. - \item -Dr.~Tariq Daradkeh, was our IT Instructional Specialist from August 2022 to September 2023; -working on the scheduler, scheduling research, end user support, and integration of -examples, such as YOLOv3 in \xs{sect:openiss-yolov3} and other tasks. We have a continued -collaboration on HPC/scheduling research (see~\cite{job-failure-prediction-compsysarch2024}). -\end{itemize} - % A.2 Migration from UGE to SLURM -% ------------------------------------------------------------- -\subsection{Migration from UGE to SLURM} -\label{appdx:uge-to-slurm} - -For long term users who started off with Grid Engine here are some resources -to make a transition and mapping to the job submission process. - -\begin{itemize} -\item -Queues are called ``partitions'' in SLURM. Our mapping from the GE queues -to SLURM partitions is as follows: -\begin{verbatim} -GE => SLURM -s.q ps -g.q pg -a.q pa -\end{verbatim} -We also have a new partition \texttt{pt} that covers SPEED2 nodes, -which previously did not exist. 
- -\item -Commands and command options mappings are found in \xf{fig:rosetta-mappings} from\\ -\url{https://slurm.schedmd.com/rosetta.pdf}\\ -\url{https://slurm.schedmd.com/pdfs/summary.pdf}\\ -Other related helpful resources from similar organizations who either used -SLURM for a while or also transitioned to it:\\ -\url{https://docs.alliancecan.ca/wiki/Running_jobs}\\ -\url{https://www.depts.ttu.edu/hpcc/userguides/general_guides/Conversion_Table_1.pdf}\\ -\url{https://docs.mpcdf.mpg.de/doc/computing/clusters/aux/migration-from-sge-to-slurm} - -\begin{figure}[htpb] - \includegraphics[width=\columnwidth]{images/rosetta-mapping} - \caption{Rosetta Mappings of Scheduler Commands from SchedMD} - \label{fig:rosetta-mappings} -\end{figure} - -\item -\noindent -\textbf{NOTE:} If you have used UGE commands in the past you probably still have these -lines there; \textbf{they should now be removed}, as they have no use in SLURM and -will start giving ``command not found'' errors on login when the software is removed: - -csh/\tool{tcsh}: sample \file{.tcshrc} file: -\begin{verbatim} -# Speed environment set up -if ($HOSTNAME == speed-submit.encs.concordia.ca) then - source /local/pkg/uge-8.6.3/root/default/common/settings.csh -endif -\end{verbatim} - -Bourne shell/\tool{bash}: sample \file{.bashrc} file: -\begin{verbatim} -# Speed environment set up -if [ $HOSTNAME = "speed-submit.encs.concordia.ca" ]; then - . /local/pkg/uge-8.6.3/root/default/common/settings.sh - printenv ORGANIZATION | grep -qw ENCS || . /encs/Share/bash/profile -fi -\end{verbatim} - -\textbf{IMPORTANT NOTE:} you will need to either log out and back in, or execute a new shell, -for the environment changes in the updated \file{.tcshrc} or \file{.bashrc} file to be applied. - -\end{itemize} - % A.3 Phases -% ------------------------------------------------------------- -\subsection{Phases} -\label{sect:phases} - -Brief summary of Speed evolution phases. - -% ------------------------------------------------------------------------------ -\subsubsection{Phase 5} - -Phase 5 saw incorporation of the Salus, Magic, and Nebular -subclusters (see \xf{fig:speed-architecture-full}). - -% ------------------------------------------------------------------------------ -\subsubsection{Phase 4} - -Phase 4 had 7 SuperMicro servers with 4x A100 80GB GPUs each added, -dubbed as ``SPEED2''. We also moved from Grid Engine to SLURM. - -% ------------------------------------------------------------------------------ -\subsubsection{Phase 3} - -Phase 3 had 4 vidpro nodes added from Dr.~Amer totalling 6x P6 and 6x V100 -GPUs added. - -% ------------------------------------------------------------------------------ -\subsubsection{Phase 2} - -Phase 2 saw 6x NVIDIA Tesla P6 added and 8x more compute nodes. -The P6s replaced 4x of FirePro S7150. -% ------------------------------------------------------------------------------ -\subsubsection{Phase 1} -Phase 1 of Speed was of the following configuration: +% B Frequently Asked Questions +% ------------------------------------------------------------- +\section{Frequently Asked Questions} +\label{sect:faqs} +\input{04-appendix/faq.tex} +% TO DELETE +%\input{scheduler-faq} -\begin{itemize} -\item -Sixteen, 32-core nodes, each with 512~GB of memory and approximately 1~TB of volatile-scratch disk space. -\item -Five AMD FirePro S7150 GPUs, with 8~GB of memory (compatible with the Direct X, OpenGL, OpenCL, and Vulkan APIs). -\end{itemize} +% Includes: +% B.1 Where do I learn about Linux? 
+% B.2 How to bash shell on Speed? +% B.3 How to resolve “Disk quota exceeded” errors? +% B.4 How do I check my job's status? +% B.5 Why is my job pending when nodes are empty? -% ------------------------------------------------------------------------------ -% B Frequently Asked Questions -% ------------------------------------------------------------------------------ -% TMP scheduler-specific section -\input{scheduler-faq} -% ------------------------------------------------------------------------------ -% C Sister Facilities -% ------------------------------------------------------------------------------ +% C Sister Facilities +% ------------------------------------------------------------- \section{Sister Facilities} \label{sect:sister-facilities} +\input{04-appendix/sister-facilities.tex} -Below is a list of resources and facilities similar to Speed at various capacities. -Depending on your research group and needs, they might be available to you. They -are not managed by HPC/NAG of AITS, so contact their respective representatives. -\begin{itemize} -\item -\texttt{computation.encs} is a CPU-only 3-machine cluster running longer jobs -without a scheduler at the moment. Shares the same EL7 software tree as Speed's EL7 nodes -as well as lab desktops. -See \url{https://www.concordia.ca/ginacody/aits/public-servers.html}. -\item -\texttt{apini.encs} cluster for teaching and MPI programming (see the corresponding -course in CSSE), managed by CSSE -\item -Computer Science and Software Engineering (CSSE) Virya GPU Cluster. For CSSE -members only. The cluster has 4 nodes with total of 32 NVIDIA GPUs (a mix of -V100s and A100s). To request access send email to \texttt{virya.help AT concordia.ca}. -This includes an Atlas Analytics partition of Dr.~Mahdi Husseini. -\item -Dr.~Eugene Belilovsky hightower Exxact, and megatower graphcore clusters. -\item -Dr.~Maria Amer's VidPro group's nodes in Speed (-01, -03, -25, -27) with additional V100 and P6 GPUs. -\item -There are various Lambda Labs other GPU servers and like computers -acquired by individual researchers; if you are member of their -research group, contact them directly. These resources are not -managed by us. -\begin{itemize} -\item -Dr.~Amin Hammad's \texttt{construction.encs} Lambda Labs station -\item -Dr.~Hassan Rivaz's \texttt{impactlab.encs} Lambda Labs station -\item -Dr.~Nizar Bouguila's \texttt{xailab.encs} Lambda Labs station -\item -Dr.~Roch Glitho's \texttt{femto.encs} server -\item -Dr.~Maria Amer's \texttt{venom.encs} Lambda Labs station -\item -Dr.~Leon Wang's \texttt{guerrera.encs} DGX station -\end{itemize} -\item -Dr.~Ivan Contreras' 4 Operations Research group servers (managed by AITS). -\item -If you are a member of School of Health (formerly PERFORM Center), -you may have access to their local -\href - {https://perform-wiki.concordia.ca/mediawiki/index.php/HPC_Cluster} - {PERFORM's High Performance Computing (HPC) Cluster}. -Contact Thomas Beaudry for details and how to obtain access. -\item -All Concordia students have access to the Library's small -\href - {https://library.concordia.ca/technology/sandbox/} - {Technology Sandbox} -testing cluster that also runs Slurm. Email \texttt{sean.cooney AT concordia.ca} for details. -\item -Digital Research Alliance Canada (Compute Canada / Calcul Quebec),\\ -\url{https://alliancecan.ca/}. 
Follow -\href - {https://alliancecan.ca/en/services/advanced-research-computing/account-management/apply-account} - {this link} -on the information how to obtain access (students need to be sponsored -by their supervising faculty members, who should create accounts -first). Their SLURM examples are here: \url{https://docs.alliancecan.ca/wiki/Running_jobs} +% D Software List +% ------------------------------------------------------------- +\section{Software Installed On Speed} +\label{sect:software-list} +\input{04-appendix/software-list.tex} -\end{itemize} +% Includes: +% D.1 EL7 +% D.2 EL9 -% ------------------------------------------------------------------------------ -% Software List -% ------------------------------------------------------------------------------ -\input{software-list} % ------------------------------------------------------------------------------ -% Refs: -% +% References +% ------------------------------------------------------------------------------ \nocite{aosa-book-vol1} \label{sect:bib} %\bibliographystyle{IEEEtran} @@ -807,5 +635,6 @@ \section{Sister Facilities} \addcontentsline{toc}{section}{Annotated Bibliography} \bibliography{speed-manual} + %------------------------------------------------------------------------------ \end{document} From 2dd8335bfd0c6eee6237f7628329ce06e71fb041 Mon Sep 17 00:00:00 2001 From: Farah Salhany Date: Thu, 19 Dec 2024 16:47:43 -0500 Subject: [PATCH 5/9] job management section updated --- doc/02-job-management/2.1-getting-started.tex | 73 +++ doc/02-job-management/2.10-ssh-keys.tex | 36 ++ .../2.11-creating-virtual-envs.tex | 134 +++++ doc/02-job-management/2.12-ex-fluent.tex | 22 + doc/02-job-management/2.13-ex-efficiendet.tex | 42 ++ doc/02-job-management/2.14-java-jobs.tex | 19 + .../2.15-scheduling-on-gpus.tex | 236 +++++++++ .../2.16-singularity-containers.tex | 87 ++++ .../2.2-job-submission-basics.tex | 201 ++++++++ .../2.3-sample-job-script.tex | 96 ++++ .../2.4-common-job-commands.tex | 111 +++++ .../2.5-advanced-sbatch-options.tex | 57 +++ doc/02-job-management/2.6-array-jobs.tex | 57 +++ .../2.7-requesting-multiple-cores.tex | 63 +++ .../2.8-interactive-jobs.tex | 368 ++++++++++++++ .../2.9-scheduler-env-variables.tex | 56 +++ doc/02-job-management/job-management-main.tex | 131 +++++ doc/speed-manual.tex | 458 ++---------------- 18 files changed, 1819 insertions(+), 428 deletions(-) create mode 100644 doc/02-job-management/2.1-getting-started.tex create mode 100644 doc/02-job-management/2.10-ssh-keys.tex create mode 100644 doc/02-job-management/2.11-creating-virtual-envs.tex create mode 100644 doc/02-job-management/2.12-ex-fluent.tex create mode 100644 doc/02-job-management/2.13-ex-efficiendet.tex create mode 100644 doc/02-job-management/2.14-java-jobs.tex create mode 100644 doc/02-job-management/2.15-scheduling-on-gpus.tex create mode 100644 doc/02-job-management/2.16-singularity-containers.tex create mode 100644 doc/02-job-management/2.2-job-submission-basics.tex create mode 100644 doc/02-job-management/2.3-sample-job-script.tex create mode 100644 doc/02-job-management/2.4-common-job-commands.tex create mode 100644 doc/02-job-management/2.5-advanced-sbatch-options.tex create mode 100644 doc/02-job-management/2.6-array-jobs.tex create mode 100644 doc/02-job-management/2.7-requesting-multiple-cores.tex create mode 100644 doc/02-job-management/2.8-interactive-jobs.tex create mode 100644 doc/02-job-management/2.9-scheduler-env-variables.tex create mode 100644 doc/02-job-management/job-management-main.tex diff --git 
a/doc/02-job-management/2.1-getting-started.tex b/doc/02-job-management/2.1-getting-started.tex new file mode 100644 index 0000000..b5389b0 --- /dev/null +++ b/doc/02-job-management/2.1-getting-started.tex @@ -0,0 +1,73 @@ +% 2.1 Getting Started +% ------------------------------------------------------------- +\subsection{Getting Started} +\label{sect:getting-started} + +Before getting started, please review the ``What Speed is'' (\xs{sect:speed-is-for}) +and ``What Speed is Not'' (\xs{sect:speed-is-not}). +Once your GCS ENCS account has been granted access to ``Speed'', +use your GCS ENCS account credentials to create an SSH connection to +\texttt{speed} (an alias for \texttt{speed-submit.encs.concordia.ca}). + +All users are expected to have a basic understanding of +Linux and its commonly used commands (see \xa{sect:faqs} for resources). + +% 2.1.1 SSH Connection +% ----------------------- +\subsubsection{SSH Connections} +\label{sect:ssh-connection} + +Requirements to create SSH connection to ``Speed'': +\begin{enumerate} + \item \textbf{Active GCS ENCS user account:} Ensure you have an active GCS ENCS user account with + permission to connect to Speed (see \xs{sect:access-requests}). + \item \textbf{VPN Connection} (for off-campus access): If you are off-campus, you wil need to establish an active connection to Concordia's VPN, + which requires a Concordia netname. + \item \textbf{Terminal Emulator for Windows:} Windows systems use a terminal emulator such as PuTTY, Cygwin, or MobaXterm. + \item \textbf{Terminal for macOS:} macOS systems have a built-in Terminal app or \tool{xterm} that comes with XQuartz. +\end{enumerate} + +\noindent To create an SSH connection to Speed, open a terminal window and type the following command, replacing \verb!! with your ENCS account's username: +\begin{verbatim} + ssh @speed.encs.concordia.ca +\end{verbatim} + +\noindent For detailed instructions on securely connecting to a GCS server, refer to the AITS FAQ: +\href{https://www.concordia.ca/ginacody/aits/support/faq/ssh-to-gcs.html}{How do I securely connect to a GCS server?} + +% 2.1.2 Environment Set Up +% -------------------------- +\subsubsection{Environment Set Up} +\label{sect:envsetup} +%TO BE DELETED +%\input{scheduler-env} +% + +After creating an SSH connection to Speed, you will need to make sure the \tool{srun}, \tool{sbatch}, and \tool{salloc} +commands are available to you. To check this, type each command at the prompt and press Enter. +If ``command not found'' is returned, you need to make sure your \api{\$PATH} includes \texttt{/local/bin}. +You can check your path by typing: +\begin{verbatim} + echo $PATH +\end{verbatim} + +\noindent The next step is to set up your cluster-specific storage ``speed-scratch'', to do so, execute the following command from within your +home directory. +\begin{verbatim} + mkdir -p /speed-scratch/$USER && cd /speed-scratch/$USER +\end{verbatim} + +\noindent Next, copy a job template to your cluster-specific storage +\begin{itemize} + \item From Windows drive G: to Speed:\\ + \verb|cp /winhome/<1st letter of $USER>/$USER/ -
+
@@ -23,158 +23,160 @@

Speed: The GCS ENCS Cluster


Gina Cody School of Engineering and Computer Science
Concordia University
Montreal, Quebec, Canada -
rt-ex-hpc~AT~encs.concordia.ca

-
Version 7.2
-

The group acknowledges the initial manual version VI produced by Dr. Scott Bunnell while with -us as well as Dr. Tariq Daradkeh for his instructional support of the users and contribution of -examples.
-
+
rt-ex-hpc~AT~encs.concordia.ca
+
+
Version 7.3
+

The group acknowledges the initial manual version VI produced by Dr. Scott Bunnell while with us +as well as Dr. Tariq Daradkeh for his instructional support of the users and contribution of +examples.

Abstract

-

This document serves as a quick start guide to using the Gina Cody School of Engineering +

This document serves as a quick start guide to using the Gina Cody School of Engineering and Computer Science (GCS ENCS) compute server farm, known as “Speed.” Managed by the HPC/NAG group of the Academic Information Technology Services (AITS) at GCS, Concordia University, Montreal, Canada.

-

Contents

-
- 1 Introduction -
 1.1 Citing Us -
 1.2 Resources -
 1.3 Team -
 1.4 What Speed Consists of -
 1.5 What Speed Is Ideal For -
 1.6 What Speed Is Not -
 1.7 Available Software -
 1.8 Requesting Access -
2 Job Management -
 2.1 Getting Started -
  2.1.1 SSH Connections -
  2.1.2 Environment Set Up -
 2.2 Job Submission Basics -
  2.2.1 Directives -
  2.2.2 Working with Modules -
  2.2.3 User Scripting -
 2.3 Sample Job Script -
 2.4 Common Job Management Commands Summary - - - -
 2.5 Advanced sbatch Options -
 2.6 Array Jobs -
 2.7 Requesting Multiple Cores (i.e., Multithreading Jobs) -
 2.8 Interactive Jobs -
  2.8.1 Command Line -
  2.8.2 Graphical Applications -
  2.8.3 Jupyter Notebooks -
  2.8.3.1 Jupyter Notebook in Singularity -
  2.8.3.2 JupyterLab in Conda and Pytorch -
  2.8.3.3 JupyterLab + Pytorch in Python venv -
  2.8.4 Visual Studio Code -
 2.9 Scheduler Environment Variables -
 2.10 SSH Keys for MPI -
 2.11 Creating Virtual Environments -
  2.11.1 Anaconda -
  2.11.1.1 Conda Env without --prefix -
  2.11.2 Python -
 2.12 Example Job Script: Fluent -
 2.13 Example Job: EfficientDet -
 2.14 Java Jobs -
 2.15 Scheduling on the GPU Nodes -
  2.15.1 P6 on Multi-GPU, Multi-Node -
  2.15.2 CUDA -
  2.15.3 Special Notes for Sending CUDA Jobs to the GPU Queues -
  2.15.4 OpenISS Examples -
  2.15.4.1 OpenISS and REID -
  2.15.4.2 OpenISS and YOLOv3 -
 2.16 Singularity Containers -
3 Conclusion -
 3.1 Important Limitations -
 3.2 Tips/Tricks -
 3.3 Use Cases -
A History -
 A.1 Acknowledgments -
 A.2 Migration from UGE to SLURM -
 A.3 Phases -
  A.3.1 Phase 5 -
  A.3.2 Phase 4 -
  A.3.3 Phase 3 -
  A.3.4 Phase 2 -
  A.3.5 Phase 1 -
B Frequently Asked Questions -
 B.1 Where do I learn about Linux? -
 B.2 How to use bash shell on Speed? -
  B.2.1 How do I set bash as my login shell? -
  B.2.2 How do I move into a bash shell on Speed? -
  B.2.3 How do I use the bash shell in an interactive session on Speed? -
  B.2.4 How do I run scripts written in bash on Speed? - - - -
 B.3 How to resolve “Disk quota exceeded” errors? -
  B.3.1 Probable Cause -
  B.3.2 Possible Solutions -
  B.3.3 Example of setting working directories for COMSOL -
  B.3.4 Example of setting working directories for Python Modules -
 B.4 How do I check my job’s status? -
 B.5 Why is my job pending when nodes are empty? -
  B.5.1 Disabled nodes -
  B.5.2 Error in job submit request. -
C Sister Facilities -
D Software Installed On Speed -
 D.1 EL7 -
 D.2 EL9 -
Annotated Bibliography -
- - - -

1 Introduction

-

This document contains basic information required to use “Speed”, along with tips, tricks, examples, +

Contents

+
+1 Introduction +
 1.1 Citing Us +
 1.2 Resources +
 1.3 Team +
 1.4 What Speed Consists of +
 1.5 What Speed Is Ideal For +
 1.6 What Speed Is Not +
 1.7 Available Software +
 1.8 Requesting Access +
2 Job Management +
 2.1 Getting Started + + + +
  2.1.1 SSH Connections +
  2.1.2 Environment Set Up +
 2.2 Job Submission Basics +
  2.2.1 Directives +
  2.2.2 Working with Modules +
  2.2.3 User Scripting +
 2.3 Sample Job Script +
 2.4 Common Job Management Commands +
 2.5 Advanced sbatch Options +
 2.6 Array Jobs +
 2.7 Requesting Multiple Cores (i.e., Multithreading Jobs) +
 2.8 Interactive Jobs +
  2.8.1 Command Line +
  2.8.2 Graphical Applications +
  2.8.3 Jupyter Notebooks +
  2.8.3.1 Jupyter Notebook in Singularity +
  2.8.3.2 Jupyter Notebook in Conda +
  2.8.3.3 Jupyter Notebook in Python venv +
  2.8.4 Visual Studio Code +
 2.9 Scheduler Environment Variables +
 2.10 SSH Keys for MPI +
 2.11 Creating Virtual Environments +
  2.11.1 Anaconda +
  2.11.1.1 Conda Env without --prefix +
  2.11.2 Python +
 2.12 Example Job Script: Fluent +
 2.13 Example Job: EfficientDet +
 2.14 Java Jobs +
 2.15 Scheduling on the GPU Nodes +
  2.15.1 P6 on Multi-GPU, Multi-Node +
  2.15.2 CUDA +
  2.15.3 Special Notes for Sending CUDA Jobs to the GPU Queues + + + +
  2.15.4 OpenISS Examples +
  2.15.4.1 OpenISS and REID +
  2.15.4.2 OpenISS and YOLOv3 +
 2.16 Singularity Containers +
3 Conclusion +
 3.1 Important Limitations +
 3.2 Tips/Tricks +
 3.3 Use Cases +
A History +
 A.1 Acknowledgments +
 A.2 Migration from UGE to SLURM +
 A.3 Phases +
  A.3.1 Phase 5 +
  A.3.2 Phase 4 +
  A.3.3 Phase 3 +
  A.3.4 Phase 2 +
  A.3.5 Phase 1 +
B Frequently Asked Questions +
 B.1 Where do I learn about Linux? +
 B.2 How to use bash shell on Speed? +
  B.2.1 How do I set bash as my login shell? +
  B.2.2 How do I move into a bash shell on Speed? +
  B.2.3 How do I use the bash shell in an interactive session on Speed? +
  B.2.4 How do I run scripts written in bash on Speed? +
 B.3 How to resolve “Disk quota exceeded” errors? +
  B.3.1 Probable Cause +
  B.3.2 Possible Solutions +
  B.3.3 Example of setting working directories for COMSOL +
  B.3.4 Example of setting working directories for Python Modules +
 B.4 How do I check my job’s status? +
 B.5 Why is my job pending when nodes are empty? +
  B.5.1 Disabled nodes + + + +
  B.5.2 Error in job submit request. +
C Sister Facilities +
D Software Installed On Speed +
 D.1 EL7 +
 D.2 EL9 +
Annotated Bibliography +
+ + + +

1 Introduction

+

This document contains basic information required to use “Speed”, along with tips, tricks, examples, and references to projects and papers that have used Speed. User contributions of sample jobs and/or -references are welcome.
-

Note: On October 20, 2023, we completed the migration to SLURM from Grid Engine (UGE/AGE) +references are welcome. +

Note: On October 20, 2023, we completed the migration to SLURM from Grid Engine (UGE/AGE) as our job scheduler. This manual has been updated to use SLURM’s syntax and commands. If you are a long-time GE user, refer to Appendix A.2 for key highlights needed to translate your GE jobs to SLURM as well as environment changes. These changes are also elaborated throughout this document and our examples. -

+

-

1.1 Citing Us

-

If you wish to cite this work in your acknowledgements, you can use our general DOI found on our +

1.1 Citing Us

+

If you wish to cite this work in your acknowledgements, you can use our general DOI found on our GitHub page https://dx.doi.org/10.5281/zenodo.5683642 or a specific version of the manual and scripts from that link individually. You can also use the “cite this repository” feature of GitHub. -

+

-

1.2 Resources

+

1.2 Resources

  • Concordia official page for “Speed” cluster, which includes access request instructions. https://www.concordia.ca/ginacody/aits/speed.html
  • -
  • All Speed users are subscribed to the hpc-ml mailing list. -
  • -

    +

  • All Speed users are subscribed to the hpc-ml mailing list.
  • +

    -

    1.3 Team

    -

    Speed is supported by:

    +

    1.3 Team

    +

    Speed is supported by:

    -

    We receive support from the rest of AITS teams, such as NAG, SAG, FIS, and DOG.
    https://www.concordia.ca/ginacody/aits.html -

    +

    We receive support from the rest of AITS teams, such as NAG, SAG, FIS, and DOG. +

    https://www.concordia.ca/ginacody/aits.html +

    -

    1.4 What Speed Consists of

    -
      -
    • Twenty four (24) 32-core compute nodes, each with 512 GB of memory and approximately - 1 TB of local volatile-scratch disk space (pictured in Figure 1). +

      1.4 What Speed Consists of

      +
        +
      • Twenty four (24) 32-core compute nodes, each with 512 GB of memory and approximately + 1 TB of local volatile-scratch disk space (pictured in Figure 1).
      • Twelve (12) NVIDIA Tesla P6 GPUs, with 16 GB of GPU memory (compatible with the CUDA, OpenGL, OpenCL, and Vulkan APIs). @@ -214,7 +217,7 @@

        1
      • Nebular subcluster partition (CIISE, Drs. Yan, Assi, Ghafouri, et al., Nebulae GPU node with 2x RTX 6000 Ada 48GB cards, Stellar compute node, and Matrix 177TB storage/compute node, see Figure 2).
      -
      +
      @@ -223,14 +226,14 @@

      1 -

      PIC +

      PIC

      Figure 1: Speed
      -

      -
      +
      +
      @@ -239,14 +242,14 @@

      1 -

      PIC +

      PIC

      Figure 2: Speed Cluster Hardware Architecture
      -

      -
      +
      +
      @@ -255,21 +258,21 @@

      1 -

      PIC +

      PIC

      Figure 3: Speed SLURM Architecture
      -

      -

      1.5 What Speed Is Ideal For

      +
      +

      1.5 What Speed Is Ideal For

      • Design, develop, test, and run parallel, batch, and other algorithms and scripts with partial data sets. “Speed” has been optimized for compute jobs that are multi-core aware, require a large memory space, or are iteration intensive.
      • -

        Prepare jobs for large clusters such as:

        +

        Prepare jobs for large clusters such as:

        • Digital Research Alliance of Canada (Calcul Quebec and Compute Canada)
        • @@ -286,16 +289,16 @@

          approximately 10 TB.
        • CPU-based jobs. -
        • -
        • CUDA GPU jobs. +
        • +
        • CUDA GPU jobs.
        • Non-CUDA GPU jobs using OpenCL.
        -

        +

        -

        1.6 What Speed Is Not

        +

        1.6 What Speed Is Not

        • Speed is not a web host and does not host websites.
        • @@ -309,35 +312,35 @@

          1.6
        • Speed is not for jobs executed outside of the scheduler. (Jobs running outside of the scheduler will be killed and all data lost.)
        -

        +

        -

        1.7 Available Software

        -

        There are a wide range of open-source and commercial software available and installed on “Speed.” +

        1.7 Available Software

        +

        There are a wide range of open-source and commercial software available and installed on “Speed.” This includes Abaqus [1], AllenNLP, Anaconda, ANSYS, Bazel, COMSOL, CPLEX, CUDA, Eclipse, -Fluent [2], Gurobi, MATLAB [1530], OMNeT++, OpenCV, OpenFOAM, OpenMPI, OpenPMIx, -ParaView, PyTorch, QEMU, R, Rust, and Singularity among others. Programming environments -include various versions of Python, C++/Java compilers, TensorFlow, OpenGL, OpenISS, and -MARF [31].
        +Fluent [2], Gurobi, MATLAB [1529], OMNeT++, OpenCV, OpenFOAM, OpenMPI, OpenPMIx, -

        In particular, there are over 2200 programs available in /encs/bin and /encs/pkg under Scientific +ParaView, PyTorch, QEMU, R, Rust, and Singularity among others. Programming environments +include various versions of Python, C++/Java compilers, TensorFlow, OpenGL, OpenISS, and +MARF [30]. +

        In particular, there are over 2200 programs available in /encs/bin and /encs/pkg under Scientific Linux 7 (EL7). We are building an equivalent array of programs for the EL9 SPEED2 nodes. To see the packages available, run ls -al /encs/pkg/ on speed.encs. See a complete list in -Appendix D.
        -

        Note: We do our best to accommodate custom software requests. Python environments can use -user-custom installs from within the scratch directory. -

        +Appendix D. +

        Note: We do our best to accommodate custom software requests. Python environments can use +user-custom installs from within scratch directory. +
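For example (a sketch only; the environment path is a placeholder and bash syntax is shown, tcsh users would source activate.csh), a user-level Python environment can be created under your speed-scratch space after loading a suitable Python module:

     python3 -m venv /speed-scratch/$USER/myenv
     source /speed-scratch/$USER/myenv/bin/activate
     pip install <package>

See Section 2.11 for the full virtual-environment instructions.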

        -

        1.8 Requesting Access

        -

        After reviewing the “What Speed is” (Section 1.5) and “What Speed is Not” (Section 1.6), request +

        1.8 Requesting Access

        +

        After reviewing the “What Speed is” (Section 1.5) and “What Speed is Not” (Section 1.6), request access to the “Speed” cluster by emailing: rt-ex-hpc AT encs.concordia.ca.

        • GCS ENCS faculty and staff may request access directly.
        • -

          GCS students must include the following in their request message:

          +

          GCS students must include the following in their request message:

          • GCS ENCS username
          • @@ -359,61 +362,64 @@

            1.8
          • External entities collaborating with GCS Concordia researchers should also go through the Dean’s Office for approvals.
          -

          +

          For detailed instructions, refer to the Concordia Computing (HPC) Facility: Speed webpage. +

          -

          2 Job Management

          -

          We use SLURM as the workload manager. It supports primarily two types of jobs: batch and +

          2 Job Management

          +

          We use SLURM as the workload manager. It supports primarily two types of jobs: batch and interactive. Batch jobs are used to run unattended tasks, whereas, interactive jobs are are ideal for -setting up virtual environments, compilation, and debugging.
          -

          Note: In the following instructions, anything bracketed like, <>, indicates a label/value to be replaced -(the entire bracketed term needs replacement).
          -

          Job instructions in a script start with #SBATCH prefix, for example: +setting up virtual environments, compilation, and debugging. +

          Note: In the following instructions, anything bracketed like, <>, indicates a label/value to be replaced +(the entire bracketed term needs replacement). +

          Job instructions in a script start with #SBATCH prefix, for example:

          -
          +
               #SBATCH --mem=100M -t 600 -J <job-name> -A <slurm account>
          -    #SBATCH -p pg --gpus=2 --mail-type=ALL
          -
          -

          For complex compute steps within a script, use srun. We recommend using salloc for interactive +    #SBATCH -p pg --gpus=1 --mail-type=ALL

          +

          +

          For complex compute steps within a script, use srun. We recommend using salloc for interactive jobs as it supports multiple steps. However, srun can also be used to start interactive jobs (see Section 2.8). Common and required job parameters include: -

          -
          +

          Common and required job parameters include:

            -
          • memory (--mem), +
          • Memory (--mem=<mem>[M|G|T]),
          • -
          • time (-t), +
          • Partition/Queue (-p <partition>),
          • -
          • --job-name (-J), +
          • Job name (--job-name=<name> or -J <name>),
          • -
          • slurm project account (-A), +
          • Wall Clock Limit (-t <min> or -t <days-hh:mm:ss>),
          • -
          • partition (-p), +
          • Event Notification (--mail-type=<events>),
          • -
          • mail type (--mail-type), +
          • Email Address (--mail-user=<address>),
          • -
          • ntasks (-n), +
          • Slurm Account (--account=<account> or -A <account>),
          • -
          • CPUs per task (--cpus-per-task).
          -
          +
        • Tasks Per Node (--tasks-per-node=<count>), -

          +

        • +
        • CPUs Per Task (--cpus-per-task=<count>), +
        • +
        • CPU Count ntasks (-n <count>).
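As an illustration only (the values and the program name below are placeholders, not site defaults), these parameters might appear together at the top of a batch script:

     #!/encs/bin/bash
     #SBATCH --job-name=demo           ## -J, name shown in the queue
     #SBATCH --partition=ps            ## -p, partition/queue
     #SBATCH --time=01:00:00           ## -t, wall-clock limit
     #SBATCH --mem=4G                  ## memory request (always required)
     #SBATCH --ntasks=1                ## -n, number of tasks
     #SBATCH --cpus-per-task=4         ## -c, cores per task
     #SBATCH --mail-type=ALL           ## event notifications
     #SBATCH --mail-user=<address>     ## where notifications are sent
     srun ./my_program                 ## placeholder compute step

An interactive allocation with similar options might be requested with, e.g., salloc -p ps --mem=4G -t 60.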
        +

        -

        2.1 Getting Started

        -

        Before getting started, please review the “What Speed is” (Section 1.5) and “What Speed is Not” +

        2.1 Getting Started

        +

        Before getting started, please review the “What Speed is” (Section 1.5) and “What Speed is Not” (Section 1.6). Once your GCS ENCS account has been granted access to “Speed”, use your GCS ENCS account credentials to create an SSH connection to speed (an alias for -speed-submit.encs.concordia.ca).
        -

        All users are expected to have a basic understanding of Linux and its commonly used commands -(see Appendix B for resources). -

        +speed-submit.encs.concordia.ca). +

        All users are expected to have a basic understanding of Linux and its commonly used commands (see +Appendix B for resources). +

        -
        2.1.1 SSH Connections
        -

        Requirements to create connections to “Speed”: +

        2.1.1 SSH Connections
        +

Requirements to create an SSH connection to “Speed”:

        1. Active GCS ENCS user account: Ensure you have an active GCS ENCS user account with permission to connect to Speed (see Section 1.8). @@ -423,100 +429,98 @@
          2.1.1
        2. Terminal Emulator for Windows: Windows systems use a terminal emulator such as PuTTY, Cygwin, or MobaXterm. + + +
        3. Terminal for macOS: macOS systems have a built-in Terminal app or xterm that comes with XQuartz.
        -

        To create an SSH connection to Speed, open a terminal window and type the following command, +

        To create an SSH connection to Speed, open a terminal window and type the following command, replacing <ENCSusername> with your ENCS account’s username:

        -
        -    ssh <ENCSusername>@speed.encs.concordia.ca
        -
        -

        -

        For detailed instructions on securely connecting to a GCS server, refer to the AITS FAQ: How do I +

        +    ssh <ENCSusername>@speed.encs.concordia.ca
        +

        +

        For detailed instructions on securely connecting to a GCS server, refer to the AITS FAQ: How do I securely connect to a GCS server? -

        +

        -
        2.1.2 Environment Set Up
        -

        After creating an SSH connection to Speed, you will need to make sure the srun, sbatch, and salloc +

        2.1.2 Environment Set Up
        +

        After creating an SSH connection to Speed, you will need to make sure the srun, sbatch, and salloc commands are available to you. To check this, type each command at the prompt and press Enter. If “command not found” is returned, you need to make sure your $PATH includes /local/bin. You can -check your $PATH by typing: +check your path by typing:

        -
        -    echo $PATH
        -
        -

        -

        The next step is to set up your cluster-specific storage “speed-scratch”, to do so, execute the following +

        +    echo $PATH
        +
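If /local/bin is missing, one way to prepend it for the current session (a sketch for bash users; tcsh users would use the equivalent setenv form) is:

     export PATH=/local/bin:$PATH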

        +

        The next step is to set up your cluster-specific storage “speed-scratch”, to do so, execute the following command from within your home directory.

        -
        -    mkdir -p /speed-scratch/$USER && cd /speed-scratch/$USER
        -
        -

        -

        Next, copy a job template to your cluster-specific storage

        +
        +    mkdir -p /speed-scratch/$USER && cd /speed-scratch/$USER
        +

        +

        Next, copy a job template to your cluster-specific storage

          -
        • From Windows drive G: to Speed:
          cp /winhome/<1st letter of $USER>/$USER/example.sh /speed-scratch/$USER/ +
        • From Windows drive G: to Speed:
          cp /winhome/<1st letter of $USER>/$USER/<script>.sh /speed-scratch/$USER/
        • -
        • From Linux drive U: to Speed:
          cp ~/example.sh /speed-scratch/$USER/
        -

        Tip: the default shell for GCS ENCS users is tcsh. If you would like to use bash, please contact -rt-ex-hpc AT encs.concordia.ca.
        -

        Note: If you encounter a “command not found” error after logging in to Speed, your user account +

      • From Linux drive U: to Speed:
        cp ~/<script>.sh /speed-scratch/$USER/
      +

      Tip: the default shell for GCS ENCS users is tcsh. If you would like to use bash, please contact +rt-ex-hpc AT encs.concordia.ca. +

      Note: If you encounter a “command not found” error after logging in to Speed, your user account may have defunct Grid Engine environment commands. See Appendix A.2 for instructions on how to resolve this issue. -

      +

      -

      2.2 Job Submission Basics

      -

      Preparing your job for submission is fairly straightforward. Start by basing your job script on one of +

      2.2 Job Submission Basics

      +

      Preparing your job for submission is fairly straightforward. Start by basing your job script on one of the examples available in the src/ directory of our GitHub repository. You can clone the repository to get the examples to start with via the command line:

      -
      +
           git clone --depth=1 https://github.com/NAG-DevOps/speed-hpc.git
      -    cd speed-hpc/src
      -
      -

      -

      The job script is a shell script that contains directives, module loads, and user scripting. To quickly +    cd speed-hpc/src

      +

      +

      The job script is a shell script that contains directives, module loads, and user scripting. To quickly run some sample jobs, use the following commands:

      -
      +
           sbatch -p ps -t 10 env.sh
           sbatch -p ps -t 10 bash.sh
           sbatch -p ps -t 10 manual.sh
      -    sbatch -p pg -t 10 lambdal-singularity.sh
      -
      -

      -

      +    sbatch -p pg -t 10 lambdal-singularity.sh

      +

      +

      -
      2.2.1 Directives
      +
      2.2.1 Directives

      Directives are comments included at the beginning of a job script that set the shell and the options for the job scheduler. The shebang directive is always the first line of a script. In your job script, this directive sets which shell your script’s commands will run in. On “Speed”, we recommend that your script use a shell from the /encs/bin directory.
      -

      To use the tcsh shell, start your script with #!/encs/bin/tcsh. For bash, start with +

      To use the tcsh shell, start your script with #!/encs/bin/tcsh. For bash, start with #!/encs/bin/bash.
      -

      Directives that start with #SBATCH set the options for the cluster’s SLURM job scheduler. The +

      Directives that start with #SBATCH set the options for the cluster’s SLURM job scheduler. The following provides an example of some essential directives:

      -
      +
           #SBATCH --job-name=<jobname>        ## or -J. Give the job a name
           #SBATCH --mail-type=<type>          ## set type of email notifications
           #SBATCH --chdir=<directory>         ## or -D, set working directory for the job
      @@ -524,8 +528,7 @@ 
      2.2.1 +                                        ## e.g., 32G memory per node

      Replace the following to adjust the job script for your project(s)

        @@ -548,15 +551,14 @@
        2.2.1 -
        +
             #SBATCH -J myjob              ## Job’s name set to ’myjob’
             #SBATCH --mail-type=ALL       ## Receive all email type notifications
             #SBATCH -D ./                 ## Use current directory as working directory
             #SBATCH -N 1                  ## Node count required for the job
             #SBATCH -n 1                  ## Number of tasks to be launched
             #SBATCH -c 8                  ## Request 8 cores
        -    #SBATCH --mem=32G             ## Allocate 32G memory per node
        -
        +    #SBATCH --mem=32G             ## Allocate 32G memory per node

        Tip: If you are unsure about memory footprints, err on assigning a generous memory space to your job, so that it does not get prematurely terminated. You can refine --mem values @@ -566,20 +568,18 @@

        2.2.1 -
        +
             sacct -j <jobID>
        -    sstat -j <jobID>
        -
        +    sstat -j <jobID>

        This can be customized to show specific columns:

        -
        +
             sacct -o jobid,maxvmsize,ntasks%7,tresusageouttot%25 -j <jobID>
        -    sstat -o jobid,maxvmsize,ntasks%7,tresusageouttot%25 -j <jobID>
        -
        +    sstat -o jobid,maxvmsize,ntasks%7,tresusageouttot%25 -j <jobID>

        Memory-footprint efficiency values (seff) are also provided for completed jobs in the final email notification as “maxvmsize”. Jobs that request a low-memory footprint are more likely to load on a @@ -596,296 +596,405 @@

speed1 and speed2. However, users that belong to a particular research group or project will have a default Account such as aits, vidpro, gipsy, ai2, mpackir, cmos, among others.
      -

      +

      Directives are comments included at the beginning of a job script that set the shell and the options for +the job scheduler. +

      The shebang directive is always the first line of a script. In your job script, this directive sets which +shell your script’s commands will run in. On “Speed”, we recommend that your script use a shell from +the /encs/bin directory. +

      To use tcsh shell, start your script with #!/encs/bin/tcsh, for bash, start with #!/encs/bin/bash +

      Directives that start with #SBATCH set the options for the cluster’s SLURM job scheduler. The +following provides an example of some essential directives: + + +

      -
      2.2.2 Working with Modules
      -

      After setting the directives in your job script, the next section typically involves loading the necessary +

      +    #SBATCH --job-name=<jobname>        ## or -J. Give the job a name
      +    #SBATCH --mail-type=<type>          ## set type of email notifications
      +    #SBATCH --chdir=<directory>         ## or -D, set working directory for the job
      +    #SBATCH --nodes=1                   ## or -N, node count required for the job
      +    #SBATCH --ntasks=1                  ## or -n, number of tasks to be launched
      +    #SBATCH --cpus-per-task=<corecount> ## or -c, core count requested, e.g. 8 cores
      +    #SBATCH --mem=<memory>              ## assign memory for this job,
      +                                        ## e.g., 32G memory per node
      +

      +

      Replace the following to adjust the job script for your project(s)

      +
        +
      • <jobname> with a job name for the job. This name will be displayed in the job queue. +
      • +
      • <directory> with the fullpath to your job’s working directory, e.g., where your code, + source files and where the standard output files will be written to. By default, --chdir + sets the current directory as the job’s working directory. +
      • +
      • <type> with the type of e-mail notifications you wish to receive. Valid options are: NONE, + BEGIN, END, FAIL, REQUEUE, ALL. +
      • +
      • <corecount> with the degree of multithreaded parallelism (i.e., cores) allocated to your + job. Up to 32 by default. +
      • +
      • <memory> with the amount of memory, in GB, that you want to be allocated per node. + Up to 500 depending on the node.
        Note: All jobs MUST set a value for the --mem option.
      +

      Example with short option equivalents: + + + +

      +
      +    #SBATCH -J myjob              ## Job’s name set to ’myjob’
      +    #SBATCH --mail-type=ALL       ## Receive all email type notifications
      +    #SBATCH -D ./                 ## Use current directory as working directory
      +    #SBATCH -N 1                  ## Node count required for the job
      +    #SBATCH -n 1                  ## Number of tasks to be launched
      +    #SBATCH -c 8                  ## Request 8 cores
      +    #SBATCH --mem=32G             ## Allocate 32G memory per node
      +

      +

      Tip: If you are unsure about memory footprints, err on assigning a generous memory space to +your job, so that it does not get prematurely terminated. You can refine --mem values +for future jobs by monitoring the size of a job’s active memory space on speed-submit +with: + + + +

      +
      +    sacct -j <jobID>
      +    sstat -j <jobID>
      +

      +

      This can be customized to show specific columns: + + + +

      +
      +    sacct -o jobid,maxvmsize,ntasks%7,tresusageouttot%25 -j <jobID>
      +    sstat -o jobid,maxvmsize,ntasks%7,tresusageouttot%25 -j <jobID>
      +

      +

      Memory-footprint efficiency values seff are also provided for completed jobs in the final email +notification as “maxvmsize”. +

      Jobs that request a low-memory footprint are more likely to load on a busy cluster. +

      Other essential options are --time, or -t, and --account, or -A.

      +
        +
      • --time=<time> – is the estimate of wall clock time required for your job to run. As + previously mentioned, the maximum is 7 days for batch and 24 hours for interactive jobs. + Jobs with a smaller time value will have a higher priority and may result in your job + being scheduled sooner. +
      • +
      • --account=<name> – specifies which Account, aka project or association, that the Speed + resources your job uses should be attributed to. When moving from GE to SLURM users + most users were assigned to Speed’s two default accounts speed1 and speed2. However, + users that belong to a particular research group or project are will have a default Account + like the following aits, vidpro, gipsy, ai2, mpackir, cmos, among others.
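For instance (the account name below is illustrative; use the Account you were assigned), these can be set in the script or at submission time:

     #SBATCH --time=1-00:00:00     ## one day of wall-clock time
     #SBATCH --account=speed1      ## or -A, attribute usage to this account

or equivalently on the command line: sbatch -t 1-00:00:00 -A speed1 ./myscript.sh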
      +

      +

      +
      2.2.2 Working with Modules
      +

      After setting the directives in your job script, the next section typically involves loading the necessary software modules. The module command is used to manage the user environment, make sure to load all the modules your job depends on. You can check available modules with the module avail command. Loading the correct modules ensures that your environment is properly set up for -execution.
      -

      To list for a particular program (matlab, for example): +execution. +

      To list for a particular program (matlab, for example):

      -
      +
           module avail
           module -t avail matlab  ## show the list for a particular program (e.g., matlab)
      -    module -t avail m       ## show the list for all programs starting with m
      -
      -

      -

      For example, insert the following in your script to load the matlab/R2023a module: +    module -t avail m       ## show the list for all programs starting with ‘m’

      +

      +

      For example, insert the following in your script to load the matlab/R2023a module:

      -
      -    module load matlab/R2023a/default
      -
      -

      -

      Note: you can remove a module from active use by replacing load by unload.
      -

      To list loaded modules: +

      +    module load matlab/R2023a/default
      +

      +

      Note: you can remove a module from active use by replacing load by unload. +

      To list loaded modules:

      -
      -    module list
      -
      -

      -

      To purge all software in your working environment: +

      +    module list
      +

      +

      To purge all software in your working environment:

      -
      -    module purge
      -
      -

      -

      +

      +    module purge
      +

      +

      -
      2.2.3 User Scripting
      -

      The final part of the job script involves the commands that will be executed by the job. This section +

      2.2.3 User Scripting
      +

      The final part of the job script involves the commands that will be executed by the job. This section should include all necessary commands to set up and run the tasks your script is designed to perform. You can use any Linux command in this section, ranging from a simple executable call to a complex -loop iterating through multiple commands.
      -

      Best Practice: prefix any compute-heavy step with srun. This ensures you gain proper insights on -the execution of your job.
      -

      Each software program may have its own execution framework, as it’s the script’s author (e.g., you) +loop iterating through multiple commands. +

      Best Practice: prefix any compute-heavy step with srun. This ensures you gain proper insights on +the execution of your job. +

      Each software program may have its own execution framework, as it’s the script’s author (e.g., you) responsibility to review the software’s documentation to understand its requirements. Your script should be written to clearly specify the location of input and output files and the degree of parallelism -needed.
      -

      Jobs that involve multiple interactions with data input and output files, should make use of TMPDIR, a +needed. +

      Jobs that involve multiple interactions with data input and output files, should make use of TMPDIR, a scheduler-provided workspace nearly 1 TB in size. TMPDIR is created on the local disk of the compute node at the start of a job, offering faster I/O operations compared to shared storage (provided over NFS). -

      An sample job script using TMPDIR is available at /home/n/nul-uge/templateTMPDIR.sh: the job -is instructed to change to $TMPDIR, to make the new directory input, to copy data from -$SLURM_SUBMIT_DIR/references/ to input/ ($SLURM_SUBMIT_DIR represents the current working -directory), to make the new directory results, to execute the program (which takes input from -$TMPDIR/input/ and writes output to $TMPDIR/results/), and finally to copy the total end results -to an existing directory, processed, that is located in the current working directory. TMPDIR only -exists for the duration of the job, though, so it is very important to copy relevant results from it at -job’s end. -
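As an illustrative sketch only (the program name and the references/ and processed/ directories are placeholders), a TMPDIR-based workflow that stages input locally, runs the program, and copies results back could look like:

     #!/encs/bin/bash
     #SBATCH --job-name=tmpdir-demo    ## placeholder job name
     #SBATCH --mem=4G
     cd $TMPDIR
     mkdir input results
     cp $SLURM_SUBMIT_DIR/references/* input/            ## stage input data on the local disk
     srun $SLURM_SUBMIT_DIR/my_program input/ results/   ## placeholder compute step
     mkdir -p $SLURM_SUBMIT_DIR/processed
     cp -r results/* $SLURM_SUBMIT_DIR/processed/        ## save results before TMPDIR disappears

Remember that TMPDIR is removed when the job ends, so copying results back must be the last step.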

      -

      -

      2.3 Sample Job Script

      -

      Here’s a basic job script, tcsh.sh shown in Figure 4. You can copy it from our GitHub +

      +

      +

      2.3 Sample Job Script

      +

      Here’s a basic job script, env.sh shown in Figure 4. You can copy it from our GitHub repository.

      -
      +
      - + -
      #!/encs/bin/tcsh 
      +
      #!/encs/bin/tcsh 
        
      -#SBATCH --job-name=tcsh-test 
      -#SBATCH --mem=1G 
      - 
      -sleep 30 
      -module load gurobi/8.1.0 
      -module list
      +#SBATCH --job-name=envs        ## Give the job a name 
      +#SBATCH --mail-type=ALL        ## Receive all email type notifications 
+#SBATCH --chdir=./             ## Use current directory as working directory 
      +#SBATCH --nodes=1 
      +#SBATCH --ntasks=1 
+#SBATCH --cpus-per-task=1      ## Request 1 CPU 
      +#SBATCH --mem=1G               ## Assign 1G memory per node 
      + 
      +# Reset TMPDIR to a larger storage 
      +mkdir -p /speed-scratch/$USER/tmp 
      +setenv TMPDIR /speed-scratch/$USER/tmp 
      + 
      +date 
      +srun env 
      +date 
      + 
      +# EOF
       
      -
      Figure 4: Source code for tcsh.sh
      +
      Figure 4: Source code for env.sh
      -
      -

      The first line is the shell declaration (also know as a shebang) and sets the shell to tcsh. The lines that -begin with #SBATCH are directives for the scheduler.

      +
      +

The first line is the shell declaration (also known as a shebang) and sets the shell to tcsh. The lines that +begin with #SBATCH are directives for the scheduler. +

        -
      • -J (or --job-name) sets tcsh-test as the job name. +
      • -J (or --job-name) sets envs as the job name. +
      • +
      • --mail-type sets the type of notifications. +
      • +
      • --chdir sets the working directory. +
      • +
      • --nodes specifies the number of required nodes.
      • -
      • --mem=1GB requests and assigns 1GB of memory to the job. Jobs require the --mem option - to be set either in the script or on the command line; if it’s missing, job submission - will be rejected.
      -

      The script then: +

    • --ntasks specifies the number of tasks. +
    • +
    • --cpus-per-task requests 1 CPU. +
    • +
    • +

      --mem= requests memory. +

      Note: Jobs require the --mem option to be set either in the script or on the command + line; job submission will be rejected if it’s missing.

    +

    The script then:

      -
    1. Sleeps on a node for 30 seconds. +
    2. Creates a directory.
    3. -
    4. Uses the module command to load the gurobi/8.1.0 environment. +
    5. Sets TMPDIR to a larger storage. + + + +
    6. +
    7. Prints current date. +
    8. +
    9. Prints env variables.
    10. -
    11. Prints the list of loaded modules into a file.
    -

    The scheduler command, sbatch, is used to submit (non-interactive) jobs. From an ssh session on +

  • Prints current date again.
  • +

    The scheduler command, sbatch, is used to submit (non-interactive) jobs. From an ssh session on “speed-submit”, submit this job with

    -
    -    sbatch ./tcsh.sh
    -
    -

    -

    You will see, Submitted batch job 2653 where \(2653\) is a job ID assigned. The commands squeue and -sinfo can be used to look at the status of the cluster: +

    +    sbatch ./env.sh
    +

    +

    You will see, Submitted batch job <JOB ID>. The commands squeue and sinfo can be used to +look at the queue and the status of the cluster

    -
    +
     [serguei@speed-submit src] % squeue -l
     Thu Oct 19 11:38:54 2023
     JOBID PARTITION     NAME     USER    STATE       TIME  TIME_LIMI  NODES NODELIST(REASON)
      2641        ps interact   b_user  RUNNING   19:16:09 1-00:00:00      1 speed-07
      2652        ps interact   a_user  RUNNING      41:40 1-00:00:00      1 speed-07
      2654        ps     envs  serguei  RUNNING       0:01 7-00:00:00      1 speed-07

     [serguei@speed-submit src] % sinfo
     PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
     ps*          up 7-00:00:00      8  drng@ speed-[09-11,15-16,20-21,36]
     ps*          up 7-00:00:00      3   drng speed-[38,42-43]
     ps*          up 7-00:00:00      2  drain magic-node-[04,08]
     ps*          up 7-00:00:00      4    mix magic-node-07,salus,speed-[07,37]
     ps*          up 7-00:00:00      7  alloc magic-node-06,speed-[08,12,22-24,29]
     ps*          up 7-00:00:00     13   idle magic-node-[05,09-10],speed-[19,30-35,39-41]
     pg           up 7-00:00:00      1 drain* speed-05
     pg           up 7-00:00:00      2    mix speed-[01,17]
     pt           up 7-00:00:00      4   drng speed-[27,38,42-43]
     pt           up 7-00:00:00      2    mix speed-[17,37]
     pt           up 7-00:00:00      3   idle speed-[39-41]
     pa           up 7-00:00:00      1   drng speed-27
     pa           up 7-00:00:00      1    mix speed-01
     pa           up 7-00:00:00      2   idle speed-[03,25]
     cl           up 7-00:00:00      1 drain* speed-05
     cl           up 7-00:00:00      4   drng speed-[27,38,42-43]
     cl           up 7-00:00:00      3    mix speed-[01,17,37]
     cl           up 7-00:00:00      6   idle speed-[03,19,25,39-41]
     pc           up 7-00:00:00      1    mix salus
     pm           up 7-00:00:00      2  drain magic-node-[04,08]
     pm           up 7-00:00:00      1    mix magic-node-07
     pm           up 7-00:00:00      1  alloc magic-node-06
     pm           up 7-00:00:00      3   idle magic-node-[05,09-10]
     pn           up 7-00:00:00      1  down* stellar
     pn           up 7-00:00:00      2   idle matrix,nebulae

Remember that you only have 30 seconds before the job is essentially over, so if you do not see a
similar output, either adjust the sleep time in the script, or execute the squeue statement more
quickly. The squeue output listed above shows that your job 2654 is running on node speed-07, and
its time limit is 7 days, etc.

Once the job finishes, there will be a new file in the directory that the job was started from,
with the syntax of slurm-<job id>.out. This file represents the standard output (and
error, if there is any) of the job in question. Important information is often written to this
file.

2.4 Common Job Management Commands

Here is a summary of useful job management commands for handling various aspects of job
submission and monitoring on the Speed cluster:

• Submitting a job:

      sbatch -A <ACCOUNT> --mem=<MEMORY> -p <PARTITION> ./<myscript>.sh

• Checking your job(s) status:

      squeue -u <ENCSusername>

• Displaying cluster status:

      squeue

  • Use -A for per account (e.g., -A vidpro, -A aits)
  • Use -p for per partition (e.g., -p ps, -p pg, -p pt), etc.
• Displaying job information:

      squeue --job <job-ID>

• Displaying individual job steps: (to see which step failed if you used srun)

      squeue -las

• Monitoring job and cluster status: (view sinfo and watch the queue for your job(s))

      watch -n 1 "sinfo -Nel -pps,pt,pg,pa && squeue -la"

• Canceling a job:

      scancel <job-ID>

• Holding a job:

      scontrol hold <job-ID>

• Releasing a job:

      scontrol release <job-ID>

• Getting job statistics: (including useful metrics like "maxvmem")

      sacct -j <job-ID>

  maxvmem is one of the more useful stats that you can elect to display as a format option.

      % sacct -j 2654
      JobID           JobName  Partition    Account  AllocCPUS      State ExitCode
      ------------ ---------- ---------- ---------- ---------- ---------- --------
      2654               envs         ps     speed1          1  COMPLETED      0:0
      2654.batch        batch                speed1          1  COMPLETED      0:0
      2654.extern      extern                speed1          1  COMPLETED      0:0
      % sacct -j 2654 -o jobid,user,account,MaxVMSize,Reason%10,TRESUsageOutMax%30
      ...
      2654.extern                speed1    296312K              energy=0,fs/disk=343

  See man sacct or sacct -e for details of the available formatting options. You can define your
  preferred default format in the SACCT_FORMAT environment variable in your .cshrc or .bashrc files.

• Displaying job efficiency: (including CPU and memory utilization)

      seff <job-ID>

  Don't execute it on RUNNING jobs (only on completed/finished jobs), else efficiency statistics
  may be misleading. If you define the following directive in your batch script, your GCS ENCS
  email address will receive an email with seff's output when your job is finished.

      #SBATCH --mail-type=ALL

  Output example:

      Job ID: XXXXX
      Cluster: speed
      User/Group: user1/user1
      State: COMPLETED (exit code 0)
      Nodes: 1
      Cores per node: 4
      CPU Utilized: 00:04:29
      CPU Efficiency: 0.35% of 21:32:20 core-walltime
      Job Wall-clock time: 05:23:05
      Memory Utilized: 2.90 GB
      Memory Efficiency: 2.90% of 100.00 GB

2.5 Advanced sbatch Options

In addition to the basic sbatch options presented earlier, there are several advanced options that are
generally useful:

• E-mail notifications: requests the scheduler to send an email when the job changes state.

      --mail-type=<TYPE>

  <TYPE> can be ALL, BEGIN, END, or FAIL.
  Mail is sent to the default address of <ENCSusername>@encs.concordia.ca, which you can
  consult via webmail.encs.concordia.ca (use VPN from off-campus) unless a different address
  is supplied (see --mail-user). The report sent when a job ends includes job runtime, as well
  as the maximum memory value hit (maxvmem).

  To specify a different email address for notifications rather than the default, use

      --mail-user email@domain.com
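  For example, both directives could appear together near the top of a batch script (reusing the
  placeholder address from above):

      #SBATCH --mail-type=ALL
      #SBATCH --mail-user=email@domain.com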

• Export environment variables used by the script:

      --export=ALL
      --export=NONE
      --export=VARIABLES
    • -

      Job runtime: +

      Job runtime: sets a job runtime of min or HH:MM:SS. Note that if you give a single number, + that represents minutes, not hours. The set runtime should not exceed the default maximums of + 24h for interactive jobs and 7 days for batch jobs.

      -
      -     -t <MINUTES> or DAYS-HH:MM:SS
      -
      -

      sets a job runtime of min or HH:MM:SS. Note that if you give a single number, that represents - minutes, not hours. The set runtime should not exceed the default maximums of 24h for - interactive jobs and 7 days for batch jobs. +

      +     -t <MINUTES> or DAYS-HH:MM:SS
      +
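  For instance, the following two submissions request the same two-hour limit, first in minutes and
  then in DAYS-HH:MM:SS form (illustrative only):

      sbatch -t 120 --mem=1G -p ps ./<myscript>.sh
      sbatch -t 0-02:00:00 --mem=1G -p ps ./<myscript>.sh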

• Job Dependencies: Runs the job only when the specified job <job-ID> finishes. This is useful for
  creating job chains where subsequent jobs depend on the completion of previous ones.

      --depend=<state:job-ID>
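  As an illustration of a simple job chain (afterok is one of the standard SLURM dependency states;
  2654 is just the example job-ID used earlier), a follow-up job can be made to start only after the
  first one completes successfully:

      sbatch --depend=afterok:2654 ./<myscript>.sh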

Note: sbatch options can be specified during the job-submission command, and these override
existing script options (if present). The syntax is

    sbatch [options] path/to/script

but unlike in the script, the options are specified without the leading #SBATCH, e.g.:

    sbatch -J sub-test --chdir=./ --mem=1G ./env.sh

2.6 Array Jobs

Array jobs are those that start a batch job or a parallel job multiple times. Each iteration of the job
array is called a task and receives a unique job ID. Array jobs are particularly useful for running a
large number of similar tasks with slight variations.

To submit an array job (only supported for batch jobs), use the --array option of the sbatch
command as follows:

    sbatch --array=n-m[:s] <script>

where

    • n: indicates the start-id.
    • m: indicates the max-id.
    • s: indicates the step size.
Examples:

• Submit a job with 1 task where the task-id is 10.

      sbatch --array=10 array.sh

• Submit a job with 10 tasks numbered consecutively from 1 to 10.

      sbatch --array=1-10 array.sh

• Submit a job with 5 tasks numbered with a step size of 3 (task-ids 3,6,9,12,15).

      sbatch --array=3-15:3 array.sh

• Submit a job with 50000 elements, where %a maps to the task-id between 1 and 50K.

      sbatch --array=1-50000 -N1 -i my_in_%a -o my_out_%a array.sh

Output files for Array Jobs: The default output and error-files are slurm-job_id_task_id.out.
This means that Speed creates an output and an error-file for each task generated by the array-job, as
well as one for the super-ordinate array-job. To alter this behavior use the -o and -e options of
sbatch.

For more details about Array Job options, please review the manual pages for sbatch by executing
the following at the command line on speed-submit:

    man sbatch
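The array.sh script used above is not reproduced in this manual; as a rough, hypothetical sketch
(the input-file names and the ./process step are placeholders, not part of our samples), an array
task script typically keys its work off $SLURM_ARRAY_TASK_ID:

    #!/encs/bin/tcsh

    #SBATCH --job-name=array-demo      ## Give the job a name
    #SBATCH -o slurm-%A_%a.out         ## %A = array job-ID, %a = task-ID

    ## Each task processes its own (hypothetical) input file, e.g. input-3.dat for task 3
    echo "Task $SLURM_ARRAY_TASK_ID running on `hostname`"
    ./process input-$SLURM_ARRAY_TASK_ID.dat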

    2.7 Requesting Multiple Cores (i.e., Multithreading Jobs)

For jobs that can take advantage of multiple machine cores, you can request up to 32 cores (per job)
in your script using the following options:

    #SBATCH -n <#cores for processes>
    #SBATCH -n 1
    #SBATCH -c <#cores for threads of a single process>

Both sbatch and salloc support -n on the command line, and it should always be used either in the
script or on the command line, as the default is n = 1.

Important Considerations:

• Do not request more cores than you think will be useful, as larger-core jobs are more difficult to
  schedule.

• If you are running a program that scales out to the maximum single-machine core count available,
  please request 32 cores to avoid node oversubscription (i.e., overloading the CPUs).

Note:

  • --ntasks or --ntasks-per-node (-n) refers to processes (usually the ones run with srun).

  • --cpus-per-task (-c) corresponds to threads per process.

Some programs consider them equivalent, while others do not. For example, Fluent uses
--ntasks-per-node=8 and --cpus-per-task=1, whereas others may set --cpus-per-task=8 and
--ntasks-per-node=1. If one of these is not 1, some applications need to be configured to use n * c
total cores.

Core count associated with a job appears under "AllocCPUS" in the sacct -j <job-id> output.
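For example (a minimal sketch; my_threaded_app is a placeholder binary), a job consisting of one
process with 8 threads would be requested as:

    #SBATCH -n 1
    #SBATCH -c 8

    srun ./my_threaded_app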

    [serguei@speed-submit src] % squeue -l
    Thu Oct 19 20:32:32 2023
    JOBID PARTITION     NAME     USER    STATE       TIME  TIME_LIMI  NODES NODELIST(REASON)
    ...
    2652.7       gydra_pmi+                speed1         20  COMPLETED      0:0

2.8 Interactive Jobs

Interactive job sessions allow you to interact with the system in real-time. These sessions are
particularly useful for tasks such as testing, debugging, optimizing code, setting up environments,
and other preparatory work before submitting batch jobs.

2.8.1 Command Line

To request an interactive job session, use the salloc command with appropriate options. This is
similar to submitting a batch job but allows you to run shell commands interactively within the
allocated resources. For example:

    salloc -J interactive-test --mem=1G -p ps -n 8

Within the allocated salloc session, you can run shell commands as usual. It is recommended to use
srun for compute-intensive steps within salloc. If you need a quick, short job just to compile
something on a GPU node, you can use an interactive srun directly. For example, a 1-hour
allocation:

For tcsh:

    srun --pty -n 8 -p pg --gpus=1 --mem=1G -t 60 /encs/bin/tcsh

For bash:

    srun --pty -n 8 -p pg --gpus=1 --mem=1G -t 60 /encs/bin/bash

2.8.2 Graphical Applications

To run graphical UI applications (e.g., MATLAB, Abaqus CAE, IDEs like PyCharm, VSCode,
Eclipse, etc.) on Speed, you need to enable X11 forwarding from your client machine to Speed and
then to the compute node. To do so, follow these steps:

1. Run an X server on your client machine:

   • Windows: Use MobaXterm with X turned on, or Xming + PuTTY with X11 forwarding, or XOrg
     under Cygwin
   • macOS: Use XQuartz – use its xterm and ssh -X
   • Linux: Use ssh -X speed.encs.concordia.ca

   For more details, see How do I remotely launch X (Graphical) applications?

2. Verify that X11 forwarding is enabled by printing the DISPLAY variable:

       echo $DISPLAY

3. Start an interactive session with X11 forwarding enabled (use the --x11 option with salloc or
   srun), for example:

       salloc -p ps --x11=first --mem=4G -t 0-06:00

    4. Once landed on a compute node, verify DISPLAY again.
5. Set the XDG_RUNTIME_DIR variable to a directory in your speed-scratch space:

       mkdir -p /speed-scratch/$USER/run-dir
       setenv XDG_RUNTIME_DIR /speed-scratch/$USER/run-dir

6. Launch your graphical application:

       module load matlab/R2023a/default
       matlab

Note: with X11 forwarding the graphical rendering is happening on your client machine! That is,
you are not using GPUs on Speed to render graphics; instead, all graphical information is forwarded
from Speed to your desktop or laptop over X11, which in turn renders it using its own graphics card.
Thus, for GPU rendering jobs either keep them non-interactive or use VirtualGL.

Here's an example of starting PyCharm (see Figure 5). Note: If using VSCode, it's currently only
supported with the --no-sandbox option.

TCSH version:

    ssh -X speed (XQuartz xterm, PuTTY or MobaXterm have X11 forwarding too)
    [speed-submit] [/home/c/carlos] > echo $DISPLAY
    localhost:14.0
    [speed-submit] [/home/c/carlos] > cd /speed-scratch/$USER
    [speed-submit] [/speed-scratch/carlos] > echo $DISPLAY
    localhost:13.0
    [speed-submit] [/speed-scratch/carlos] > salloc -pps --x11=first --mem=4Gb -t 0-06:00
    [speed-07] [/speed-scratch/carlos] > echo $DISPLAY
    localhost:42.0
    [speed-07] [/speed-scratch/carlos] > hostname
    speed-07.encs.concordia.ca
    [speed-07] [/speed-scratch/carlos] > setenv XDG_RUNTIME_DIR /speed-scratch/$USER/run-dir
    [speed-07] [/speed-scratch/carlos] > /speed-scratch/nag-public/bin/pycharm.sh

BASH version:

    bash-3.2$ ssh -X speed (XQuartz xterm, PuTTY or MobaXterm have X11 forwarding too)
    serguei@speed's password:
    [serguei@speed-submit ~] % echo $DISPLAY
    localhost:14.0
    [serguei@speed-submit ~] % salloc -p ps --x11=first --mem=4Gb -t 0-06:00
    bash-4.4$ echo $DISPLAY
    localhost:77.0
    bash-4.4$ hostname
    speed-01.encs.concordia.ca
    bash-4.4$ export XDG_RUNTIME_DIR=/speed-scratch/$USER/run-dir
    bash-4.4$ /speed-scratch/nag-public/bin/pycharm.sh
Figure 5: Launching PyCharm on a Speed Node
2.8.3 Jupyter Notebooks

2.8.3.1 Jupyter Notebook in Singularity
To run Jupyter Notebooks using Singularity (more on Singularity see Section 2.16), follow these
steps:

    1. Connect to Speed, e.g. interactively, using salloc
2. Load the Singularity module: module load singularity/3.10.4/default
3. Execute this Singularity command on a single line or save it in a shell script from our
   GitHub repo where you could easily invoke it.

       srun singularity exec -B $PWD\:/speed-pwd,/speed-scratch/$USER\:/my-speed-scratch,/nettemp \
       --env SHELL=/bin/bash --nv /speed-scratch/nag-public/openiss-cuda-conda-jupyter.sif \
       /bin/bash -c '/opt/conda/bin/jupyter notebook --no-browser --notebook-dir=/speed-pwd \
       --ip="*" --port=8888 --allow-root'

4. In a new terminal window, create an ssh tunnel between your computer and the node
   (speed-XX) where Jupyter is running (using speed-submit as a "jump server", see, e.g., in PuTTY,
   in Figure 6 and Figure 7)

       ssh -L 8888:speed-XX:8888 <ENCS-username>@speed-submit.encs.concordia.ca

   Don't close the tunnel after establishing it.

5. Open a browser, and copy your Jupyter's token (it's printed to you in the terminal) and paste it
   in the browser's URL field. In our case, the URL is:

       http://localhost:8888/?token=5a52e6c0c7dfc111008a803e5303371ed0462d3d547ac3fb

6. Access the Jupyter Notebook interface in your browser.
Figure 8: Jupyter running on a Speed node

Another sample is the OpenISS-derived containers with Conda and Jupyter, see Section 2.15.4 for
details.

2.8.3.2 Jupyter Notebook in Conda
For setting up Jupyter Labs with Conda and Pytorch, follow these steps:

Environment preparation: (only once, takes some time to run to install all required dependencies)

1. Start an interactive session, and navigate to your speed-scratch directory:

       salloc --mem=20G --gpus=1
       cd /speed-scratch/$USER

2. Load and initialize the environment:

       module load anaconda/2023.03/default
       conda init tcsh
       source ~/.tcshrc

3. Set up the Conda environment by running setup_conda.sh (on the compute node salloc brought
   you to, not on speed-submit) as shown in Figure 9:

       ./setup_conda.sh

   #!/encs/bin/tcsh

   mkdir -p /speed-scratch/$USER/Jupyter
   module load anaconda3/2023.03/default
   setenv TMPDIR /speed-scratch/$USER/tmp
   setenv TMP /speed-scratch/$USER/tmp
   setenv CONDA_PKGS_DIRS /speed-scratch/$USER/Jupyter/pkgs
   conda create -p /speed-scratch/$USER/Jupyter/jupyter-env -y
   conda activate /speed-scratch/$USER/Jupyter/jupyter-env
   conda install -c conda-forge jupyterlab -y
   pip3 install --quiet torch --index-url https://download.pytorch.org/whl/cu118
   exit

   Figure 9: Source code for setup_conda.sh

The script will:

   • create a Jupyter directory (change Jupyter to any name of your choice in the script)
   • set environment variables
   • create a conda environment named jupyter-env
   • install JupyterLab and pytorch
   • exit the interactive session

Launching a JupyterLab instance from speed-submit:

1. Run the start_jupyterlab.sh script each time you need to launch JupyterLab from the
   submit node. The script will:

   • allocate resources for your job on a compute node
   • start the jupyter server by running run_jupyterlab.sh
   • print the ssh command that you can use to connect to the compute node running
     the jupyter notebook (this is done in a new terminal)
   • print the token/link to the jupyter server to paste in a web browser (starting with
     http://127.0.0.1/...)

2. Open a browser, and copy your Jupyter's token and paste it in the browser's URL field.

2.8.3.3 Jupyter Notebook in Python venv
This is an example of JupyterLab running in a Python virtual environment (venv) on Speed.

Note: Use of Python virtual environments is preferred over Conda at Alliance Canada
clusters. If you prefer to make jobs that are more compatible between Speed and Alliance
clusters, use Python venvs. See https://docs.alliancecan.ca/wiki/Anaconda/en and
https://docs.alliancecan.ca/wiki/JupyterNotebook.

• Environment preparation: for the FIRST time only:

  1. Go to your speed-scratch directory: cd /speed-scratch/$USER
  2. Open an interactive session: salloc --mem=50G --gpus=1 --constraint=el9
  3. Create a Python venv and install jupyterlab+pytorch

         module load python/3.11.5/default
         setenv TMPDIR /speed-scratch/$USER/tmp
         setenv TMP /speed-scratch/$USER/tmp
         python -m venv $TMPDIR/jupyter-venv
         source /speed-scratch/$USER/tmp/jupyter-venv/bin/activate.csh
         pip install jupyterlab
         pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
         exit

• Running Jupyter, from speed-submit:

  1. Open an interactive session: salloc --mem=50G --gpus=1 --constraint=el9

         cd /speed-scratch/$USER
         module load python/3.11.5/default
         setenv PIP_CACHE_DIR /speed-scratch/$USER/tmp/cache
         source /speed-scratch/$USER/tmp/jupyter-venv/bin/activate.csh
         jupyter lab --no-browser --notebook-dir=$PWD --ip="0.0.0.0" --port=8888 --port-retries=50

  2. Verify which port the system has assigned to Jupyter: http://localhost:XXXX/lab?token=
  3. SSH tunnel creation: similar to Jupyter in Singularity, see Section 2.8.3.1
  4. Open a browser and type: localhost:XXXX (using the port assigned)
2.8.4 Visual Studio Code

This is an example of running VScode; it's similar to Jupyter notebooks, but it doesn't use containers.
Note: this is a Web-based version; there exists the local (workstation) – remote (speed-node)
client-server version too, but it is for advanced users and is out of scope here (so no support, use it
at your own risk).

• Environment preparation: for the FIRST time:

  1. Go to your speed-scratch directory: cd /speed-scratch/$USER
  2. Create a vscode directory: mkdir vscode
  3. Go to vscode: cd vscode
  4. Create home and projects: mkdir {home,projects}
  5. Create another directory: mkdir -p /speed-scratch/$USER/run-user

• Running VScode

  1. Go to your vscode directory: cd /speed-scratch/$USER/vscode
  2. Open an interactive session: salloc --mem=10Gb --constraint=el9
  3. Set environment variable: setenv XDG_RUNTIME_DIR /speed-scratch/$USER/run-user
  4. Run VScode, change the port if needed.

         /speed-scratch/nag-public/code-server-4.22.1/bin/code-server --user-data-dir=$PWD\/projects \
         --config=$PWD\/home/.config/code-server/config.yaml --bind-addr="0.0.0.0:8080" $PWD\/projects

  5. SSH tunnel creation: similar to Jupyter, see Section 2.8.3.1
  6. Open a browser and type: localhost:8080
  7. If the browser asks for a password, consult:

         cat /speed-scratch/$USER/vscode/home/.config/code-server/config.yaml

Figure 10: VScode running on a Speed node
2.9 Scheduler Environment Variables

The scheduler provides several environment variables that can be useful in your job scripts. These
variables can be accessed within the job using commands like env or printenv. Many of these
variables start with the prefix SLURM.

Here are some of the most useful environment variables:

• $TMPDIR (and $SLURM_TMPDIR): This is the path to the job's temporary space on the node.

• $SLURM_ARRAY_TASK_ID: For array jobs, this variable represents the task ID (refer to
  Section 2.6 for more details on array jobs).
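As a quick, throwaway check, you can print a few of these from inside a job to see what the
scheduler has set:

    srun env | grep ^SLURM | sort
    echo "Job $SLURM_JOB_ID was submitted from $SLURM_SUBMIT_DIR; temporary space is $TMPDIR"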

For a more comprehensive list of environment variables, refer to the SLURM documentation for Input
Environment Variables and Output Environment Variables.

An example script that utilizes some of these environment variables is in Figure 11.

    #!/encs/bin/tcsh

    #SBATCH --job-name=tmpdir      ## Give the job a name
    #SBATCH --mail-type=ALL        ## Receive all email type notifications
    ...
    srun STAR --inFiles $TMPDIR/input --parallel $SRUN_CPUS_PER_TASK --outFiles $TMPDIR/results
    rsync -av $TMPDIR/results/ $SLURM_SUBMIT_DIR/processed/

Figure 11: Source code for tmpdir.sh
2.10 SSH Keys for MPI

Some programs, such as Fluent, utilize MPI (Message Passing Interface) for parallel processing. MPI
requires 'passwordless login', which is achieved through SSH keys. Here are the steps to set up SSH
keys for MPI:

• Navigate to the .ssh directory

      cd ~/.ssh

• Generate a new SSH key pair (accept the default location and leave the passphrase blank)

      ssh-keygen -t ed25519

• Authorize the public key:

      cat id_ed25519.pub >> authorized_keys

  If the authorized_keys file does not exist, use

      cat id_ed25519.pub > authorized_keys

• Set permissions: ensure the correct permissions are set for the 'authorized_keys' file and your
  home directory (most users will already have these permissions by default):

      chmod 600 ~/.ssh/authorized_keys
      chmod 700 ~
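As an optional sanity check (speed-07 below stands for any node where you currently have a job
running), ssh-ing to that node should no longer prompt for a password once the key is authorized:

      ssh speed-07 hostname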

2.11 Creating Virtual Environments

The following documentation is specific to Speed; other clusters may have their own requirements.

Virtual environments are typically created using Conda or Python. Another option is Singularity
(detailed in Section 2.16). These environments are usually created once during an interactive session
before submitting a batch job to the scheduler.

The job script submitted to the scheduler should:

    1. Activate the virtual environment.
    2. Use the virtual environment.
    3. Deactivate the virtual environment at the end of the job.
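A minimal sketch of such a job script (assuming the Python venv named testenv from the example in
Section 2.11.2, and a hypothetical myscript.py) could look like:

    #!/encs/bin/tcsh

    #SBATCH --job-name=venv-job
    #SBATCH --mem=1G

    source /speed-scratch/$USER/tmp/testenv/bin/activate.csh   ## 1. activate
    python myscript.py                                         ## 2. use (placeholder script)
    deactivate                                                 ## 3. deactivate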

2.11.1 Anaconda

To create an Anaconda environment, follow these steps:

1. Request an interactive session

       salloc -p pg --gpus=1

2. Load the Anaconda module and create your Anaconda environment in your speed-scratch
   directory by using the --prefix option (without this option, the environment will be created in
   your home directory by default).

       module load anaconda3/2023.03/default
       conda create --prefix /speed-scratch/$USER/myconda

3. List environments (to view your conda environment)

       conda info --envs
       # conda environments:
       #
       base                  *  /encs/pkg/anaconda3-2023.03/root
                                /speed-scratch/a_user/myconda

4. Activate the environment

       conda activate /speed-scratch/$USER/myconda

5. Add pip to your environment (this will install pip and pip's dependencies, including python,
   into the environment).

       conda install pip

A consolidated example using Conda:

    salloc --mem=10G --gpus=1 -p pg -A <slurm account name>
    mkdir -p /speed-scratch/$USER
    cd /speed-scratch/$USER
    module load anaconda3/2023.03/default
    conda create -p /speed-scratch/$USER/pytorch-env
    conda activate /speed-scratch/$USER/pytorch-env
    conda install python=3.11.0
    pip3 install torch torchvision torchaudio --index-url \
    https://download.pytorch.org/whl/cu117
    ....
    conda deactivate
    exit # end the salloc session

If you encounter a "no space left" error while creating Conda environments, please refer to
Appendix B.3. Likely you forgot --prefix or the environment variables below.

Important Note: pip (and pip3) are package installers for Python. When you use pip install, it
installs packages from the Python Package Index (PyPI), whereas conda install installs packages
from Anaconda's repository.

2.11.1.1 Conda Env without --prefix
If you don't want to use the --prefix option every time you create a new environment and do not
want to use the default home directory, you can create a new directory and set the following
variables to point to the newly created directory, e.g.:

    mkdir -p /speed-scratch/$USER/conda
    setenv CONDA_ENVS_PATH /speed-scratch/$USER/conda
    setenv CONDA_PKGS_DIRS /speed-scratch/$USER/conda/pkg

If you want to make these changes permanent, add the variables to your .tcshrc or .bashrc
(depending on the default shell you are using).

2.11.2 Python

Setting up a Python virtual environment is straightforward. Here's an example that uses a Python
virtual environment:

    salloc --mem=10G --gpus=1 -p pg -A <slurm account name>
    mkdir -p /speed-scratch/$USER
    cd /speed-scratch/$USER
    module load python/3.9.1/default
    mkdir -p /speed-scratch/$USER/tmp
    setenv TMPDIR /speed-scratch/$USER/tmp
    setenv TMP /speed-scratch/$USER/tmp
    python -m venv $TMPDIR/testenv (testenv=name of the virtualEnv)
    source /speed-scratch/$USER/tmp/testenv/bin/activate.csh
    pip install <modules>
    deactivate
    exit

See, e.g., gurobi-with-python.sh

Important Note: our partition ps is used for CPU jobs, while pg, pt, and cl are used
for GPU jobs. You do not need to use --gpus when preparing environments for CPU jobs.

Note: Python environments are also preferred over Conda in some clusters, see the note
in Section 2.8.3.3.

2.12 Example Job Script: Fluent

    #!/encs/bin/tcsh

    #SBATCH --job-name=flu10000    ## Give the job a name
    #SBATCH --mail-type=ALL        ## Receive all email type notifications
    ...
    module avail ansys
    module load ansys/2023R2/default
    cd $TMPDIR
    set FLUENTNODES = "`scontrol show hostnames`"
    ...
    date

Figure 12: Source code for fluent.sh

The job script in Figure 12 runs Fluent in parallel over 32 cores. Notable aspects of this script
include requesting e-mail notifications (--mail-type), defining the parallel environment for
Fluent with -t$SLURM_NTASKS and -g-cnf=$FLUENTNODES, and setting $TMPDIR as the in-job
location for the "moment" rfile.out file. The script also copies everything ...
/disk/nobackup/<yourjob> (it starts with your job-ID) on the node running the job. Be cautious
with journal-file paths.

2.13 Example Job: EfficientDet

The following steps describe how to create an EfficientDet environment on Speed, as submitted by a
member of Dr. Amer's research group:

• Navigate to your speed-scratch directory:

      cd /speed-scratch/$USER

• Load the Python module:

      module load python/3.8.3

• Create and activate the virtual environment:

      python3 -m venv <env_name>
      source <env_name>/bin/activate.csh

• Install DL packages for EfficientDet:

      pip install tensorflow==2.7.0
      pip install lxml>=4.6.1
      pip install absl-py>=0.10.0
      ...
      pip install git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI

2.14 Java Jobs

Jobs that call Java have a memory overhead, which needs to be taken into account when assigning a
value to --mem. Even the most basic Java call, such as java -Xmx1G -version, will need to have
--mem=5G, with the 4 GB difference representing the memory overhead. Note that this memory
overhead grows proportionally with the value of -Xmx. For example, ...
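As an illustrative sketch of this rule (myapp.jar is a placeholder, not one of the manual's samples),
a basic Java batch script would pair -Xmx1G with --mem=5G:

    #!/encs/bin/tcsh

    #SBATCH --job-name=java-job
    #SBATCH --mem=5G        ## 1 GB heap (-Xmx1G) + ~4 GB JVM overhead, per the rule above

    srun java -Xmx1G -jar myapp.jar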

2.15 Scheduling on the GPU Nodes

Speed has various GPU types in various subclusters of its nodes.

• speed-05 and speed-17: The primary SPEED1 cluster has two GPU nodes, each with six ...

• speed-19: Has an AMD GPU, Tonga, 16GB of GPU RAM. This node along with the majority of
  the NVIDIA GPU nodes are in the cl partition (with restrictions) to run OpenCL, Vulkan, and
  HIP jobs.

Job scripts for the GPU queues differ in that they need these statements, which attach either a single
GPU or more GPUs to the job with the appropriate partition:

   #SBATCH --gpus=[1|x]
   #SBATCH -p [pg|pt|cl|pa]

The default max quota for x is 4.

Once your job script is ready, submit it to the GPU partition (queue) with:

   sbatch --mem=<MEMORY> -p pg ./<myscript>.sh

--mem and -p can reside in the script.

You can query nvidia-smi on the node running your job with:

   ssh <ENCSusername>@speed-[01|03|05|17|25|27|37-43]|nebulae nvidia-smi

The status of the GPU queues can be queried e.g. with:

   sinfo -p pg --long --Node
   sinfo -p pt --long --Node
   sinfo -p cl --long --Node
   sinfo -p pa --long --Node
   sinfo -p pn --long --Node

You can query rocm-smi on the AMD GPU node running your job with:

   ssh <ENCSusername>@speed-19 rocm-smi

    Important note for TensorFlow and PyTorch users: if you are planning to run TensorFlow and/or PyTorch multi-GPU jobs, please do not use the tf.distribute and/or torch.nn.DataParallel functions on speed-01, speed-05, or speed-17, as they will crash the compute node (100% certainty). This appears to be a defect in the current hardware architecture. The workaround is to either manually effect GPU parallelisation (see Section 2.15.1) (TensorFlow provides an example on how to do this), or to run on a single GPU, which is now the default for those nodes.
Important: Users without permission to use the GPU nodes can submit jobs to the various
GPU partitions, but those jobs will hang and never run. Their availability can be seen with:

    [serguei@speed-submit src] % sinfo -p pg --long --Node
    Thu Oct 19 22:31:04 2023
    NODELIST   NODES PARTITION       STATE CPUS    S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON
    ...
    speed-43       1        pt        idle 256    2:64:2 980275        0      1 gpu20,mi none

To specifically request a GPU node, add --gpus=[#GPUs] to your sbatch statement/script or
salloc statement request. For example:

   sbatch -t 10 --mem=1G --gpus=1 -p pg ./tcsh.sh

The request can be further specified to a specific node using -w or a GPU type or feature.
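For instance (an illustrative sketch; gpu32 is one of the feature labels visible in the scheduler
outputs below), a particular node or GPU feature can be requested like this:

   sbatch -t 10 --mem=1G --gpus=1 -p pg -w speed-05 ./tcsh.sh
   sbatch -t 10 --mem=1G --gpus=1 -p pg --constraint=gpu32 ./tcsh.sh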

    [serguei@speed-submit src] % squeue -p pg -o "%15N %.6D %7P %.11T %.4c %.8z %.6m %.8d %.6w %.8f %20G %20E"
    NODELIST         NODES PARTITI       STATE MIN_    S:C:T MIN_ME MIN_TMP_  WCKEY FEATURES GROUP DEPENDENCY
    speed-05             1 pg          RUNNING    1    *:*:*     1G        0 (null)   (null) 11929     (null)
    ...
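To tie the above together, here is a minimal sketch of a GPU job script; the script name, memory, time
limit, and program name are illustrative assumptions, not prescriptions, so adjust them to your workload:

    #!/encs/bin/bash
    #SBATCH --job-name=gpu-example      ## illustrative job name
    #SBATCH --mem=4G                    ## memory request, as discussed above
    #SBATCH -p pg                       ## one of the GPU partitions: pg, pt, cl, pa
    #SBATCH --gpus=1                    ## a single GPU (up to the default quota of 4)
    #SBATCH -t 60                       ## 60-minute time limit

    nvidia-smi                          ## confirm which GPU was allocated
    ./my-gpu-program                    ## placeholder for your own CUDA/TensorFlow/PyTorch work

Submitting it with sbatch ./gpu-example.sh is then equivalent to passing the same options on the
command line as shown earlier.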

2.15.1 P6 on Multi-GPU, Multi-Node

As described earlier, P6 cards are not compatible with Distribute and DataParallel functions (PyTorch,
TensorFlow) when running on multiple GPUs. One workaround is to run the job in multi-node,
single-GPU-per-node mode (this applies to the P6 nodes: speed-05, speed-17, speed-01):

    #SBATCH --nodes=2
    #SBATCH --gpus-per-node=1

An example script for training on multiple nodes with multiple GPUs is provided in
pytorch-multinode-multigpu.sh.
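A bare-bones sketch of such a job is shown below; the script and program names are placeholders, and the
provided pytorch-multinode-multigpu.sh remains the authoritative example:

    #!/encs/bin/bash
    #SBATCH --nodes=2                   ## two nodes...
    #SBATCH --gpus-per-node=1           ## ...with one GPU each, avoiding the P6 multi-GPU issue
    #SBATCH --ntasks-per-node=1         ## one task per node
    #SBATCH --mem=32G
    #SBATCH -t 720

    srun ./my-distributed-training      ## placeholder; srun launches one task on each node, and the
                                        ## program must coordinate across nodes via its own framework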

2.15.2 CUDA

When calling CUDA within job scripts, it is important to link to the desired CUDA libraries and set the
runtime link path to the same libraries. For example, to use the cuda-11.5 libraries, specify the following
in your Makefile:

    -L/encs/pkg/cuda-11.5/root/lib64 -Wl,-rpath,/encs/pkg/cuda-11.5/root/lib64

In your job script, specify the version of GCC to use prior to calling CUDA:

    module load gcc/9.3
2.15.3 Special Notes for Sending CUDA Jobs to the GPU Queues

Interactive jobs (Section 2.8) must be submitted to the GPU partition to compile and link. Several versions
of CUDA are installed in:

    /encs/pkg/cuda-11.5/root/
    /encs/pkg/cuda-10.2/root/
    /encs/pkg/cuda-9.2/root

For CUDA to compile properly for the GPU partition, edit your Makefile, replacing /usr/local/cuda with
one of the above paths.

2.15.4 OpenISS Examples

These examples represent more comprehensive research-like jobs for computer vision and other tasks with
longer runtimes (subject to the number of epochs and other parameters). They derive from the actual
research work of students and their theses and require the use of CUDA and GPUs. These examples are
available as “native” jobs on Speed and as Singularity containers.

Examples include:

2.15.4.1 OpenISS and REID

A computer-vision-based person re-identification system (e.g., motion capture-based tracking for stage
performance), part of the OpenISS project by Haotao Lai [12], using TensorFlow and Keras. The script is
available here: openiss-reid-speed.sh. The fork of the original repo [14] adjusted to run on Speed is
available here: openiss-reid-tfk. Detailed instructions on how to run it on Speed are in the README:
https://github.com/NAG-DevOps/speed-hpc/tree/master/src#openiss-reid-tfk

2.15.4.2 OpenISS and YOLOv3

The related code using the YOLOv3 framework is in the fork of the original repo [13], adjusted to run on
Speed, available here: openiss-yolov3.

Example job scripts can run on both CPUs and GPUs, as well as interactively using TensorFlow.

Detailed instructions on how to run these on Speed are in the README:
https://github.com/NAG-DevOps/speed-hpc/tree/master/src#openiss-yolov3

2.16 Singularity Containers

Singularity is a container platform designed to execute applications in a portable, reproducible, and secure
manner. Unlike Docker, Singularity does not require root privileges, making it more suitable for HPC
environments. If the /encs software tree does not have the required software available, another option is
to run Singularity containers. We run EL7 and EL9 flavors of Linux, and if some projects require Ubuntu
or other distributions, it is possible to run that software as a container, including those converted from
Docker. The currently recommended version of Singularity is singularity/3.10.4/default.
The example lambdal-singularity.sh showcases an immediate use of a container built for the Ubuntu-based
LambdaLabs software stack, originally built as a Docker image then pulled in as a Singularity container.
The source material used for the Docker image was our fork of their official repository:
https://github.com/NAG-DevOps/lambda-stack-dockerfiles.

Note: If you make your own containers or pull from DockerHub, use your /speed-scratch/$USER directory,
as these images may easily consume gigabytes of space in your home directory, quickly exhausting your
quota.
Tip: To check your quota and find big files, see Section B.3 and ENCS Data Storage.

We have also built equivalent OpenISS (Section 2.15.4) containers from their Docker counterparts for
teaching and research purposes [16]. The images from https://github.com/NAG-DevOps/openiss-dockerfiles
and their DockerHub equivalents https://hub.docker.com/u/openiss can be found in
/speed-scratch/nag-public with a ‘.sif’ extension. Some can be run in both batch and interactive modes,
covering basics with CUDA, OpenGL rendering, and computer vision tasks. Examples include Jupyter
notebooks with Conda support.

    /speed-scratch/nag-public:
    openiss-cuda-conda-jupyter.sif
    openiss-cuda-devicequery.sif
    ...
    openiss-opengl-cubes.sif
    openiss-opengl-triangle.sif
    openiss-reid.sif
    openiss-xeyes.sif

This section introduces working with Singularity, its containers, and what can and cannot be done with
Singularity on the ENCS infrastructure. For comprehensive documentation, refer to the authors’ guide:
https://www.sylabs.io/docs/.
Singularity containers are either built from an existing container, or from scratch. Building from scratch
requires a recipe file (think of it like a Dockerfile) and must be done with root permissions, which are not
available on the ENCS infrastructure. Therefore, built-from-scratch containers must be created on a
user-managed/personal system. There are three types of Singularity containers:

  • Squashfs containers: a read-only compressed “file”; this is the default build type.

“A common workflow is to use the ‘sandbox’ mode for container development and then build it as a default
(squashfs) Singularity image when done,” say Singularity’s authors about builds. File-system containers are
considered legacy and are not commonly used.
    -

    For many workflows, a Docker container might already exist. In this case, you can use + + + +

    For many workflows, a Docker container might already exist. In this case, you can use Singularity’s docker pull function as part of your virtual environment setup in an interactive job allocation:

    salloc --gpus=1 -n8 --mem=4Gb -t60
    cd /speed-scratch/$USER/
    singularity pull openiss-cuda-devicequery.sif docker://openiss/openiss-cuda-devicequery
    INFO:    Converting OCI blobs to SIF format
    INFO:    Starting build...

This method can be used for converting Docker containers directly on Speed. On GPU nodes, make sure
to pass the --nv flag to Singularity so its containers can access the GPUs. See the linked example for more
details.
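Once the image has been pulled, a minimal way to exercise it within the same allocation could look like the
sketch below; the image name matches the pull above, and --nv only matters on GPU nodes:

    cd /speed-scratch/$USER/
    singularity run --nv openiss-cuda-devicequery.sif      ## run the container's default action
    singularity shell --nv openiss-cuda-devicequery.sif    ## or open an interactive shell inside it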

3 Conclusion

The cluster operates on a “first-come, first-served” basis until it reaches full capacity. After that, job
positions in the queue are determined based on past usage. The scheduler does attempt to fill gaps, so
occasionally a single-core job with lower priority may be scheduled before a multi-core job with higher
priority.

3.1 Important Limitations

While Speed is a powerful tool, it is essential to recognize its limitations to use it effectively:

• New users are limited to a total of 32 cores and 4 GPUs. If you need more cores …
• Batch job sessions can run for a maximum of one week. Interactive jobs are limited to 24 hours; see
  Section 2.8.
• Scripts can live in your NFS-provided home directory, but substantial data should be stored in your
  cluster-specific directory (located at /speed-scratch/<USER>/). NFS is suitable for short-term activities
  but not for long-term operations. Data that a job will read multiple times should be copied at the start
  to the scratch disk of a compute node using $TMPDIR (and possibly $SLURM_SUBMIT_DIR). Intermediate
  job data should be produced in $TMPDIR, and once a job is near completion, these data should be copied
  to your NFS-mounted home directory (or other NFS-mounted space). In other words, IO-intensive
  operations should be performed locally whenever possible, reserving network activity for the start and
  end of jobs (see the sketch after this list).
• Your current resource allocation is based on past usage, which considers approximately one week’s
  worth of past wall clock time (time spent on the node(s)) and compute activity (on the node(s)).
• Jobs must always be run within the scheduler’s system. Repeat offenders who run jobs outside the
  scheduler risk losing cluster access.
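As a sketch of the local-I/O pattern described in the list above (dataset, program, and result names are
placeholders for your own files):

    #!/encs/bin/bash
    #SBATCH --mem=8G
    #SBATCH -t 120

    cp /speed-scratch/$USER/dataset.tar $TMPDIR/              ## stage the input once, at the start
    cd $TMPDIR && tar -xf dataset.tar
    $SLURM_SUBMIT_DIR/process dataset/ > results.out          ## I/O-heavy work happens on node-local disk
    cp results.out /speed-scratch/$USER/                      ## copy the final output back at the end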

3.2 Tips/Tricks

• Ensure that files and scripts have Linux line breaks. Use the file command to verify and dos2unix to
  convert if necessary.
• Before transferring a large number of files between NFS-mounted storage and the cluster, compress
  the files into a tar archive.
• If you plan to use a different shell (e.g., bash [26]), change the shell declaration at the beginning of
  your script(s).
• Request resources (cores, memory, GPUs) that closely match the actual needs of your job. …
  --mail-type=ALL, to adjust your job parameters.
• For any concerns or questions, email rt-ex-hpc AT encs.concordia.ca

3.3 Use Cases

• HPC Committee’s initial batch of about 6 students (end of 2019):
    • 10000 iterations job in Fluent finished in <26 hours vs. 46 hours in Calcul Quebec
• NAG’s MAC spoofer analyzer [2019], such as
  https://github.com/smokhov/atsm/tree/master/examples/flucid
    • compilation of forensic computing reasoning cases about false or true positives of hardware
      address spoofing in the labs
• S4 LAB/GIPSY R&D Group’s:
    • MARFCAT and MARFPCAT (OSS signal processing and machine learning tools for vulnerable
      and weak code analysis and network packet capture analysis) [21, 17, 6]
    • Web service data conversion and analysis
    • Forensic Lucid encoders (translation of large log data into Forensic Lucid [18] for forensic
      analysis)
    • Genomic alignment exercises

• Goutam Yelluru Gopal and Maria Amer. Separable self and mixed attention transformers for efficient
  object tracking. In IEEE/CVF Winter Conference on Applications of Computer Vision (WACV),
  Waikoloa, Hawaii, January 2024. https://arxiv.org/abs/2309.03979 and
  https://github.com/goutamyg/SMAT
• Goutam Yelluru Gopal and Maria Amer. Mobile vision transformer-based visual object tracking. In
  34th British Machine Vision Conference (BMVC), Aberdeen, UK, November 2023. …
• … https://doi.org/10.1177/0278364920913945

• The work “Haotao Lai. An OpenISS framework specialization for deep learning-based person
  re-identification. Master’s thesis, Department of Computer Science and Software Engineering,
  Concordia University, Montreal, Canada, August 2019.
  https://spectrum.library.concordia.ca/id/eprint/985788/” using TensorFlow and Keras on OpenISS,
  adjusted to run on Speed based on the repositories and their forks by the team.

A History

A.1 Acknowledgments

• The first 6 to 6.5 versions of this manual and early UGE job script samples, Singularity testing, and
  user support were produced/done by Dr. Scott Bunnell during his time at Concordia as a part of the
  NAG/HPC group. We thank him for his contributions.
• The HTML version with devcontainer support was contributed by Anh H Nguyen.
• Dr. Tariq Daradkeh was our IT Instructional Specialist from August 2022 to September 2023, working
  on the scheduler, scheduling research, end-user support, and integration of examples, such as YOLOv3
  in Section 2.15.4.2, among other tasks. We have a continued collaboration on HPC/scheduling research
  (see [8]).

A.2 Migration from UGE to SLURM

For long-term users who started off with Grid Engine, here are some resources to ease the transition and
map the job submission process.

• Queues are called “partitions” in SLURM. Our mapping from the GE queues to SLURM partitions is
  as follows (a brief before-and-after submission sketch follows this list):

         GE  => SLURM
         s.q    ps
         g.q    pg
         a.q    pa

  We also have a new partition pt that covers SPEED2 nodes, which previously did not exist.

• Commands and command options mappings are found in Figure 13 from:
  https://slurm.schedmd.com/rosetta.pdf
  https://slurm.schedmd.com/pdfs/summary.pdf
  Other related helpful resources from similar organizations who either used SLURM for a while or also
  transitioned to it:
  https://docs.alliancecan.ca/wiki/Running_jobs
  https://www.depts.ttu.edu/hpcc/userguides/general_guides/Conversion_Table_1.pdf
  https://docs.mpcdf.mpg.de/doc/computing/clusters/aux/migration-from-sge-to-slurm

  Figure 13: Rosetta Mappings of Scheduler Commands from SchedMD
• NOTE: If you have used UGE commands in the past, you probably still have the lines below in your
  shell startup files; they should now be removed, as they have no use in SLURM and will start giving
  “command not found” errors on login when the software is removed:

  csh/tcsh: sample .tcshrc file:

         # Speed environment set up
         if ($HOSTNAME == speed-submit.encs.concordia.ca) then
            source /local/pkg/uge-8.6.3/root/default/common/settings.csh
         endif

  Bourne shell/bash: sample .bashrc file:

         # Speed environment set up
         if [ $HOSTNAME = "speed-submit.encs.concordia.ca" ]; then
             . /local/pkg/uge-8.6.3/root/default/common/settings.sh
             printenv ORGANIZATION | grep -qw ENCS || . /encs/Share/bash/profile
         fi

  IMPORTANT NOTE: you will need to either log out and back in, or execute a new shell, for the
  environment changes in the updated .tcshrc or .bashrc file to be applied.
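As a purely illustrative before-and-after sketch of the queue-to-partition mapping above (the script name
is a placeholder):

         # Previously, with Grid Engine:  qsub -q g.q ./myjob.sh
         sbatch -p pg ./myjob.sh          # the SLURM equivalent, using the g.q -> pg mapping above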

A.3 Phases

Brief summary of Speed evolution phases:

A.3.1 Phase 5

Phase 5 saw the incorporation of the Salus, Magic, and Nebular subclusters (see Figure 2).

A.3.2 Phase 4

Phase 4 added 7 SuperMicro servers with 4x A100 80GB GPUs each, dubbed “SPEED2”. We also moved
from Grid Engine to SLURM.

A.3.3 Phase 3

Phase 3 added 4 vidpro nodes from Dr. Amer, totalling 6x P6 and 6x V100 GPUs.

A.3.4 Phase 2

Phase 2 saw 6x NVIDIA Tesla P6 and 8x more compute nodes added. The P6s replaced 4x of the FirePro
S7150.

A.3.5 Phase 1

Phase 1 of Speed was of the following configuration:

• Sixteen 32-core nodes, each with 512 GB of memory and approximately 1 TB of volatile-scratch disk
  space.

B Frequently Asked Questions

B.1 Where do I learn about Linux?

    All Speed users are expected to have a basic understanding of Linux and its commonly used commands. Here are some recommended resources:

Software Carpentry: Software Carpentry provides free resources to learn software, including a workshop
on the Unix shell. Visit Software Carpentry Lessons to learn more.

Udemy: There are numerous Udemy courses, including free ones, that will help you learn Linux. Active
Concordia faculty, staff and students have access to Udemy courses. A recommended starting point for
beginners is the course “Linux Mastery: Master the Linux Command Line in 11.5 Hours”. Visit Concordia’s
Udemy page to learn how Concordians can access Udemy.

B.2 How to use bash shell on Speed?

This section provides comprehensive instructions on how to utilize the bash shell on the Speed cluster.

B.2.1 How do I set bash as my login shell?

To set your default login shell to bash on Speed, your login shell on all GCS servers must be changed to
bash. To make this change, create a ticket with the Service Desk (or email help at concordia.ca) to request
that bash become your default login shell for your ENCS user account on all GCS servers.

B.2.2 How do I move into a bash shell on Speed?

To move to the bash shell, type bash at the command prompt:

    [speed-submit] [/home/a/a_user] > bash
    bash-4.4$ echo $0
    bash

Note how the command prompt changes from “[speed-submit] [/home/a/a_user] >” to “bash-4.4$” after
entering the bash shell.

B.2.3 How do I use the bash shell in an interactive session on Speed?

Below are examples of how to use bash as a shell in your interactive job sessions with both the salloc and
srun commands.

    • salloc -ppt --mem=100G -N 1 -n 10 /encs/bin/bash
    • srun --mem=50G -n 5 --pty /encs/bin/bash

Note: Make sure the interactive job requests memory, cores, etc.

B.2.4 How do I run scripts written in bash on Speed?

To execute bash scripts on Speed:

    1. Ensure that the shebang of your bash job script is #!/encs/bin/bash
    2. Use the sbatch command to submit your job script to the scheduler.
Check Speed GitHub for a sample bash job script.
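For instance, a minimal bash job script could look like this sketch (the job name and resource values are
arbitrary examples):

    #!/encs/bin/bash
    #SBATCH --job-name=bash-example
    #SBATCH --mem=1G
    #SBATCH -t 30

    echo "Running on $HOSTNAME with $SLURM_CPUS_ON_NODE CPU(s)"

It would then be submitted with sbatch ./bash-example.sh.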

B.3 How to resolve “Disk quota exceeded” errors?

B.3.1 Probable Cause

The “Disk quota exceeded” error occurs when your application has run out of disk space to write to. On
Speed, this error can be returned when:

1. The NFS-provided home is full and cannot be written to. You can verify this using the …
2. The “/tmp” directory on the speed node where your application is running is full and cannot be
   written to.

B.3.2 Possible Solutions

    1. Use the --chdir job script option to set the job working directory. This is the directory where the job will write output files.
2. Although local disk space is recommended for IO-intensive operations, the ‘/tmp’ directory on Speed
   nodes is limited to 1TB, so it may be necessary to store temporary data elsewhere. Review the
   documentation for each module used in your script to determine how to set working directories. The
   basic steps are:

   • Determine how to set working directories for each module used in your job script.
   • Create a working directory in speed-scratch for output files:

              mkdir -m 750 /speed-scratch/$USER/output

   • Create a subdirectory for recovery files:

              mkdir -m 750 /speed-scratch/$USER/recovery

   • Update the job script to write output to the directories created in your speed-scratch directory,
     e.g., /speed-scratch/$USER/output.

In the above example, $USER is an environment variable containing your ENCS username.
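For example, the header of a job script combining solution 1 with the directories created above might look
like this sketch (resource values are arbitrary):

    #!/encs/bin/bash
    #SBATCH --chdir=/speed-scratch/<ENCSusername>/output   ## job working directory; output files land here
    #SBATCH --mem=4G
    #SBATCH -t 60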

B.3.3 Example of setting working directories for COMSOL
• Create directories for recovery, temporary, and configuration files:

       mkdir -m 750 -p /speed-scratch/$USER/comsol/{recovery,tmp,config}

• Add the following command switches to the COMSOL command to use the directories created above:

       -recoverydir /speed-scratch/$USER/comsol/recovery
       -tmpdir /speed-scratch/$USER/comsol/tmp
       -configuration /speed-scratch/$USER/comsol/config

In the above example, $USER is an environment variable containing your ENCS username.

B.3.4 Example of setting working directories for Python Modules

By default, when adding a Python module, the /tmp directory is set as the temporary repository for file
downloads. The size of the /tmp directory on speed-submit is too small for PyTorch. To add a Python
module:

• Create your own tmp directory in your speed-scratch directory:

        mkdir /speed-scratch/$USER/tmp

• Use the temporary directory you created:

        setenv TMPDIR /speed-scratch/$USER/tmp

• Attempt the installation of PyTorch.

In the above example, $USER is an environment variable containing your ENCS username.
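The setenv syntax above is for tcsh; an equivalent sketch for bash users would be the following, where the
pip invocation is just an example of a download-heavy install (a virtual environment is still recommended):

    mkdir -p /speed-scratch/$USER/tmp
    export TMPDIR=/speed-scratch/$USER/tmp
    pip install torch        ## example install that now uses the larger TMPDIR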

B.4 How do I check my job’s status?

When a job with a job ID of 1234 is running or terminated, you can track its status using the following
commands:

• Use the “sacct” command to view the status of a job:

       sacct -j 1234

• Use the “squeue” command to see if the job is sitting in the queue:

       squeue -j 1234

• Use the “sstat” command to find long-term statistics on the job after it has terminated and the
  slurmctld has purged it from its tracking state into the database:

       sstat -j 1234
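For a more compact view, sacct also accepts a --format option; one possible field selection (these are
standard sacct field names) is sketched below:

    sacct -j 1234 --format=JobID,JobName,Partition,State,ExitCode,Elapsed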

B.5 Why is my job pending when nodes are empty?

B.5.1 Disabled nodes

It is possible that one or more of the Speed nodes are disabled for maintenance. To verify if Speed nodes
are disabled, check if they are in a draining or drained state:

    [serguei@speed-submit src] % sinfo --long --Node
    Thu Oct 19 21:25:12 2023
    NODELIST   NODES PARTITION       STATE CPUS    S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON
    ...
    speed-40       1        pt        idle 256    2:64:2 980275        0      1 gpu20,mi none
    speed-41       1        pt        idle 256    2:64:2 980275        0      1 gpu20,mi none
    speed-42       1        pt        idle 256    2:64:2 980275        0      1 gpu20,mi none
    speed-43       1        pt        idle 256    2:64:2 980275        0      1 gpu20,mi none

Note which nodes are in the drained state. The reason for the drained state can be found in the reason
column. Your job will run once an occupied node becomes available or the maintenance is completed, and
the disabled nodes have a state of idle.
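A quicker way to list only the unavailable nodes together with the recorded reason is sketched below
(sinfo -R reports nodes that are down, drained, or failing):

    sinfo -R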

B.5.2 Error in job submit request.

It is possible that your job is pending because it requested resources that are not available within Speed.
To verify why job ID 1234 is not running, execute:

    sacct -j 1234

A summary of the reasons can be obtained via the squeue command.

C Sister Facilities

Below is a list of resources and facilities similar to Speed at various capacities. Depending on your research
group and needs, they might be available to you. They are not managed by HPC/NAG of AITS, so contact
their respective representatives.

• … https://www.concordia.ca/ginacody/aits/public-servers.html.
• apini.encs cluster for teaching and MPI programming (see the corresponding course in CSSE),
  managed by CSSE.
• Computer Science and Software Engineering (CSSE) Virya GPU Cluster. For CSSE members only.
  The cluster has 4 nodes with a total of 32 NVIDIA GPUs (a mix of V100s …).
• Dr. Maria Amer’s VidPro group’s nodes in Speed (-01, -03, -25, -27) with additional V100 and P6
  GPUs.
• There are various Lambda Labs and other GPU servers and similar computers acquired by individual
  researchers; if you are a member of their research group, contact them directly. These resources are
  not managed by us.
• Dr. Amin Hammad’s construction.encs Lambda Labs station
• Digital Research Alliance Canada (Compute Canada / Calcul Quebec), https://alliancecan.ca/.
  Follow this link for information on how to obtain access (students need to be sponsored by their
  supervising faculty members, who should create accounts first). Their SLURM examples are here:
  https://docs.alliancecan.ca/wiki/Running_jobs

D Software Installed On Speed

This is a generated section by a script; last updated on Fri Dec 20 04:37:31 PM EST 2024. We have two
major software trees: Scientific Linux 7 (EL7), which is outgoing, and AlmaLinux 9 (EL9). After major
synchronization of software packages is complete, we will stop maintaining the EL7 tree and will migrate
the remaining nodes to EL9.

Use --constraint=el7 to select EL7-only installed nodes for their software packages. Conversely, use
--constraint=el9 for the EL9-only software. These options would be used as a part of your job parameters
in either #SBATCH or on the command line.
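For example, to pin a job to one of the trees, either form below can be used (the script name is a
placeholder):

    #SBATCH --constraint=el9              ## inside a job script, or equivalently:
    sbatch --constraint=el7 ./myjob.sh    ## on the sbatch command line at submission time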

    NOTE: this list does not include packages installed directly on the OS (yet).

D.1 EL7

    Not all packages are intended for HPC, but the common tree is available on Speed as well as teaching labs’ desktops.

D.2 EL9

References

[9]   L. Drummond, H. Banh, N. Ouedraogo, H. Ho, and E. Essel. Effects of nozzle convergence angle on
      the flow characteristics of a synthetic circular jet in a crossflow. In Bulletin of the American Physical
      Society, editor, 76th Annual Meeting of the Division of Fluid Dynamics, November 2023.

[18]  Serguei A. Mokhov. Intensional Cyberforensics. PhD thesis, Department of Computer Science and
      Software Engineering, Concordia University, Montreal, Canada, September 2013. Online at
      http://arxiv.org/abs/1312.0466.

[19]  Serguei A. Mokhov, Michael J. Assels, Joey Paquet, and Mourad Debbabi. Automating MAC spoofer
      evidence gathering and encoding for investigations. In Frederic Cuppens et al., editors, …

[21]  Serguei A. Mokhov, Joey Paquet, and Mourad Debbabi. The use of NLP techniques in static code
      analysis to detect weaknesses and vulnerabilities. In Maria Sokolova and Peter van Beek, editors,
      Proceedings of Canadian Conference on AI’14, volume 8436 of LNAI, pages 326–332. Springer,
      May 2014. Short paper.

[22]  Parna Niksirat, Adriana Daca, and Krzysztof Skonieczny. The effects of reduced-gravity on planetary
      rover mobility. International Journal of Robotics Research, 39(7):797–811, 2020.
      https://doi.org/10.1177/0278364920913945.

[23]  N. Ouedraogo, A. Cyrus, and E. Essel. Effects of Reynolds number on the wake characteristics of a
      Notchback Ahmed body. In Bulletin of the American Physical Society, editor, 76th Annual Meeting
      of the Division of Fluid Dynamics, November 2023.

[24]  Newton F. Ouedraogo and Ebenezer E. Essel. Unsteady wake interference of unequal-height tandem
      cylinders mounted in a turbulent boundary layer. Journal of Fluid Mechanics, 977:A52, 2023.
      https://doi.org/10.1017/jfm.2023.952.

[25]  Newton F. Ouedraogo and Ebenezer E. Essel. Effects of Reynolds number on the wake characteristics
      of a Notchback Ahmed body. Journal of Fluids Engineering, 146(11):111302, 05 2024.

[26]  Chet Ramey. The Bourne-Again Shell. In Brown and Wilson [7]. http://aosabook.org/en/bash.html.

[27]  Farshad Rezaei and Marius Paraschivoiu. Placing a small-scale vertical axis wind turbine on roof-top
      corner of a building. In Proceedings of the CSME International Congress, June 2022.
      https://doi.org/10.7939/r3-j7v7-m909.

[28]  Farshad Rezaei and Marius Paraschivoiu. Computational challenges of simulating vertical axis wind
      turbine on the roof-top corner of a building. Progress in Canadian Mechanical Engineering, 6,
      1–6 2023. http://hdl.handle.net/11143/20861.

[29]  Rob Schreiber. MATLAB. Scholarpedia, 2(6):2929, 2007. http://www.scholarpedia.org/article/MATLAB.

[30]  The MARF Research and Development Group. The Modular Audio Recognition Framework and its
      Applications. [online], 2002–2014. http://marf.sf.net and http://arxiv.org/abs/0905.1235, last viewed
      May 2015.

\ No newline at end of file
diff --git a/doc/web/speed-manual.css b/doc/web/speed-manual.css
index bb79f74..ed5ead9 100644
--- a/doc/web/speed-manual.css
+++ b/doc/web/speed-manual.css
diff --git a/software-list.md b/software-list.md
index 0cdf061..5592387 100644
--- a/software-list.md
+++ b/software-list.md
@@ -1,7 +1,7 @@
-# Software Installed On Speed {#sect:software-details}
+# Software Installed On Speed {#sect:software-list}
 
-This is a generated section by a script; last updated on *Tue Jul 23
-10:48:52 PM EDT 2024*. We have two major software trees: Scientific
+This is a generated section by a script; last updated on *Fri Dec 20
+04:37:31 PM EST 2024*. We have two major software trees: Scientific
 Linux 7 (EL7), which is outgoing, and AlmaLinux 9 (EL9). After major
 synchronization of software packages is complete, we will stop
 maintaining the EL7 tree and will migrate the remaining nodes to EL9.
@@ -1287,6 +1287,8 @@ on Speed as well as teaching labs' desktops.
 
 - `OpenFOAM-2.3.1`
 
+- `OpenFOAM-2.4.0`
+
 - `OpenFOAM-3.0.1`
 
 - `OpenFOAM-5.0`
@@ -2005,6 +2007,8 @@ on Speed as well as teaching labs' desktops.
 
 - `DbVisualizer-24.1.5`
 
+- `EasyBuild`
+
 - `emacs-27.2`
 
 - `expect-5.45.4`
@@ -2067,6 +2071,8 @@ on Speed as well as teaching labs' desktops.
 
 - `firefox_french-91.9.1`
 
+- `gcc-11.3.0`
+
 - `gcc-12.2.0`
 
 - `gcc-4.9.2`
@@ -2141,6 +2147,10 @@ on Speed as well as teaching labs' desktops.
 
 - `matlab-R2023b`
 
+- `matlab-R2024a`
+
+- `matlab-R2024b`
+
 - `mesa-19.0.3`
 
 - `modules-3.2.10`
@@ -2195,6 +2205,10 @@ on Speed as well as teaching labs' desktops.
 
 - `OpenFOAM-11.0`
 
+- `OpenFOAM-12.0`
+
+- `OpenFOAM-2.4.0`
+
 - `OpenFOAM-8.0`
 
 - `OpenFOAM-v2012`