forked from PDC-support/introduction-to-pdc-retired
-
Notifications
You must be signed in to change notification settings - Fork 0
/
jobs.tex
129 lines (108 loc) · 4.25 KB
/
jobs.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
\section{Running jobs}
\frame{
\frametitle{How to run programs}
\begin{itemize}
\item After login we are on a \textit{login node} used only for:
\begin{itemize}
\item submitting jobs,
\item editing files,
\item compiling small programs,
\item other computationally light tasks.
\end{itemize}
\item \alert{Never run calculations interactively on the login node}
\item Instead, request compute resources \textit{interactively} or via a \textit{batch script} \\~
\item All jobs must be connected to a time allocation
\item For courses, PDC sets up a \textit{reservation} for resources \\~
\item To manage the workload on the clusters, PDC uses a queueing/batch system
\end{itemize}
}
\subsection{SLURM}
\frame{
\frametitle{SLURM workload manager}
\framesubtitle{Simple Linux Utility for Resource Management}
\begin{itemize}
\item Open source, fault-tolerant, and highly scalable cluster management and job scheduling system
\begin{itemize}
\item \alert{Allocates} exclusive and/or non-exclusive access to \alert{resources} for some duration of time
\item Provides a framework for \alert{starting}, \alert{executing}, and \alert{monitoring} work on the set of allocated nodes
\item \alert{Arbitrates contention} for resources by managing a queue
\end{itemize}
\item Job priority is computed based on
\begin{description}
\item [Age] the length of time a job has been waiting
\item [Fair-share] the difference between the portion of the computing resource that has been promised and the amount of resources that has been consumed
\item [Job size] the number of nodes or CPUs a job is allocated
\item [Partition] a factor associated with each node partition
% \item [QOS] a factor associated with each Quality Of Service
\end{description}
\end{itemize}
}
\subsection*{SLURM commands}
\begin{frame}[fragile]
\frametitle{Interactive session \hfill \alert{\textbf{salloc}}}
\begin{exampleblock}{Request an interactive allocation of resources}
\begin{verbatim}
$ salloc -A <account> -t <d-hh:mm:ss> -N <nodes>
salloc: Granted job allocation 123456
\end{verbatim}
\end{exampleblock}
\begin{exampleblock}{Run application on \alert{\textbf{Beskow}}}
\begin{verbatim}
$ srun -n <PEs> ./binary.x
#PEs - number of processing elements (MPI processes)
\end{verbatim}
\end{exampleblock}
\begin{exampleblock}{Run application on \alert{\textbf{Tegner}}}
\begin{verbatim}
$ mpirun -np <cores> ./binary.x
\end{verbatim}
\end{exampleblock}
\end{frame}
\begin{frame}[fragile]
\frametitle{Launch batch jobs \hfill \alert{\textbf{sbatch}}}
\begin{exampleblock}{Submit the job to the SLURM queue}
\begin{verbatim}
$ sbatch <script>
Submitted batch job 958287
\end{verbatim}
\end{exampleblock}
\scriptsize
The script should contain all necessary data to identify the account and requested resources
\begin{exampleblock}{Example of a request to run myexe for 1 hour on 4 nodes}
\begin{verbatim}
#!/bin/bash -l
#SBATCH -A summer-2019
#SBATCH -J myjob
#SBATCH -t 1:00:00
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=32
#SBATCH -e error_file.e
#SBATCH -o output_file.o
srun -n 128 ./myexe > my_output_file
\end{verbatim}
\end{exampleblock}
\end{frame}
\begin{frame}[fragile]
\frametitle{Monitoring and/or cancelling running jobs }
\begin{alertblock}{\textbf{squeue} -u \$USER}
Displays all queued and/or running jobs that belong to the user
\tiny
\begin{verbatim}
cira@beskow-login2:~> squeue -u cira
JOBID USER ACCOUNT NAME ST REASON START_TIME TIME TIME_LEFT NODES CPUS
957519 cira pdc.staff VASP-test R None 2016-08-15T08:15:24 6:09:42 17:49:18 16 1024
957757 cira pdc.staff VASP-run R None 2016-08-15T11:14:20 3:10:46 20:48:14 128 8192
\end{verbatim}
\end{alertblock}
\begin{alertblock}{\textbf{scancel} [job]}
Stops a running job or removes a pending one from the queue
\tiny
\begin{verbatim}
cira@beskow-login2:~> scancel 957519
salloc: Job allocation 957519 has been revoked.
cira@beskow-login2:~> squeue -u cira
JOBID USER ACCOUNT NAME ST REASON START_TIME TIME TIME_LEFT NODES CPUS
957757 cira pdc.staff VASP-run R None 2016-08-15T11:14:20 3:10:46 20:48:14 128 8192
\end{verbatim}
\end{alertblock}
\end{frame}