\documentclass[english,serif,mathserif,xcolor=pdftex,dvipsnames,table]{beamer}
\usepackage[utf8]{inputenc}
%\usepackage[T1]{fontenc}
\usepackage{babel}
\usepackage{fixltx2e}
\usepackage{graphicx}
\usepackage{colortbl}%
  %\newcommand{\cellcolor}[2]{\multicolumn{1}{>{\columncolor{#1}}c}{#2}}
% Source-code listings: global defaults for every `lstlisting` environment
% in the slides (shell syntax, typewriter font, "@" as escape-to-LaTeX).
\usepackage{listings}%
  \lstloadlanguages{sh}%
  \lstset{
    language=sh,%
    % --- basic appearance ---
    basicstyle=\ttfamily,%
    %columns=fullflexible,% best results for proportional fonts
    commentstyle=\small,%
    keywordstyle=\bfseries,% or \normalfont
    %identifierstyle=\itshape,%
    %procnamestyle=\bfseries\slshape, %\scshape,%
    %procnamekeys={def},%
    % --- escaping and special displays ---
    % Listings that contain a literal "@" (e.g. user@host) must override
    % this locally, as the login frame does with `escapechar=!`.
    escapechar=@,% text between "@" will be rendered as normal TeX
    %moredelim=[il][\small\itshape]{\#},% ditto for text between "#" and end-of-line
    texcl,% text of comment lines in a listing is typeset as LaTeX
    mathescape=false,% keep "$" literal (shell prompts and variables)
    %literate={*{=}{{$\gets$ }}1 {==}{{$=$ }}1 {<=}{{$\leq$ }}1 {>=}{{$\geq$ }}1 {!=}{{$\neq$ }}1},%
    % --- display ---
    %showspaces=false,%
    %showstringspaces=false,%
    %xleftmargin=2em,%
    % --- line numbers ---
    %numbers=left,%
    %numberstyle=\tiny,%
    %stepnumber=1,%
    %firstnumber=1%
  }%
  % Make @...@ a shorthand for an inline listing in running text.
  % NOTE(review): "@" is also the global escapechar set above; confirm the
  % two roles do not clash wherever "@" appears in frame text.
  \lstMakeShortInline{@}%
\usepackage{longtable}
\usepackage{multirow}
\usepackage{float}
\usepackage{wrapfig}
\usepackage{soul}
\usepackage{textcomp}
\usepackage{tikz}%
  \usetikzlibrary{arrows,shapes}%
  % Pictures that define or refer to nodes living outside of themselves
  % need the 'remember picture' option.  Append it to the style that is
  % applied to every picture, so it never has to be typed by hand.
  \tikzset{every picture/.append style={remember picture}}%
\usepackage{marvosym}
\usepackage{wasysym}
\usepackage{latexsym}
\usepackage{amssymb}
\usepackage{hyperref}
\usepackage{url}
\tolerance=1000
\providecommand{\alert}[1]{\textbf{#1}}

\usetheme{uzhneu-en-informal}


\title[Introduction to SGE/OGS]{%
  Introduction to the SGE/OGS \\
  batch-queuing system
}
\author[R. Murri]{%
  \textbf{Riccardo Murri} \\
  Grid Computing Competence Center, \\
  Organisch-Chemisches Institut, \\
  University of Zurich
}
\date{Oct.~6,~2011}

%% Use `\largeskip` to get a larger vertical white space between two
%% lines/paragraphs; `\+` is a one-character shorthand for it, used
%% throughout the frames below.
\newcommand{\largeskip}{\vspace{1em}}
%% NOTE: plain `\def` performs no redefinition check, so any earlier
%% meaning of `\+` is replaced without warning.
\def\+{\largeskip}
%% NOTE(review): `\parsep` is normally a per-list parameter; if the intent
%% was extra space between ordinary paragraphs, `\parskip` may have been
%% meant --- confirm.
\setlength{\parsep}{1.0em}

\begin{document}

% title frame
\maketitle


\section{Introduction}

\begin{frame}
  \frametitle{The basic problem}

  \textbf{Process a large set of data.}

  \+
  Assumptions:
  \begin{enumerate}
  \item Cannot be done on a single computer because of space or time constraints.
  \item The data can be subdivided into \emph{files}, each of which
    can be \emph{processed independently}.
  \item (Processing each file can comprise several steps.)
  \item (Accessing the files over a network has acceptable overhead.)
  \end{enumerate}
\end{frame}


\begin{frame}
  \frametitle{Today's lab session}
  Two approaches:
  \begin{itemize}
  \item Local execution of programs (e.g., on your laptop)
  \item Batched execution of programs (on a cluster)
  \end{itemize}

  \+
  The goal of these initial lab sessions is to show what the
  difference is, in practice, and what tools are available in each
  case.

  \+
  {\small These slides are available for download from: 
    \url{http://www.gc3.uzh.ch/teaching/lsci2011/lab02.pdf}}
\end{frame}


\section{The cluster \texttt{ocikbpra.uzh.ch}}

\begin{frame}[fragile]
  \frametitle{Login to the cluster \texttt{ocikbpra.uzh.ch}}
  
  Log in to the cluster:
\begin{lstlisting}[escapechar=!]
  ssh !\it\small username!@ocikbpra.uzh.ch
\end{lstlisting}

  \+
  You should be greeted by this shell prompt:
\begin{lstlisting}[escapechar=!]
  [!\it\small username!@ocikbpra ~]$ 
\end{lstlisting}

  \+ 
  Gather the sample application and test files into a directory \texttt{lab2}:
\begin{lstlisting}
  mkdir lab2
  cp -av ~murri/lsci/rank-int.i386 lab2/
  cp -av ~murri/lsci/M0,6*.sms lab2/
  cd lab2
\end{lstlisting}
\end{frame}


\begin{frame}
  \frametitle{The cluster \texttt{ocikbpra.uzh.ch}}
  \begin{center}
    \includegraphics[width=0.9\linewidth]{lab02/ocikbpra}
  \end{center}
\end{frame}

\section{Local execution}

\begin{frame}
  \frametitle{Recap from Lab Session 1}
  Process control features offered by the GNU/Linux shell:
  \begin{itemize}
  \item background processes with the \texttt{\&} operator
  \item monitor process status with the \texttt{ps} command
  \item send signals to running processes with the \texttt{kill} command
  \end{itemize}

  \+
  {\small Lab Session 1 slides are available for download from: 
    \url{http://www.gc3.uzh.ch/teaching/lsci2011/lab01.pdf}}

\end{frame}

\begin{frame}
  \frametitle{Timing command execution, I}

  The command \texttt{/usr/bin/time} reports the time spent by
  the system executing a command.

  \+
  Typical reports include:
  \begin{itemize}
  \item \emph{user} time: CPU time spent processing user-level code.
  \item \emph{system} time: CPU time spent processing kernel-level code.
  \item \emph{real}/\emph{elapsed} time: time from the start to the
    end of the program (as would have been measured by an external clock).
  \end{itemize}

  \+
  Quiz: can the CPU time be greater than the real/elapsed time?

\end{frame}

\begin{frame}
  \frametitle{Timing command execution, II}

  Exercises:
  \begin{enumerate}
  \item Using \texttt{man time}, figure out how to determine the CPU
    and real time spent running the command ``\texttt{rank-int.i386
      M0,6-D5.sms}''.
  \item Can \texttt{time} also report on the memory?  If yes, how much
    memory does the above command take?
  \end{enumerate}

\end{frame}

\begin{frame}[fragile]
  \frametitle{Timing command execution, III}
\begin{semiverbatim}\footnotesize
\alt<2>{
\begin{block}{Command-line to run}
  \color{red}{\$ /usr/bin/time ./rank-int.i386 M0,6-D5.sms}
\end{block}
}{\$ /usr/bin/time {\bf ./rank-int.i386 M0,6-D5.sms}}
\alt<3>{
\begin{block}{Command output}
  \color{red}{./rank-int.i386 file:M0,6-D5.sms rows:3024 cols:49800 nonzero:95760 rank:3024 cputime:0.005999 wctime:0.006570}
\end{block}
}{./rank-int.i386 file:M0,6-D5.sms rows:3024 cols:49800 \ldots}
\alt<4>{
\begin{block}{Timing information}\tiny
  {\color{red}0.10user 0.04system 0:00.18elapsed 80%CPU} (0avgtext+0avgdata 0maxresident)k
\end{block}
}{\alt<5>{
\begin{block}{Memory information}\tiny
  {0.10user 0.04system 0:00.18elapsed 80%CPU} \color{red}{(0avgtext+0avgdata 0maxresident)k}
\end{block}
}{0.10user 0.04system 0:00.18elapsed 80%CPU (0avgtext+0avgdata 0maxresident)k}}
\alt<6>{
\begin{block}{I/O and paging info}
  \color{red}{0inputs+0outputs (0major+1971minor)pagefaults 0swaps}
\end{block}
}{0inputs+0outputs (0major+1971minor)pagefaults 0swaps}
\end{semiverbatim}
\end{frame}


\begin{frame}
  \frametitle{Resource limits, I}

  Why impose limits on the utilization of system resources?

  \+
  What system ``resources'' would you want to limit in our case?
\end{frame}


\begin{frame}[fragile]
  \frametitle{Resource limits, II}
  
  The command \texttt{ulimit} allows setting \emph{resource usage
    limits}:
\begin{semiverbatim}\small
\$ {\bf ulimit -a}
core file size          (blocks, -c) 0
data seg size           (kbytes, -d) unlimited
{\tiny\em [\ldots]}
file size               (blocks, -f) unlimited
{\tiny\em [\ldots]}
max memory size         (kbytes, -m) unlimited
open files                      (-n) 1024
{\tiny\em [\ldots]}
stack size              (kbytes, -s) 10240
cpu time               (seconds, -t) unlimited
max user processes              (-u) 32767
virtual memory          (kbytes, -v) unlimited
{\tiny\em [\ldots]}
\end{semiverbatim}
\end{frame}


\begin{frame}[fragile]
  \frametitle{Resource limits, III}

  \emph{Warning:} The \textbf{ulimit} command is a shell built-in.  It
  takes immediate effect on \emph{all} the following commands.

  \+
  To restrict the scope to one command only, enclose it and
  \textbf{ulimit} in parentheses:
\begin{semiverbatim}\small
\$ (ulimit -t 15; ./rank-int.i386 M0,6-D8.sms)
\end{semiverbatim}

  \+
  (Parentheses force the enclosed commands to be executed in a sub-shell.)
\end{frame}


\begin{frame}[fragile]
  \frametitle{Resource limits, IV}
  
  \emph{Exercises:}
  \begin{enumerate}
  \item What does the following command do?
\begin{semiverbatim}\small
\$ (ulimit -t 15; ./rank-int.i386 M0,6-D8.sms)
\end{semiverbatim}

    What happens if you leave out the \texttt{ulimit} part?

  \item What are the options given by \texttt{ulimit} for limiting memory?

  \item What should happen if you run the following command?  What
    really happens?
\begin{semiverbatim}\small
\$ (ulimit -m 102400; ./rank-int.i386 M0,6-D11.sms)
\end{semiverbatim}

  \item What should happen if you run the following command?  What
    really happens?
\begin{semiverbatim}\small
\$ (ulimit -v 102400; ./rank-int.i386 M0,6-D11.sms)
\end{semiverbatim}

  \end{enumerate}

\end{frame}


\section{The SGE/OGS batch system}

\begin{frame}
  \frametitle{SGE/OGS}
  \emph{Sun Grid Engine} (SGE) is a batch-queuing system produced by
  Sun Microsystems; made open-source in 2001.

  \+ After acquisition by Oracle, the product forked:
  \begin{itemize}
  \item \emph{Open Grid Scheduler} (OGS), the open-source version
  \item \emph{Univa Grid Engine}, a commercial-only version
    developed by the core SGE engineering team from Sun.
  \end{itemize}

  \+ 
  Used on the main UZH HPC cluster ``Schroedinger''.
\end{frame}


\begin{frame}
  \frametitle{SGE architecture, I}
  \texttt{sge\_qmaster}
  \begin{itemize}
  \item Runs on \emph{master} node \texttt{ocikbpra.uzh.ch}
  \item Accepts client requests (job submission, job/host state inspection)
  \item Schedules jobs on compute nodes (formerly separate
    \texttt{sge\_schedd} process)
  \end{itemize}

  \+
  Client programs \texttt{qhost}, \texttt{qsub}, \texttt{qstat}
  \begin{itemize}
  \item Run by user on \emph{submit} node
  \item Clients for \texttt{sge\_qmaster}
  \item Master daemon has a list of authorized submit nodes
  \end{itemize}
\end{frame}


\begin{frame}
  \frametitle{SGE architecture, II}
  \texttt{sge\_execd}
  \begin{itemize}
  \item Runs on every compute node
  \item Accepts job start requests from \texttt{sge\_qmaster}
  \item Monitors node status (load average, free memory, etc.) and
    reports back to \texttt{sge\_qmaster}
  \end{itemize}

  \+
  \texttt{sge\_shepherd}
  \begin{itemize}
  \item Spawned by \texttt{sge\_execd} when starting a job
  \item Monitors the execution of a \emph{single} job
  \end{itemize}

\end{frame}


\begin{frame}[fragile]
  \frametitle{Job submission, I}

  The \texttt{qsub} command is used to submit a job to the batch
  system.  The job consists of a shell script and its (optional) arguments.

  \+
  Example:
\begin{semiverbatim}
  qsub myscript.sh 
\end{semiverbatim}

  \+ 
  If any arguments are given after the script name, they will be
  available to the script as \verb'$1', \verb'$2', etc.
\begin{semiverbatim}
  # in myscript.sh, $1="hello" and $2="world"
  qsub myscript.sh hello world
\end{semiverbatim}

\end{frame}


\begin{frame}[fragile]
  \frametitle{Job submission, II}
  
  Upon successful submission, \texttt{qsub} prints a ``job ID'' to
  standard output:
\begin{semiverbatim}
\$ qsub -cwd myscript.sh
Your job \textbf{76104} ("myscript.sh") has been submitted
\end{semiverbatim}

  \+
  This job ID must be used with all SGE commands that operate on jobs.

  \+
  As soon as the job \emph{starts}, two files will be created,
  containing the script's standard output (\texttt{.o}\emph{jobID})
  and standard error (\texttt{.e}\emph{jobID}).
\begin{semiverbatim}\footnotesize
\$ ls -l myscript.sh*
-rwxrwxr-x 1 murri murri 30 Oct  6 14:23 myscript.sh
-rw-r--r-- 1 murri murri  0 Oct  6 14:24 myscript.sh\textbf{.e76104}
-rw-r--r-- 1 murri murri 14 Oct  6 14:24 myscript.sh\textbf{.o76104}
\end{semiverbatim}
\end{frame}

\begin{frame}
  \frametitle{Commonly used options for \texttt{qsub}}
  
  \begin{description}
  \item[-cwd] Execute job in current directory; if not given, the job
    script is run in the \emph{home} directory.
  \item[-o] Path name of the file where standard output will be stored.
  \item[-e] Path name of the file where standard error will be stored.
  \item[-j] If ``\texttt{-j y}'' is given, then merge standard error
    into standard output (as if they were both sent to the screen).
  \end{description}

\end{frame}


\begin{frame}[fragile]
  \frametitle{Monitoring jobs}
  The \texttt{qstat} command is used to monitor jobs submitted to the
  SGE system.

  \+
  Example:
\begin{semiverbatim}\tiny
$ \textbf{qstat}
job-ID  prior   name       user         state submit/start at     queue                          slots ja-task-ID 
-----------------------------------------------------------------------------------------------------------------
  73344 0.60500 mod_run    danielyli    dt    10/06/2011 14:38:45 all.q@compute-0-13.local           2        
  76105 0.50500 myscript.s murri        r     10/06/2011 14:40:35 all.q@compute-0-20.local           1        

\end{semiverbatim}

  The \emph{state} column is a combination of the following codes
  (see \texttt{man qstat} for a complete list):
  \begin{description}
  \item[r] Job is running
  \item[qw] Job is \emph{w}aiting in the \emph{q}ueue
  \item[qh] Job is being \emph{h}eld back in \emph{q}ueue
  \item[E] An \emph{E}rror has occurred
  \item[d] Job has been \emph{d}eleted by user
  \item[t] Job is being \emph{t}ransferred to compute node
  \end{description}

\end{frame}


\begin{frame}
  \frametitle{Job submission, III}
  Exercises:
  \begin{enumerate}
  \item Write a script \texttt{rank1.sh} to run the command
    ``\texttt{./rank-int.i386 M0,6-D5.sms}'', then run it.
    Does this job appear in \texttt{qstat} output?

    Compare the output with what you would get when running locally:
    is there any significant change?

  \item Write a script \texttt{rank2.sh} to run the command
    ``\texttt{./rank-int.i386 M0,6-D11.sms}'', then run it.
    Does this job appear in \texttt{qstat} output?

    When do the standard output and standard error files appear?
    What's their initial content?

  \item How can you determine the amount of resources (CPU time,
    wall-clock time, etc.) used by a job?
  \end{enumerate}
\end{frame}


\begin{frame}[fragile]
  \frametitle{Job resource utilization, I}

  The \texttt{qstat -j} command reports information on a job,
  \emph{while it is running}.

  \+
  Example:
\begin{semiverbatim}\tiny
\$ \textbf{qstat -j 76106}
==============================================================
job_number:                 76106
exec_file:                  job_scripts/76106
submission_time:            Thu Oct  6 14:51:45 2011
owner:                      murri
{\tiny\em [\ldots]}
cwd:                        /home/murri/lsci
{\tiny\em [\ldots]}
script_file:                myscript.sh
{\color{red}usage    1:                 cpu=00:01:30, mem=8.64453 GBs, io=0.02295, vmem=103.637M, maxvmem=103.637M}
scheduling info:            queue instance "all.q@compute-0-3.local" dropped because it is temporarily not available
{\tiny\em [\ldots]}
\end{semiverbatim}

The ``{\color{red} usage}'' line contains current resource utilization.
\end{frame}



\begin{frame}[fragile]
  \frametitle{Job resource utilization, II}

  The \texttt{qacct} command reports all information on a job, but
  only \emph{after it has completed}.

  \+
  Example:
\begin{semiverbatim}\tiny
\$ \textbf{qacct -j 76106}
==============================================================
qname        all.q               
hostname     compute-0-27.local  
group        murri               
{\tiny\em [\ldots]}
jobname      myscript.sh         
jobnumber    76106
taskid       undefined
{\tiny\em [\ldots]}
qsub_time    Thu Oct  6 14:51:45 2011
start_time   Thu Oct  6 14:51:50 2011
end_time     Thu Oct  6 14:54:29 2011
{\tiny\em [\ldots]}
exit_status  0             
ru_wallclock 159            
ru_utime     158.421        
ru_stime     0.456      
{\tiny\em [\ldots]}
cpu          158.877     
mem          15.183
{\tiny\em [\ldots]}
maxvmem      103.637M
{\tiny\em [\ldots]}
\end{semiverbatim}
\end{frame}


\begin{frame}[fragile]
  \frametitle{Resource utilization, I}
  
  The \texttt{-l} option to \texttt{qsub} allows specifying what
  resources will be needed by a job.

  The most common resource requirements are:
  \begin{description}[mem\_free]
  \item[s\_rt] Total job runtime (wall-clock time), in seconds
  \item[s\_cpu] Total job CPU time, in seconds
  \item[mem\_free] Request at least this much free RAM; use \texttt{m}
    or \texttt{g} suffix for MB or GB
  \item[s\_mem] Upper \emph{limit} to RAM usage; use \texttt{m} or
    \texttt{g} suffix
  \item[s\_vmem] Upper \emph{limit} to virtual memory usage; use \texttt{m} or
    \texttt{g} suffix
  \end{description}
  
  \+
  Example:
\begin{semiverbatim}
# run job with a time limit of 20 seconds
\$ qsub \textbf{-l s_rt=20} myscript.sh
\end{semiverbatim}
  
\end{frame}

\begin{frame}[fragile]
  \frametitle{Resource utilization, II}

  \emph{Exercises:}\small
  \begin{enumerate}
  \item Is the following job limited to 20 seconds runtime?
\begin{semiverbatim}\small
\$ qsub \textbf{-l s_rt=20} rank2.sh
\end{semiverbatim}
    What do you find in the job's stdout and stderr files?
    Compare with what happens in the \texttt{ulimit} case.

    What happens if you replace \texttt{s\_rt} by \texttt{s\_cpu}?

  \item Run the same job, putting a 10MB limit on \texttt{mem\_free},
    then \texttt{s\_rss}, \texttt{s\_mem}, and finally
    \texttt{s\_vmem}.

    Compare the actual resource utilization (via \texttt{qacct}) with
    the requirement.  

    In what cases does the job terminate correctly?  What's the
    resource utilization in these cases?

  \item Compile a table with runtime, CPU time, and memory utilization
    for each of the matrices \texttt{M*.sms}.  

    Is there a correlation with the matrix file size?
  \end{enumerate}

\end{frame}


\section{Further reading}

\begin{frame}
  \frametitle{References}
  \begin{enumerate}[{[1]}]
  % \item Sun/Oracle Grid Engine home page,
  % \item Open Grid Scheduler home page,
  \item \texttt{setrlimit(2)} manual page, \url{http://manpages.ubuntu.com/manpages/oneiric/en/man2/getrlimit.2.html}
  \end{enumerate}
\end{frame}

\end{document}

