\documentclass[english,serif,mathserif,xcolor=pdftex,dvipsnames,table]{beamer}
\usepackage[utf8]{inputenc}
%\usepackage[T1]{fontenc}
\usepackage{babel}
\usepackage{fixltx2e}
\usepackage{graphicx}
\usepackage{colortbl}%
  %\newcommand{\cellcolor}[2]{\multicolumn{1}{>{\columncolor{#1}}c}{#2}}
\usepackage{listings}%
  % Deck-wide defaults for code listings: shell syntax in a monospaced font.
  \lstloadlanguages{sh}%
  \lstset{
    language=sh,%
    % --- basic appearance ---
    basicstyle=\ttfamily,%
    %columns=fullflexible,% best results for proportional fonts
    commentstyle=\small,%
    keywordstyle=\bfseries,% or \normalfont
    %identifierstyle=\itshape,%
    %procnamestyle=\bfseries\slshape, %\scshape,%
    %procnamekeys={def},%
    % --- escaping and special displays ---
    % NOTE(review): "@" is used both as escapechar here and as the short
    % inline delimiter below (\lstMakeShortInline) -- confirm the two do not
    % interfere when "@" is actually used on a slide.
    escapechar=@,% text between "@" will be rendered as normal TeX
    %moredelim=[il][\small\itshape]{\#},% ditto for text between "#" and end-of-line
    texcl,% comment text inside listings is typeset as LaTeX (so \$, \#, \_ must be escaped there)
    mathescape=false,% "$" is kept literal inside listings
    %literate={*{=}{{$\gets$ }}1 {==}{{$=$ }}1 {<=}{{$\leq$ }}1 {>=}{{$\geq$ }}1 {!=}{{$\neq$ }}1},%
    % --- display ---
    %showspaces=false,%
    %showstringspaces=false,%
    %xleftmargin=2em,%
    % --- line numbers ---
    %numbers=left,%
    %numberstyle=\tiny,%
    %stepnumber=1,%
    %firstnumber=1%
  }%
  % Make @...@ a shorthand for \lstinline in running text.
  \lstMakeShortInline{@}%
\usepackage{longtable}
\usepackage{multirow}
\usepackage{float}
\usepackage{wrapfig}
\usepackage{soul}
\usepackage{textcomp}
\usepackage{tikz}%
  \usetikzlibrary{arrows,shapes}%
  % Pictures that define or refer to nodes of other pictures must carry the
  % 'remember picture' option. Append it to the style applied to every
  % picture so it never has to be typed by hand.
  \tikzset{every picture/.append style={remember picture}}%
\usepackage{marvosym}
\usepackage{wasysym}
\usepackage{latexsym}
\usepackage{amssymb}
\usepackage{hyperref}
\usepackage{url}
\tolerance=1000
\providecommand{\alert}[1]{\textbf{#1}}

\usetheme{uzhneu-en-informal}

%% \title[Large Scale Computing Infrastructures]{%
%%   \textbf{Large Scale Computing Infrastructures}\\
%%   \small{(MINF 4526 HS2011)}\\
%%   \small{Lecture 2: from clusters to distributed systems}
%% }

%% \author[S.\ Maffioletti] {%
%%   Sergio Maffioletti \\ 
%%   \texttt{<sergio.maffioletti@gc3.uzh.ch>} \\
%%   \institute[GC3, Univ. of Zurich]% will appear on the bottom line
%%             {\href{http://www.gc3.uzh.ch/}{Grid Computing Competence Centre}, 
%%               \href{http://www.uzh.ch/}{University of Zurich}
%%               \\ \url{http://www.gc3.uzh.ch/}}
%% }

\title[Large Scale Computing Infrastructures]{%
   Large Scale Computing Infrastructures \\
   \small{MINF 4526 HS2011} \\
   \small{Lecture 2: from clusters to distributed systems}
 }
\author[S.\ Maffioletti]{%
   \textbf{Sergio Maffioletti} \\
   Grid Computing Competence Center, \\
   Organisch-Chemisches Institut, \\
   University of Zurich
 }
% \date{Sept.~27,~2011}

%% Use `\largeskip` to get a larger vertical white space between two
%% lines/paragraphs (it inserts 1em of vertical space).
\newcommand{\largeskip}{\vspace{1em}}
%% `\+` is a one-character shorthand for `\largeskip`.
\def\+{\largeskip}
%% NOTE(review): list environments usually reset \parsep themselves, so this
%% preamble-level setting may have no visible effect -- confirm.
\setlength{\parsep}{1.0em}

\begin{document}

% title frame
\maketitle

% The goal of this class is to discuss together with the students the building of a distributed system
% have them understand the recurrent issues and design patterns that have to be taken into account
% how do we develop an application that uses a cluster model
% how do we develop an application that uses a distributed system model
% if a distributed system would look alike a single large centralized cluster system,
% what would we still need for an HTC application ?

% TOC
\begin{frame}
  
  \frametitle{Table of contents}
  \begin{enumerate}
  \item{} What characterizes a cluster
  \item{} What characterizes a distributed system
  \item{} Let's build a middleware 
  \item{} Write applications for distributed systems
  \end{enumerate}
  
\end{frame}

\begin{frame}
  \frametitle{What characterizes a cluster}
  % image of interconnected nodes
  \begin{itemize}
  \item{} a collection of {\it parallel} and {\it distributed processing} systems
  \item{} work as a {\it single integrated} computing resource
  \item{} Data available on the cluster: {\bf Application}, {\bf input data} and {\bf results}
  \item{} {\bf Minimal} control on the system
  \item{} Network {\bf File Server} involved
  \item{} {\bf Execution} needs to be described (i.e. {\it Resource requirements})
  \item{} {\bf Performance} can be tuned by adapting execution to hosting environments
  \end{itemize}

\end{frame}

\begin{frame}
  \frametitle{Basic structure of a cluster}
  \includegraphics[height=0.6\textheight]{cluster00.png}
\end{frame}

\begin{frame}
  \frametitle{File server integration}
  % image with fileserver and nodes mounting filesystem (/home /scratch /apps)
  \includegraphics[height=0.6\textheight]{cluster01.png}
\end{frame}
\begin{frame}
  \frametitle{LRMS}
  % image with LRMS Server node + queues + sched + exec nodes
  \includegraphics[height=0.5\textheight]{cluster_lrms.png}
\end{frame}

\begin{frame}
  \frametitle{What is a cluster}
  \begin{itemize}
  \item{} {\bf Centralized}
    \begin{itemize}
    \item{} Authorization and Authentication
    \item{} Home and Scratch filesystem
    \item{} Application execution and management control
    \end{itemize}
  \item{} {\bf Distributed}
    \begin{itemize}
    \item{} Resource access (but controlled by an LRMS)
    \item{} Multiple units of the same job may reside on separate resources
    \end{itemize}
  \end{itemize}
\end{frame}


%% \begin{frame}
%%   % Maybe this one not
%%   \frametitle{What is a cluster}
%%   \begin{itemize}
%%   \item{} {\bf Inter Process Communication}
%%     \begin{enumerate}
%%     \item{} Single thread application: everything local
%%     \item{} OpenMP: shared memory systems
%%     \item{} MPI: use messages to synch between processes
%%     \item{} File-based communication
%%     \end{enumerate}
%%   \end{itemize}
%% \end{frame}

\begin{frame}
  \frametitle{Lifecycle of a Job: user perspective}
  % commands to be used. detached execution model
  % job execution from LRMS point of view
  \begin{enumerate}
    \item{} Prepare job {\it script} (normally shell script)
    \item{} Define {\it resource} requirements
    \item{} {\it Submit} job and record jobID
    \item{} {\it Monitor} status of job (using JobID)
    \item{} When {\bf done} inspect {\it results}
    \item{} Otherwise check {\it logs}
  \end{enumerate}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Prepare job script}
  % Minimal job script: run MzXML2Search and, if it fails, print a marker
  % and leave with a non-zero exit status so the failure is visible to
  % whoever inspects the job. (The original used `exit $1`, which expands to
  % a plain `exit` when no argument is given and so returned the status of
  % the preceding `echo`, i.e. success.)
  \begin{lstlisting}
 #!/bin/bash

 MZXMLSEARCH="./MzXML2Search"

 ${MZXMLSEARCH} -dta ${MZXML_NAME}.mzXML
 if [ ! $? -eq 0 ]; then
    echo "[FATAL]"
    exit 1
 fi
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Define resource requirements}
  % The listings setup enables `texcl`, so everything after a `#` is typeset
  % as LaTeX: that is why `$`, `#` and `_` are backslash-escaped in the
  % directive lines below. The first line is the interpreter line and must
  % read `#!`, not `#$` (which is the prefix of the directive lines).
  \begin{lstlisting}
#!/bin/bash

#\$ -q default.q \# queue name
#\$ -l s\_vmem=300M \# Memory requirement
#\$ -l s\_rt=::60 \# walltime requirement
#\$ -pe mpich 32 \# cores requirement

MZXMLSEARCH="./MzXML2Search"
...
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Submit job and monitor using jobID}
  % Terminal transcript in two listings: submitting test.sh with qsub (which
  % prints the job id), then querying that job id with qstat.
  % The blank lines inside the listings are part of the displayed text.
  \begin{lstlisting}
# qsub test.sh 
  534.localhost

  \end{lstlisting}
  \begin{lstlisting}

# qstat 534
Job id           Name    S Queue
--------------- -------- - -------
534.localhost   test.sh  R default    
  \end{lstlisting}
\end{frame}

\begin{frame}
  \frametitle{Lifecycle of a Job: system perspective}
  % commands to be used. detached execution model
  % job execution from LRMS point of view
  \begin{enumerate}
  \item{} Job submission from LRMS client
  \item{} Resource Manager stores job in a {\it queue}
    \begin{itemize}
    \item{} Queue selected inspecting LRMS policies and job's description
    \end{itemize}
  \item{} {\it Scheduler} starts scheduling cycle
    \begin{itemize}
    \item{} Collects resource information from exec hosts
    \item{} Inspects jobs in queues
    \item{} Applies scheduling policies to sort jobs in queues
    \item{} Sends run request to Resource Manager
    \end{itemize}
  \item{} Resource Manager sends job to exec host to run
  \item{} Exec host receives payload and runs it
    \begin{itemize}
    \item{} Job executed using user credentials
    \item{} Periodically reports resource utilization to Resource Manager
    \item{} When job finished, reports to Resource Manager
    \end{itemize}
  \item{} Resource Manager updates job's state
  \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{Job's lifecycle}
  % take the log from an SGE job (like tracejob)
  \includegraphics[height=0.8\textheight]{tracejob.png}  
\end{frame}

\begin{frame}
  \frametitle{Example of Resource requirements}
  \label{sec:7}
  \begin{itemize}
  \item{} {\bf cput}: max CPU time used by all processes in the job
  \item{} {\bf pcput}: max CPU time used by any single process in the job
  \item{} {\bf mem}: max amount of physical memory used by the job
  \item{} {\bf pmem}: max amount of physical memory used by any process of the job
  \item{} {\bf vmem}: max amount of virtual memory used by the job
  \item{} {\bf pvmem}: max amount of virtual memory used by any process of the job
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Example of Resource requirements cont.}
  \label{sec:7a}
  \begin{itemize}
  \item{} {\bf walltime}: wall clock time running
  \item{} {\bf file}: the largest size of any single file that may be created by the job
  \item{} {\bf host}: name of the host on which job should be run
  \item{} {\bf nodes}: number and/or type of nodes to be reserved for exclusive use by the job
  \end{itemize}
\end{frame}

%% \begin{frame}
%%   \frametitle{What are the scalalbility issues ?}
%%   % query LRMS too often
%%   % in general, iterate individual actions (try to work on groups)
%%   % consider system failures
%%   % detached modules (avoid SPOFs when possible)
%%   % Resiliancy
%%   % Use checkpoints
%%   % memory awareness (avoid load all data in memory at once) - bufferd actions
%%   % avoid strict synchronization (do not wait on all components to be completed)
%% \end{frame}

%%%%%%%%% Distributed systems 

\begin{frame}
  \frametitle{Distributed System}
  \label{sec:5c}

  \begin{itemize}
  \item{} Data are not available on the destination systems: {\bf Application} and {\bf input data}
  \item{} Application binaries may or may not be available (and if so, where ?)
  \item{} {\bf No} control on the system
  \item{} {\bf No global} control on the system
  \item{} {\bf No global} home or scratch shared space
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Distributed System}
  \label{sec:5c-execution}

  \begin{itemize}
  \item{} Execution needs to be {\bf normalized} for each system 
    \begin{itemize}
    \item{} cope with different LRMS
    \item{} i386 vs x86\_64
    \item{} different communication libraries
    \item{} different OS
    \end{itemize}
  \item{} {\bf Execution} needs to be described (i.e. {\it Resource requirements})
    \begin{itemize}
    \item{} different LRMS have different resource representation
    \item{} same resource may have different semantics
    \item{} same resource may have different units
    \end{itemize}
  \item{} {\bf Performance} varies greatly depending on the execution system
  \end{itemize}
  
\end{frame}

\begin{frame}
  \frametitle{Distributed System, cont.}
  \label{sec:5d}
  \begin{itemize}
  \item{} {\bf non-uniform access}. Different accounts; different access policies; different administrators.
  \item{} {\bf Reliability} same as for cluster systems
  \item{} {\bf Reliability II} communication between systems may trigger new failures
  \item{} {\bf Asynchronous} execution (controlled by a layer of Local Resource Management Systems)
  \item{} {\bf Parallel} execution of applications leveraging all systems.
  \item{} {\bf MPI/OpenMP parallelism} possible within single cluster.
  \item{} {\bf Scalability} is measured against all systems
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Distributed System}
  \label{sec:5e}

  \begin{itemize}
  \item{} {\it Multiple} Clusters
  \item{} Owned by {\it multiple institutions}
  \item{} Multiple security and access {\it policies}
  \item{} {\it Volatile} environment (It is always better to check before starting execution)
  \item{} Resources are {\it heterogeneous}
  \item{} Resources are {\it geographically} distributed
  \item{} Multiple resource {\it management} policies
  \item{} Connected by {\it heterogeneous, multi-level} networks
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Let's build a distributed system middleware}
  \label{sec:middleware-intro}
  \begin{itemize}
  \item{}...
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Let's build a distributed system middleware}
  \label{sec:middleware-requirements}
  \begin{itemize}
  \item{} {\it Uniform} access to resources
  \item{} {\it Global} resource management
  \item{} {\it Uniform} resource description
  \item{} Resource {\it scheduling}
  \item{} {\it Data} management
  \item{} {\it Application} management
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Uniform access to resources}
  \includegraphics[height=0.8\textheight]{grid_access.png}  
\end{frame}

\begin{frame}
  \frametitle{Uniform access to resources}
  \label{sec:uniform-access}
  \begin{itemize}
  \item{} Need {\it global} identity valid on {\bf all} systems (proof of right)
    \begin{itemize}
    \item{} Users, hosts, and services need to be able to authenticate themselves 
    \item{} each party establishes a level of trust in the identity of the other party.
    \end{itemize}
  \item{} {\it Authentication}: identity needs to be acknowledged by all systems
  \item{} {\it Authorization}: access to site resources is based on the global identity
  \item{} Available {\it services} have to be aware of global identity for Authentication and Authorization
  \item{} \url{http://www.ogf.org/documents/GFD.38.pdf}
  \item{} \url{http://www.ogf.org/documents/GFD.17.pdf}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Uniform access: approaches}
  \begin{itemize}
  \item{} Use world-wide recognized Certification authorities
  \item{} Use cross-platform identity systems (e.g.\ OpenID, X.509 certificates)
    \begin{itemize}
    \item{} \url{http://openid.net}
    \item{} \url{http://www.ietf.org/rfc/rfc2459.txt}
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Global resource management}
  \includegraphics[height=0.8\textheight]{global_management.png}  
\end{frame}

\begin{frame}
  \frametitle{Global resource management}
  \label{sec:global-management}
  \begin{itemize}
  \item{} need to have a concept of a federated LRMS
  \item{} keep same cluster model abstraction (job management)
  \item{} Work as a {\it meta-cluster} (cluster of clusters)
  \item{} need a common LRMS interface
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Resource management: approaches}
  \begin{itemize}
  \item{} Use a meta-scheduler that can control clusters (e.g.\ GridWay, Gridbus)
  \item{} Use an overlay LRMS-like abstraction on each site that translates requests to the local LRMS (Globus, ARC, gLite, UNICORE)
    \begin{itemize}
    \item{} \url{http://www.gridway.org/}
    \item{} \url{http://www.cloudbus.org}
    \item{} \url{http://www.globus.org/}
    \item{} \url{http://www.nordugrid.org/}
    \item{} \url{http://glite.cern.ch/}
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Uniform resource description}
  \label{sec:resource-description}
  \begin{itemize}
  \item{} need to be able to express resource requirements in a consistent way
  \item{} two approaches:
    \begin{itemize}
      \item{} find common denominator (reduced set of directives, consistent with system behavior)
      \item{} use everything (larger set of directives, not consistent with system behavior)
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Resource description: approaches}
  \begin{itemize}
  \item{} Use standardized resource description languages
  \item{} Globus Resource Specification Language (RSL)
  \item{} Job Submission Description Language (JSDL)
    \begin{itemize}
    \item{} JSDL \url{http://www.gridforum.org/documents/GFD.56.pdf}
    \item{} RSL \url{http://www.globus.org/toolkit/docs/2.4/gram/rsl_spec1.html}
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Resource scheduling}
  \includegraphics[height=0.8\textheight]{uniform_information.png}  
\end{frame}

\begin{frame}
  \frametitle{Resource scheduling}
  \label{sec:resource-scheduling}
  \begin{itemize}
  \item{} as in a cluster, need to decide where a job should be executed
  \item{} need to have information about all clusters and all resources
  \item{} information needs to be coherent (have standard resource representation schema)
  \item{} scheduling policies done on global system
  \item{} very difficult to have fine grained policies
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Resource representation: approaches}
  \begin{itemize}
  \item{} Grid Laboratory Uniform Environment (GLUE) schema
  \item{} \url{http://forge.ogf.org/sf/projects/glue-wg}
  \end{itemize}
\end{frame}


\begin{frame}
  \frametitle{Data management}
  \includegraphics[height=0.8\textheight]{data_transfer.png}  
\end{frame}

\begin{frame}
  \frametitle{Data management}
  \label{sec:data-management}
  \begin{itemize}
  \item{} need to move data from one location to another
  \item{} need common file transfer protocol
  \item{} as all other services, need to be aware of {\it global} identity for Authentication and Authorization
  \item{} need to use file transfer protocol also in job description
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Data management: approaches}
  \begin{itemize}
  \item{} Use standardized transfer protocols (e.g.\ HTTP(S), GridFTP)
    \begin{itemize}
    \item{} \url{http://dev.globus.org/wiki/GridFTP}
    \item{} \url{http://www.w3.org/Protocols/}
    \end{itemize}
  \end{itemize}
\end{frame}


\begin{frame}
  \frametitle{Application management}
  \label{sec:application-management}
  \begin{itemize}
  \item{} How to deploy application ?
    \begin{itemize}
    \item{} how to produce binaries for all different systems ?
    \item{} how to benchmark all systems ?
    \item{} how to debug system related issues (numerical stability problem) ?
    \end{itemize}
  \item{} How to configure application's environment ?
  \item{} How to guarantee application environment to be consistent ?
  \end{itemize}
\end{frame}

%% \begin{frame}
%%   \frametitle{What is a distributed system}
%%   \begin{itemize}
%%   \item{} collectoin of computing resources (clusters)
%%   \item{} each controlled by different administrators
%%   \item{} Applications may not be always available (and if so, where ?)
%%   \item{} data not available (need to be staged there)
%%   \item{} No global control on the system
%%   \item{} No global home or shared space
%%   \item{} Execution need to be normalized for each system

%%   \item{} Where are my results ?

%%   \item{} Even higher failure rate
%%   \item{} how to access each cluster ?

%%   \item{} how to distribute the load among them ?

%%   \item{} how to normalize the execution (also how to cope with different LRMSs)
%%   \item{} Heterogeneous systems
%%   \item{} Why do we need an information system ?
%%   \item{} Performace vary greatly depending on the execution system
%%   \item{} Job failure could be induced by new factors (like auth. problem)
%%   \end{itemize}
%% \end{frame}

\begin{frame}
  \frametitle{Building scalable applications}
  \begin{itemize}
  \item{} Increasing resources results in a proportional increase in performance
  \item{} A scalable service is capable of handling heterogeneity
  \item{} A scalable service is operationally efficient
  \item{} A scalable service is resilient
  \item{} A scalable service should become more cost effective when it grows
  \end{itemize}
\end{frame}


\begin{frame}
  \frametitle{Best Practices}
  \begin{itemize}
  \item{} Design for failures
  \item{} Decouple components
  \item{} Cope with Heterogeneity
  \item{} Security, security, security (we'll see next time)
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Design for failures}
  \begin{itemize}
  \item{} Backup and restore strategy 
  \item{} System should be resilient to all possible failures you can think of
  \item{} Always check intermediate state's consistency
  \item{} Distinguish between software bug and system failure
  \item{} Allow the state of the system to re-sync
  \item{} Avoid in-memory stateful objects. Persist and checkpoint when possible
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Decouple components}
  \begin{itemize}
  \item{} avoid components with tight dependencies with each other
  \item{} Limit blocking synchronization (waiting until every task has finished)
  \item{} Use asynchronous communication
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Cope with Heterogeneity}
  Adapt to
  \begin{itemize}
  \item{} number of available resources
  \item{} performance
  \item{} reactiveness of a system
  \end{itemize}
\end{frame}
                 
\end{document}

