\documentclass[english,serif,mathserif,xcolor=pdftex,dvipsnames,table]{beamer}
\usepackage{lsci}
\usepackage{epigraph}

\title[Grid resource allocation]{%
  Resource Allocation \\
  in computational Grids
}
\author[R. Murri]{%
  \textbf{Riccardo Murri} \\
  Grid Computing Competence Center, \\
  Organisch-Chemisches Institut, \\
  University of Zurich
}
\date{Nov.~23,~2011}


\begin{document}
% title frame
\maketitle

\section{Introduction}

% scheduling on a local cluster
% - resource allocation model: key=value attributes
% - performance: data from ping/qping: RTT ~0.3ms
\begin{frame}
  \frametitle{Scheduling on a cluster}
  \begin{center}
    \includegraphics[width=0.8\linewidth]{lecture09/cluster}

  \+
  All job requests sent to a central server.

  \+
  The server decides which job runs \emph{where} and \emph{when}.
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{\emph{where}: resource allocation model}
  \textbf{Computing resources are defined by a structured set of attributes}
  (key=value pairs).

  \+ 
  SGE's default configuration defines 53 such attributes: number of
  available cores/CPUs; total size of RAM/swap; current load average; etc.

  \+
  \textbf{A node is eligible for running a job iff the node attributes are
  compatible with the job \emph{resource requirements}.}

  \+
  (Other batch systems are similar.)
\end{frame}


\begin{frame}
  \frametitle{\emph{when}: scheduling policy}
  There are usually more jobs than the system can handle concurrently.
  (Even more so in the high-throughput computing cases we are
  interested in.)

  \+
  So, job requests must be \emph{prioritized}.

  \+
  \textbf{Prioritization of requests is a matter of the \emph{local scheduling
  policy.}}

  \+
  (And this differs greatly among batch systems and among sites.)
\end{frame}


\begin{frame}
  \frametitle{(Hidden) assumptions}
  1. \textbf{The scheduling server has complete knowledge of the
    nodes}

  \+ Local networks have \emph{low latency} (average RTT 0.3~ms on
  1~Gb/s Ethernet) and the status information is a small packet.
  
  \+
  2. \textbf{The server has complete \emph{control} over the nodes}
  
  \+
  So a compute node will \emph{immediately} execute a job when told by the
  server. 
\end{frame}


\begin{frame}
  \frametitle{How does this extend to Grid computing?}
  By definition of a Grid\ldots

  \+ 1. \textbf{It's geographically distributed}
    \begin{itemize}
    \item High-latency links (hence: resource status may not be up to date)
    \item Network is easily partitioned or nodes disconnected
      (hence: resources have a dynamic nature; they may come and go)
    \end{itemize}

  \+
  2. \textbf{Resources come from \emph{multiple} control domains}
  \begin{itemize}
  \item Prioritization is a matter of \emph{local} policy!
  \item AuthZ and other issues may prevent execution at all.
  \end{itemize}
\end{frame}


\section{The Globus/ARC model}

\begin{frame}
  \frametitle{The Globus/ARC model}
  \begin{center}
    \includegraphics[width=0.8\linewidth]{lecture09/globus-arc}

    \+
    An infrastructure is a set of independent clusters.

    \+
    The client host \emph{selects} one cluster and submits a job
    there; it then periodically \emph{polls} for status information.
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{Issues in the Globus/ARC approach?}
  \begin{enumerate}
  \item How to select a ``good'' execution site?
  \item How to gather the required information from the sites?
  \item Based on the same information, two clients can arrive at the same
    scheduling decision, hence they can flood a site with jobs.
  \item Actual job start times are unpredictable, as scheduling is
    ultimately a local decision.
  \item Client polling increases the load \emph{linearly} with the
    number of jobs.
  \end{enumerate}
\end{frame}


\subsection{The MDS InfoSystem}

\begin{frame}
  \frametitle{The MDS InfoSystem, I}
  \begin{center}
    \includegraphics[width=\linewidth]{lecture09/giis-gris}

    \+
    The Globus \emph{Monitoring and Discovery Service}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{The MDS InfoSystem, II}
  A specialized service provides information about site status.

  \+
  Each \emph{site} reports its information to a local database (GRIS).

  \+
  Each GRIS registers with a global indexing service (GIIS).

  \+
  The client talks with the GIIS to get the list of sites, and then
  queries each GRIS for the site-specific information.
\end{frame}


\subsection{LDAP}

\begin{frame}
  \frametitle{LDAP}
  The protocol underlying MDS is called
  \href{http://en.wikipedia.org/wiki/Ldap}{LDAP}.

  \+
  LDAP allows remote read/write accesses to a distributed database
  (``X.500 directory system''), with a flexible authentication and
  authorization scheme.
  
  \+
  LDAP makes the assumption that most accesses are \emph{reads}, so
  LDAP servers are optimized for infrequent writes.

  \+
  \begin{references}
    A.~S.~Tanenbaum, ``Computer Networks,'' ISBN~978-0-13-212695-3
  \end{references}
\end{frame}


\begin{frame}
  \frametitle{LDAP schemas}
  Entries in an LDAP database are sets of key/value pairs. (Keys need
  not be unique; equivalently: a key can map to multiple values.)

  \+
  An \emph{LDAP schema} specifies the names of allowed keys, and the type of
  corresponding values.

  \+
  \emph{Each entry} declares a set of schemas it conforms to; every
  attribute in an LDAP entry must be defined in some schema.
\end{frame}


\begin{frame}
  \frametitle{X.500/LDAP Directories}
  
  Entries are organized into a \emph{tree structure} (DIT).
  (So LDAP queries return subtrees, as opposed to flat sets of rows as
  in an RDBMS query.)

  \+
  Each entry is uniquely identified by a ``Distinguished Name'' (DN).
  The DN of an entry is formed by appending one or more attribute
  values to the parent entry's DN.

  \+
  LDAP accesses might result in \emph{referrals}, which redirect the
  client to access another entry at a remote server.
\end{frame}


\begin{frame}[fragile]
  \frametitle{Example}
Example: this is how the ARC MDS represents information about a cluster
queue in LDAP.
\begin{semiverbatim}\scriptsize
# all.q, gordias.unige.ch, Switzerland, grid
dn: nordugrid-queue-name=all.q,nordugrid-cluster-name=gordias.unige.
 ch,Mds-Vo-name=Switzerland,o=grid
objectClass: Mds
objectClass: nordugrid-queue
nordugrid-queue-name: all.q
nordugrid-queue-status: active
nordugrid-queue-comment: sge default queue
nordugrid-queue-homogeneity: TRUE
nordugrid-queue-nodecpu: Xeon 2800 MHz
nordugrid-queue-nodememory: 2048
nordugrid-queue-architecture: x86_64
nordugrid-queue-opsys: ScientificLinux-5.5
nordugrid-queue-totalcpus: 224
nordugrid-queue-gridqueued: 0
nordugrid-queue-prelrmsqueued: 4
nordugrid-queue-gridrunning: 0
nordugrid-queue-running: 0
nordugrid-queue-maxrunning: 136
nordugrid-queue-localqueued: 4
\end{semiverbatim}
\end{frame}


\subsection{The ``schema'' problem}

\begin{frame}
  \begin{center}
    Based on the information in the previous slide,
    \\
    can you decide whether to send a job 
    \\
    that requires 200GB of scratch space 
    \\
    to this cluster?
  \end{center}
\end{frame}


\begin{frame}
  \frametitle{The MDS ``cluster'' model}
  Exactly: there's no way to make that decision.

  \+
  ARC (and Globus) only provide CPU/RAM/architecture information.

  \+
  In addition, they assume clusters are organized into homogeneous
  \emph{queues}, which might not be the case.
  
  \+ 
  This is just an example of a more general problem: \textbf{what
    information do we need of a remote cluster and how to represent
    it?}

  \+
  \begin{references}
    B. Kónya, ``The ARC Information System'', \url{http://www.nordugrid.org/documents/arc_infosys.pdf}
  \end{references}
\end{frame}


\begin{frame}
  \frametitle{MDS performance}

  The complete LDAP tree of the SMSCG grid counts over 28000 entries.

  \+
  A full dump of the SMSCG infosystem tree requires about 30~seconds.

  \+
  So:
  \begin{enumerate}
  \item Information is several seconds old (on average)
  \item It does not make sense to refresh information more often than
    this.
  \end{enumerate}

  \+
  By default, ARC refreshes the infosystem every 60 seconds.
\end{frame}


\subsection{Use cases discussion}

\begin{frame}
  \frametitle{Supported and unsupported use cases, I}
  \textbf{Pre-installed application:
    \color{PineGreen} OK}

  \+
  The ARC InfoSys has a generic mechanism (``run time environments'')
  for providing ``installed software'' information.

  \+
  So you can select only sites that provide the application you want.

  \+
  (And the information provided in the InfoSys is usually enough
  to make a good guess about the overall performance.)
\end{frame}


\begin{frame}
  \frametitle{Supported and unsupported use cases, II}
  \textbf{Single-thread CPU-intensive native binary:
    \color{PineGreen} OK}

  \+
  \emph{However,}
  \begin{itemize}
  \item the binary cannot require ``unusual'' dynamic libraries;
  \item the binary cannot use CPU-specific features (no information
    on CPU model, so you cannot broker on that).
  \end{itemize}
\end{frame}


\begin{frame}
  \frametitle{Supported and unsupported use cases, III}
  \textbf{Java/Python/Ruby/R script}

  \+
  Require brokering based on a large number of support
  libraries/packages: if the dependencies are not there, the program
  cannot run.

  \+
  \emph{In theory,} this solves the issue.  \emph{In practice:} there is
  always less information than would be useful, and providing all the
  information that would be useful is too much work.

  \+
  Ultimately, it relies on convention and ``good practice.''
\end{frame}


\begin{frame}
  \frametitle{Supported and unsupported use cases, IV}
  \textbf{Code benchmarking:
    \color{Red} FAIL}

  \+
  Benchmarking code requires running all cases under the same
  conditions.

  \+ 
  There is just no way to guarantee that with the ``federation of
  clusters'' model: e.g., the site batch scheduler may run two jobs on
  compute nodes with a different CPU.
\end{frame}


\begin{frame}
  \frametitle{Supported and unsupported use cases, V}
  \textbf{Parallel jobs: \color{Red} FAIL}

  \+
  You can request a certain number of CPUs, but you have no
  information and no control over:
  \begin{itemize}
  \item CPU/threads allocation: all slots in a single
    large SMP machine? slots distributed evenly across nodes?
  \item communication mechanism: which MPI library is used? which
    transport fabric?
  \end{itemize}

  \+
  (In theory, this can be solved by a careful choice of ``run time
  environments''.  In practice, it means that everybody has to agree
  how to represent that information, so it just replicates the schema
  problem.)
\end{frame}


\subsection{ARC: wrap-up and summary}

\begin{frame}
  \frametitle{ARC: Pros and Cons}
  Pros:
  \begin{itemize}
  \item Very simple to deploy, easy to extend.
  \item System and code complexity still manageable.
  \end{itemize}

  \+
  Cons:
  \begin{itemize}
  \item The burden for scaling up is on each site, but not all sites
    have the required know-how/resources.
  \item Complexity of managing large collections of jobs is on the
    \emph{client software} side.
  \item Fixed infosystem schema does not accommodate certain use cases.
  \end{itemize}
\end{frame}


\section{The gLite WMS}

\begin{frame}
  \frametitle{The gLite approach}
  \begin{center}
    \includegraphics[width=\linewidth]{lecture09/wms}
  \end{center}
  \begin{references}
    \footnotesize \url{http://web.infn.it/gLiteWMS/index.php/component/content/article/51-generaltechdocs/57-archoverview}
  \end{references}
\end{frame}


\subsection{WMS}

\begin{frame}
  \frametitle{The gLite WMS}
  Server-centric architecture:
  \begin{itemize}
  \item All jobs are submitted to the WMS server.
  \item WMS inspects the Grid status, makes the scheduling decision
    and submits jobs to sites.
  \item The WMS also monitors jobs as they run, and fetches back the
    output when a job is done.
  \item The client polls the WMS, and when a job is done gets the
    output from the WMS.
  \end{itemize}
\end{frame}


\subsection{The gLite InfoSystem}

\begin{frame}
  \frametitle{The gLite infosystem, I}
  Hierarchical architecture, based on LDAP:
  \begin{enumerate}
  \item Each ``Grid element'' runs its own LDAP server (\emph{resource
    BDII}) providing
    information on the software status and capabilities.
  \item A \emph{site-BDII} polls the local
    element servers, and aggregates information into a site view.
  \item A \emph{top-BDII} polls the site BDIIs and aggregates
    information into a global view.
  \end{enumerate}

  \+
  Each step requires processing the collected entries and creating a
  new LDIF tree based on the new information.
\end{frame}


\begin{frame}
  \frametitle{The gLite infosystem, II}

  The CREAM computing element at CSCS has $43$ entries in its
  \emph{resource BDII}. Listing them takes $0.5$ seconds.

  \+
  The CSCS \emph{site-BDII} has $191$ entries. Listing them takes
  $0.5$ seconds.

  \+ The CERN \emph{top-BDII} has $>180\,000$ entries, collected from
  circa $200$ sites.  Listing them all takes over 2~minutes.
\end{frame}


\begin{frame}
  \frametitle{The GLUE schema}
  The gLite information system represents system status based on the
  \href{http://www.ogf.org/documents/GFD.147.pdf}{GLUE schema}.
  (Version 1.3 is currently being phased out in favor of v.~2.0.)

  \+
  Comprehensive and complex schema:
  \begin{enumerate}
  \item aimed at interoperability among Grid providers;
  \item attempt to cover every feature supported by the major
    middlewares and production infrastructures (esp.~HEP);
  \item heavy use of cross-entry references.
  \end{enumerate}

  \+
  Can accommodate the ``scratch space'' example, but there's
  still no way of figuring out whether (and how) a job can request 16
  cores on the same physical node.
\end{frame}


\begin{frame}
  \frametitle{Comparison with ARC's InfoSystem}
  
  ARC stores information about \emph{jobs} and users in the
  infosystem:
  \begin{itemize}
  \item relatively large number of entries
    in the ARC infosys
  \item cannot scale to a large high-throughput
    infrastructure
  \end{itemize}

  \+
  However, gLite's BDII puts a large load on the \emph{top BDII}:
  \begin{itemize}
  \item must handle load from all clients
  \item must be able to poll all \emph{site-BDIIs} in a fixed time
  \item so it must cope with network timeouts, slow sites, etc.
  \end{itemize}
\end{frame}


\subsection{gLite: wrap-up}

\begin{frame}
  \frametitle{gLite WMS: Pros and Cons}
  Pros:
  \begin{itemize}
  \item Global view of the Grid, could take better meta-scheduling decisions.
  \item Can support aggregate job types (e.g., workflows)
  \item Aggregates the monitoring operations, so reduces the load on site.
  \end{itemize}

  \+
  Cons:
  \begin{itemize}
  \item The WMS is a single point of failure.
  \item Clients still use a polling mechanism, so the WMS must sustain
    the load.
  \item Extremely complex piece of software, running on a single
    machine: very hard to scale up!
  \item Relies on an infosystem to take sensible decisions
    (fixed schema/representation problem).
  \end{itemize}
\end{frame}


\section{Condor}

\begin{frame}
  \frametitle{Condor}
  \begin{center}
    \includegraphics[width=0.9\linewidth]{lecture09/condor.pdf}
  \end{center}
\end{frame}


\begin{frame}
  \frametitle{Condor overview}
  
  \emph{Agents} (client-side software) and \emph{Resources}
  (cluster-side software) advertise their requests and capabilities to
  the Condor \emph{Master}.
  
  \+ 
  The Master performs \emph{match-making} between Agents'~requests
  and Resources'~offerings.

  \+ 
  An Agent sends its computational job directly to the matching
  Resource.

  \+
  \begin{references}
    \footnotesize
    Thain,~D., Tannenbaum,~T. and Livny,~M. (2005): ``Distributed
    computing in practice: the Condor experience.'' \emph{Concurrency
      and Computation: Practice and Experience}, 17:323--356.
  \end{references}
\end{frame}


\begin{frame}
  \frametitle{What is matchmaking?}
  \begin{center}
    \includegraphics[width=0.9\linewidth]{lecture09/matchmaking-site}
  \end{center}
\end{frame}


\begin{frame}
  \frametitle{Matchmaking, I}
  Same idea in Condor, except \textbf{the schema is not fixed.}

  \+
  Agents and Resources report their requests and offers using the
  ``ClassAd'' format (an enriched key=value format).

  \+
  No prescribed schema, hence a Resource is free to advertise any
  ``interesting feature'' it has, and to represent it in any way that
  fits the key=value model.
\end{frame}


\begin{frame}
  \frametitle{Matchmaking, II}
  \emph{1.} Agents specify a \texttt{Requirements} constraint: a
  boolean expression that can use any value from the Agent's own \emph{(self)}
  ClassAd or the Resource's \emph{(other)}.

  \+
  \emph{2a.} Resources whose offered ClassAd \emph{does not} satisfy the
  \texttt{Requirements} constraint are discarded.

  \+
  \emph{2b.} Conversely, if the Agent's ClassAd does not satisfy the
  Resource's \texttt{Requirements}, the Resource is discarded.

  \+
  \emph{3.} Surviving Resources are sorted according to the value of the
  \texttt{Rank} expression in the Agent's ClassAd, and their list is
  returned to the Agent.
\end{frame}


\begin{frame}[fragile]
  \frametitle{Example: Job ClassAd}
  Select 64-bit Linux hosts, and sort them preferring hosts with
  larger memory and CPU speed.
\begin{semiverbatim}
Requirements = Arch=="x86_64" && OpSys == "LINUX"
Rank         = TARGET.Memory + TARGET.Mips
\end{semiverbatim}

  \+
  Agent ClassAds play a role similar to job descriptions in ARC/gLite:
  specify the compatibility/resource requests.

  \begin{references}
    \url{http://research.cs.wisc.edu/condor/manual/v6.4/4_1Condor_s_ClassAd.html}
  \end{references}
\end{frame}


\begin{frame}[fragile]
  \frametitle{Example: Resource ClassAd}
  A complex access policy, giving priority to users from the owner's
  research group, then other ``friend'' users, and then the rest\ldots
\begin{semiverbatim}
Friend        = Owner == "tannenba" 
ResearchGroup = (Owner == "jbasney" 
                     || Owner == "raman")
Trusted       = Owner != "rival" 
Requirements  = Trusted && ( ResearchGroup 
                     || LoadAvg < 0.3 &&
                     KeyboardIdle > 15*60 )
Rank          = Friend + ResearchGroup*10
\end{semiverbatim}

  \+
  Resource ClassAds specify an \emph{access/usage policy} for the
  resource.
\end{frame}


\begin{frame}
  \frametitle{ClassAd wrap-up}
  ClassAds provide an extensible mechanism for describing resources
  and requirements:
  \begin{enumerate}
  \item A set of ``standard'' ClassAd values is provided by Condor itself;
  \item New values can be defined by the user (both client- and server-side).
  \end{enumerate}

  \+
  How can you submit a job that requires 200GB of local scratch space?
  Or 16 cores in a single node?

  \+
  Providing the right attributes for the match is now an
  \emph{organizational} problem, not a technical one.
\end{frame}



\section{The End}


\begin{frame}
  \begin{center}
    All these job management systems are based on a \emph{push model}
    (you send the job to an execution cluster).

    \+ Is there conversely a \emph{pull} model?
  \end{center}
\end{frame}


\section{References}

\begin{frame}[fragile]
  \frametitle{References}
  \begin{enumerate}%{[1]}

  \item Foster,~I. (2002): ``What is the Grid? A Three Point Checklist.'',
    \emph{Grid Today}, July~20, 2002
    {\small\url{http://dlib.cs.odu.edu/WhatIsTheGrid.pdf}}

  \item Thain,~D., Tannenbaum,~T. and Livny,~M. (2005): ``Distributed
    computing in practice: the Condor experience.'' \emph{Concurrency
      and Computation: Practice and Experience}, 17: 323--356. DOI:~10.1002/cpe.938

  % \item Buyya, R., Venugopal, S. (2005): ``A Gentle Introduction to
  %   Grid Computing and Technologies.'' \emph{Computer Society of India
  %   Communications}, July 2005, pp.~9--19, 
  %   \url{http://www.buyya.com/papers/GridIntro-CSI2005.pdf}

  \item Kónya, B. (2010): ``The ARC Information System'',
    {\small\url{http://www.nordugrid.org/documents/arc_infosys.pdf}}

  % \item Andreetto,~P. et~al. (2008): ``The gLite workload management
  %   system'', \emph{Journal of Physics: Conference Series}, 119 062007.
  % \end{enumerate}

  \item Cecchi,~M. et~al. (2009): ``The gLite Workload Management
    System'', \emph{Lecture Notes in Computer Science}, 5529/2009,
    pp.~256--268.

  \item Andreozzi,~S. et~al. (2009): ``GLUE Specification v.~2.0'',
    {\small\url{http://www.ogf.org/documents/GFD.147.pdf}}
  \end{enumerate}
\end{frame}

\section{Additional material}

\begin{frame}
  \begin{center}
    Average ping RTT to some SMSCG clusters.
    
    \+
    \begin{tabular}{lr}
      \textbf{cluster}               & \textbf{time (ms)}
      \\
      \texttt{idgc3grid01.uzh.ch}    & 0.533 \\
      \texttt{hera.wsl.ch}           & 1.110 \\
      \texttt{arc01.lcg.cscs.ch}     & 3.146 \\
      \texttt{smscg.epfl.ch}         & 3.917 \\
      \texttt{gordias.unige.ch}      & 5.638 \\
    \end{tabular}
  \end{center}
\end{frame}

\begin{frame}
  \begin{center}
    Time to retrieve a single LDAP entry
    
    \+
    \begin{tabular}{lrr}
      \textbf{cluster}               & \textbf{time (ms)} & \textbf{connect time (ms)}
      \\
      \texttt{smscg.epfl.ch}         & 142                & 108 \\
      \texttt{gordias.unige.ch}      & 142                & 127 \\
      \texttt{idgc3grid01.uzh.ch}    & 147                & 39  \\
      \texttt{arc01.lcg.cscs.ch}     & 170                & 114 \\
      \texttt{hera.wsl.ch}           & 344                & 117 \\
    \end{tabular}
  \end{center}
\end{frame}


\end{document}

%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: 

