\documentclass[english,serif,mathserif,xcolor=pdftex,dvipsnames,table]{beamer}
\usepackage{lsci}
\usepackage{epigraph}

\title[Virtualization]{%
  Virtualization, Grids, \\
  and an introduction to AppPot
}
\author[R. Murri]{%
  \textbf{Riccardo Murri} \\
  Grid Computing Competence Center, \\
  Organisch-Chemisches Institut, \\
  University of Zurich
}
\date{Nov.~16,~2011}


\begin{document}
% title frame
\maketitle

\section{Introduction}

\begin{frame}
  \frametitle{Today's class}

  What's virtualization and why it is important in a Grid setting.
  
  \+
  Virtualization techniques on the x86 family processors, and ``User
  Mode Linux'' in particular.

  \+
  Introduction to the AppPot virtual image.

  \+
  {\small These slides are available for download from: 
    \url{http://www.gc3.uzh.ch/teaching/lsci2011/lecture11.pdf}}
\end{frame}

\section{What is virtualization}

\begin{frame}
  \frametitle{What is virtualization?}

  ``A virtual machine is taken to be an \emph{efficient, isolated
    duplicate} of a real machine.''

  \begin{references}\footnotesize{}
    G. J. Popek and R. P. Goldberg (1974): 
    ``Formal Requirements for
    Virtualizable Third Generation Architectures'' \\
    \emph{Communications of the ACM} 17 (7): 412--421.
  \end{references}
\end{frame}

\begin{frame}
  \frametitle{Virtualization characteristics}
  \begin{tabular}{>{\bfseries\raggedleft}p{0.25\linewidth}>{\raggedright\small\arraybackslash}p{0.75\linewidth}}
  {Fidelity}  &
  Software on the VM exhibits ``essentially
  identical'' effects to its execution on hardware.
  \\[1em]
  {Efficiency} &
  ``A statistically dominant subset of the
  virtual process instructions are executed directly by the real
  processor, with no software intervention.''
  \\[1em]
  {Resource control} &
  The VM control software manages
  hardware allocation, ``it is not possible for a program running in
  the VM to access any resource not explicitly assigned to it.''
  \end{tabular}
\end{frame}


\begin{frame}
  \frametitle{%
    \only<1>{Why virtualization?}
    \only<2>{Why virtualization \alert{in Grid infrastructures}?}
    \only<3>{(Why do \alert{IaaS clouds} use virtualization?)}
  }
  Execution environments with resource limits and/or resource guarantees. 
  \begin{itemize}
  \item \alert<2>{Provide secure, isolated sandboxes for running untrusted
    applications.}
  \end{itemize}

  \+ \alert<3>{Make systems independent of the hardware.}
  \begin{itemize}
  \item \alert<2>{Run legacy applications.}
  \item \alert<2>{Provide binary compatibility.}
  \end{itemize}

  \+ \alert<3>{Co-locate and consolidate independent workloads.}
  \begin{itemize}
  \item Run multiple operating systems.
  \end{itemize}

  \+ \alert<2>{Treat application suites as appliances by ``packaging'' and
    running each in a virtual machine.}
\end{frame}


\section{Virtualization techniques}

% virtualization today http://en.wikipedia.org/wiki/X86_virtualization
% - classical virtualization

\subsection{Classical virtualization}

\begin{frame}
  \frametitle{Classical virtualization, I}
  \emph{\small (Popek and Goldberg's model of a ``third-generation'' machine.)}

  \+ 
  The processor has two modes of operation: \textbf{supervisor
    mode} and \textbf{user mode}.  Only a \emph{subset} of all machine
  instructions is available in user mode.

  \+
  Memory addressing is:
  \begin{itemize}
  \item \emph{relative} to a relocation register
  \item \emph{limited} by a bounds register
  \end{itemize}
\end{frame}


\begin{frame}
  \frametitle{Classical virtualization, II}
  An instruction is \textbf{privileged} iff it can only be executed in
  supervisor mode.

  \+
  An instruction is \textbf{control sensitive} iff it attempts to alter
  the configuration of machine resources (e.g., change the memory
  bounds register).

  \+
  An instruction is \textbf{behavior sensitive} iff its effect depends
  on the configuration of resources (e.g., the memory bounds or
  relocation register).
\end{frame}


\begin{frame}
  \frametitle{Classical virtualization, III}
  An instruction is \textbf{trapping} if its execution causes a jump to
  supervisor code at a fixed location.

  \+
  {Theorem.} 
  {\bfseries If all sensitive instructions are trapping, 
    then a machine can be virtualized.}

  \+
  Basic idea: execute all instructions in user mode; the sensitive
  ones will trap and special supervisor code (the \emph{VMM} or
  \emph{control program}) can emulate the requested behavior.

  \+
  \emph{Note:} Popek and Goldberg's Theorem is a \emph{sufficient}
  condition, but not a \emph{necessary} one.
\end{frame}


\begin{frame}
  \frametitle{x86 is not classically virtualizable}
  There are sensitive but \emph{non-trapping} instructions.
  \begin{center}
    \includegraphics[height=0.8\textheight,angle=270]{lecture11/hotchips-page23}
  \end{center}

  \begin{references}
    \footnotesize{}
    J. E. Smith and R. Uhlig (2005), \\
  \emph{Virtual Machines: Architectures, Implementations and
    Applications},
  \url{http://www.hotchips.org/archives/hc17/1_Sun/HC17.T1P2.pdf}
  \end{references}
\end{frame}


\subsection{Binary translation}

\begin{frame}
  \frametitle{Binary Translation}
  Modify OS code as it runs: \emph{replace} problematic instructions
  with ``virtualization correct'' code.

  \+
  Extremely complex in practice.

  \+
  Successfully implemented by VMware at the end of the
  1990s.

  \begin{references}
    \footnotesize{}
    K. Adams and O. Agesen (2006),
    \emph{A comparison of software and hardware techniques for x86
      virtualization},
    \url{http://www.vmware.com/pdf/asplos235_adams.pdf}
  \end{references}
\end{frame}

\subsection{Hardware-assisted virtualization}

% - hw virtualization:
%   - special instruction sets
%     - AMD-V (Linux cpuinfo flag: svm)
%     - Intel VT-x (Linux cpuinfo flag: vmx)
%   - KVM
\begin{frame}
  \frametitle{Hardware-assisted virtualization}
  Around 2005, Intel and AMD introduced extensions to the x86
  instruction set to enable classical virtualization.

  \+ The CPU has an \emph{additional} protection bit, selecting
  \textbf{host} or \textbf{guest} mode.

  \+
  Code in \emph{host supervisor} mode can set which instructions in
  \emph{guest supervisor} mode should trap and on which occasion.

  \+
  The virtualized OS runs in \emph{guest supervisor} mode, and will
  not notice any difference with a non-virtualized CPU.
\end{frame}


\subsection{Paravirtualization}

\begin{frame}
  \frametitle{Paravirtualization}
  \begin{columns}
    \begin{column}{0.5\textwidth}
      \vspace{9em}
      \includegraphics[viewport=175 750 445 425,clip,height=\linewidth,angle=90]{lecture11/hotchips-page16}
    \end{column}
    \begin{column}{0.5\textwidth}
      Paravirtualization relies on a \emph{modified} guest OS:
      forward hardware access to host OS.
      
      \+ Notable examples in Linux are \href{http://xen.org}{Xen} and
      \href{http://user-mode-linux.sf.net}{User-Mode Linux}.
    \end{column}
  \end{columns}
  
  \+
  {\footnotesize
    (Diagram stolen from Smith and Uhlig's presentation
    \url{http://www.hotchips.org/archives/hc17/1_Sun/HC17.T1P2.pdf})}
\end{frame}


\section{User-Mode Linux}

\begin{frame}
  \frametitle{What is User-Mode Linux?}
  \href{http://user-mode-linux.sf.net/}{User-Mode Linux} is a Linux
  paravirtualization system, \emph{running entirely in
    user-space}.

  \+
  UML consists of a modified Linux kernel \emph{(guest)}, that runs as a
  process within another Linux system \emph{(host)}.
  
  \+
  Other than that, it's a regular kernel, so it can run \emph{any} Linux
  distribution with \emph{any} configuration.
\end{frame}


\begin{frame}
  \frametitle{UML features, I}
  Any file in the host system can be a block device (\emph{ubdX}).
  Uses \emph{copy-on-write}, so one filesystem image can be used by many
  UML instances concurrently.

  \+
  Can mount any directory in the host filesystem as a local
  \emph{hostfs} filesystem.
  
  \+
  Outbound net connectivity with a helper program (\emph{slirp}).
  
  \+
  Local networks of UML instances, backing on IP multicast.
\end{frame}


\begin{frame}
  \frametitle{UML features, II}
  Deployment advantages:
  \begin{itemize}
  \item The whole virtualization system is \emph{just one Linux program}.
  \item It can be compiled statically.
  \item Linux system images are just files.
  \item Runs entirely in userspace.
  \end{itemize}

  \+
  So deployment is basically: download and run!
\end{frame}

\begin{frame}
  \frametitle{UML architecture, I}
  The UML kernel runs in its own process, and spawns a child
  \emph{host} process for every process that is started in the
  \emph{guest}.

  \+
  The whole UML memory is allocated as \texttt{mmap()}'ed
  pages from a temporary file:
  \begin{itemize}
  \item the UML kernel creates a file the size of the requested memory;
  \item when it has to satisfy a memory allocation it
    just \texttt{mmap()}'s a page from that file.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{UML syscall handling, I}
  The UML kernel process \texttt{ptrace()}'s its child processes in
  order to trap system calls.

  \+
  \emph{1.} a guest process decides to make a system call, e.g., \texttt{mprotect()}
  to be able to write to the memory area where UML installed its hooks;
  
  \+ \emph{2.} the \texttt{SYSENTER} assembly instruction triggers a
  \texttt{ptrace} event: the guest process is stopped (by the \emph{host}
  kernel) and the UML \emph{guest} kernel process is notified via
  \texttt{SIGTRAP}.
\end{frame}

\begin{frame}
  \frametitle{UML syscall handling, II}
  \+ \emph{3.} the UML guest kernel process inspects the syscall
  arguments, translates them into an equivalent host syscall, and
  then ``patches'' the result back into the guest process memory.

  \+
  \emph{4.} the UML guest kernel sends a SIGCONT to the UML guest
  process, which continues execution.
\end{frame}


\section{AppPot}

\begin{frame}
  \frametitle{Use cases for UML in Grid infrastructures}
  Two basic problems:
  \begin{itemize}
  \item Deploying complex applications at Grid sites
  \item Running own-written code on a Grid
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Example: GAMESS(US)}
  
  \href{http://www.msg.ameslab.gov/gamess/}{GAMESS(US)} is a program
  for ab initio molecular quantum chemistry.

  \+
  \emph{Legacy build system, consisting of a number of C-shell scripts,
  FORTRAN code preprocessors (written themselves in FORTRAN),
  hand-editing files and much patience.}

  \+
  It's easy to get an incorrect build.

  \+
  And there's no validation suite, so you will only find out when
  users complain.
\end{frame}

\begin{frame}
  \frametitle{Problem: Complex application deployment}
  \textbf{Many scientific codes require experience to be built properly.}

  \+ This takes up precious sysadmin time.

  \+ Worse, it can lead to incorrect deployment of applications.

  \+ \emph{Worst}, on a large scale, wasted sysadmin time is multiplied
  by a factor of \emph{N}.
\end{frame}

\begin{frame}
  \frametitle{Example: Alpine 3D}
  \label{sec:5}
  
  Code developed by \href{http://www.wsl.ch/}{WSL} for
  \href{http://www.slf.ch/ueber/organisation/schnee_permafrost/projekte/Alpine/index_EN}
  {high resolution simulation of alpine surface processes}.
  
  \+
  Depends on the large
  \href{http://www.slideshare.net/GEOFRAMEcafe/meteo-io-introduction}{MeteoIO}
  library, plus some not-so-standard build procedure.
  
  \+
  Alpine3D is an ongoing development effort.  Developers need to test
  and validate new parts of Alpine3D with some well-defined simulation
  suites.

  \+
  Even simple simulations can take hours. \textbf{How can we support
    running \emph{modified} Alpine3D on a Grid?}
\end{frame}

\begin{frame}
  \frametitle{Problem: Running own-written code on a Grid}
   Users are actively working on a code, e.g., by implementing new
   algorithms.

   \+
   They can run basic tests on their laptop, but need more computing
   power for validation suites, etc.

   \+
   \textbf{Running on a heterogeneous grid poses interoperability and deployment problems}, 
   that users should not waste their time on.
\end{frame}


\begin{frame}
  \frametitle{AppPot}
  \begin{center}
    \Large
    Idea: 
    \\
    \textbf{Run a UML machine as a Grid job.}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{So what is AppPot?}
  AppPot consists of:
  \begin{itemize}
  \item a base image (with the AppPot boot script)
  \item a startup script \texttt{apppot-start}
  \item support programs \texttt{linux}, \texttt{slirp}, \texttt{empty}
  \item (Plus an ARC RTE specifying the location of all the above.)
  \end{itemize}

  \+
  You can run an AppPot UML machine either locally on your computer,
  or \emph{on a Grid through ARC}.
\end{frame}

\begin{frame}
  \frametitle{How does it help with\ldots{} \emph{(I)}}
  \label{sec:10}
  
  \emph{Complex application deployment?}
  
  \+
  An application expert creates an AppPot base image with the
  software correctly installed and validated.

  \+
  \textbf{Sysadmins just need to copy it into a file on the cluster.}
\end{frame}

\begin{frame}
  \frametitle{How does it help with\ldots{} \emph{(II)}}
  \label{sec:11}
  
  \emph{Running own code on a Grid?}

  \+
  A knowledgeable sysadmin creates a \emph{base image}.
  
  \+
  Users get a copy of the base image, install their code in it and do
  the development work.
  
  \+
  When they want to do a larger run, they \textbf{just run the modified image
  as a Grid job.}
\end{frame}

\begin{frame}
  \begin{center}
    \begin{Large}
      Can you foresee any issues here?
    \end{Large}
  \end{center}
\end{frame}
\begin{frame}
  \frametitle{\ldots{}\href{http://www.originalpoetry.com/the-storm_4}{can see the storm coming in the distance}}
  \label{sec:12}
  
  \begin{enumerate}
  \item How do I specify which command to run, exactly?
  \item Do I need to create a different filesystem image for each set of data?
  \item The filesystem image is \emph{large}, I cannot submit 100 jobs with 1GB payload each!
  \item How do I get the output out of the UML?
  \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{How do I specify which command to run?}
  \label{sec:13}
  
  AppPot comes with a standardized startup script, and a modified
  Linux boot script (to be installed as \texttt{/etc/rc.local}).
  
  Supports 4 different ways of executing a command:
  \begin{itemize}
  \item Give it on the command-line of \texttt{apppot-start.sh},
  \item or provide an \texttt{apppot-run} script in the current directory,
  \item or provide an \texttt{apppot-autorun} script \emph{within the AppPot image},
  \item else, starts a shell on the main console and you just type it.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Do I need to create a different filesystem image for each set of data?}
  \label{sec:14}
  
  AppPot automatically mounts the current directory on \texttt{/home/user/job}.  

  \+
  You can read and write files on the host filesystem through that.

  \+
  In ARC terms, the \emph{sessiondir} is mounted on \texttt{/home/user/job}.
  So you have access to all the files you specify as \texttt{inputFiles} and
  \texttt{outputFiles} in the xRSL.
\end{frame}

\begin{frame}
  \frametitle{The filesystem image is too large}
  \label{sec:15}
  AppPot supports a \emph{base} + \emph{changes} mechanism.
  
  \+
  The command \texttt{apppot-snap base} records a snapshot of the current
  system state (file sizes, timestamps, etc.).  This should be used by
  sysadmins / application experts when they are done preparing the
  base filesystem image.

  \+
  The command \texttt{apppot-snap changes} creates a tarball with all the
  modifications since the last recorded base.

  \+
  Users only submit the changes; the startup script automatically
  merges them into the running UML instance.
\end{frame}

\begin{frame}
  \frametitle{How do I get the output out of the UML?}
  \label{sec:16}
  Did I tell you the \emph{sessiondir} is mounted on \texttt{/home/user/job}? Just
  copy your output there.
  
  \+
  As for STDIN and STDOUT, the \texttt{apppot-start} script takes care of
  redirecting them to where ARC requests.
\end{frame}

\begin{frame}
  \frametitle{Example: GAMESS}
  \label{sec:18}
  Base image contains GAMESS, correctly installed for Debian.

  \+
  Startup script runs GAMESS on the input files specified by the user.
  
  \+
  No visible difference between calling this GAMESS appliance and the
  real application.
\end{frame}

% \begin{frame}
%   \frametitle{Example: GAMESS development}
%   \label{sec:19}

%   Base image contains GAMESS sources in \texttt{/home/user/gamess},
%   configured for compilation on Debian.  All needed development tools
%   installed. 
  
%   GAMESS developers can hack the sources, recompile at will and test
%   locally.
  
%   Then, they can send the \emph{changes} tarball as a Grid job to try out the
%   new version with some larger data set.
% \end{frame}

\begin{frame}
  \frametitle{Example: Alpine 3D}
  Usage model:
  \begin{itemize}
  \item Developers commit new code of the Alpine3D to SVN.
  \item They launch the validation suite:
    \begin{enumerate}
    \item create tarball from SVN repo (\texttt{A3D{\_}source.tgz})
    \item simulation suites already available on Grid storage
    \item submit \emph{A3D{\_}uml.xrsl} job, using \emph{A3D} AppPot
      image available on SMSCG sites
    \item fetch results when job is done
    \end{enumerate}
  \end{itemize}
\end{frame}  

% \begin{frame}
%   \frametitle{Example: Alpine 3D}
%   \label{sec:17}
  
%   Specialized base image with a predefined startup script \emph{inside}:
%   \begin{itemize}
%   \item expects a tarball with the (modified) sources to be available in
%     the sessiondir
%   \item expects simulation data to be available there as well
%   \item compiles the sources and run the simulation with it, reporting on
%     results.
%   \end{itemize}
  
%   Intended as a base image to be available at Grid sites, for \emph{both:}
%   \begin{itemize}
%   \item Running a simulation with unmodified Alpine 3D code (for A3D users)
%   \item Running tests of customized A3D code (for developers)
%   \end{itemize}
% \end{frame}

\begin{frame}
  \frametitle{Issues}
  Not as sysadmin-neutral as expected:
  \begin{itemize}
  \item Unusual and heavy use of \texttt{mmap()} requires tweaking the
    Linux default parameters.
  \item Shared memory allocation defaults to \texttt{/dev/shm}
    filesystem, which limits the amount of memory of the guests to
    half of what is available in the host.
  \end{itemize}

  \+
  Batch systems assume a ``no shared'' memory process, which leads
  to miscomputation of the memory used by UML. (So the sysadmin needs
  to turn off memory limits for AppPot processes.)

\end{frame}

% \begin{frame}
%   \frametitle{Future work \emph{(I)}}
%   \label{sec:21}
%   \textbf{Support MPI.}  Two issues:

%   \begin{itemize}
%   \item UML-to-UML networking backs on IP multicast. Does the cluster
%     network fabric support it?
%   \item The startup script has to extract host allocation details
%     for each and every batch system / MPI library combination.
%   \end{itemize}
% \end{frame}



\section{The End}

\begin{frame}
  \frametitle{Any questions?}
  \label{sec:23}
  \begin{center}
    AppPot home page: \href{http://AppPot.googlecode.com}{http://AppPot.googlecode.com}

    \+
    Source code: \texttt{svn co http://AppPot.googlecode.com/svn}
    
    \+
    e-mail: \texttt{riccardo.murri@gmail.com}, \texttt{sergio.maffioletti@gc3.uzh.ch}
    
    \+
    \textbf{Thank you!}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{References}

  \+
  {\small A commented list of online resources on the course Wiki:
    \url{http://www.gc3.uzh.ch/teaching/lsci2011/lecture11.html}}
\end{frame}

\end{document}


%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: 

