% Sandia technical report SAND2010-0040 (LDRD final report) on checkpoint I/O
% using lightweight storage and overlay networks.
@TechReport{oldfield:2010:ft-ldrd-tr,
  author = {Oldfield, Ron A.},
  title = {Lightweight Storage and Overlay Networks for Fault Tolerance},
  year = {2010},
  month = jan,
  number = {SAND2010-0040},
  institution = {Sandia National Laboratories},
  address = {Albuquerque, NM},
  note = {LDRD Final Report},
  keywords = {checkpoint, overlay network, analytic modeling, LWFS, pario-bib},
  abstract = {The next generation of capability-class, massively parallel
  processing (MPP) systems is expected to have hundreds of thousands to
  millions of processors. In such environments, it is critical to have
  fault-tolerance mechanisms, including checkpoint/restart, that scale with the
  size of applications and the percentage of the system on which the
  applications execute. For application-driven, periodic checkpoint operations,
  the state-of-the-art does not provide a scalable solution. For example, on
  today's massive-scale systems that execute applications which consume most of
  the memory of the employed compute nodes, checkpoint operations generate I/O
  that consumes nearly 80\% of the total I/O usage. Motivated by this
  observation, this project aims to improve I/O performance for
  application-directed checkpoints through the use of lightweight storage
  architectures and overlay networks. Lightweight storage provide direct access
  to underlying storage devices. Overlay networks provide caching and
  processing capabilities in the compute-node fabric. The combination has
  potential to significantly reduce I/O overhead for large-scale applications.
  This report describes our combined efforts to model and understand overheads
  for application-directed checkpoints, as well as implementation and
  performance analysis of a checkpoint service that uses available compute
  nodes as a network cache for checkpoint operations.}
}