% Sandia LDRD final report (SAND2010-0040): lightweight storage and overlay
% networks for application-directed checkpoint I/O.
@TechReport{oldfield:2010:ft-ldrd-tr,
  author      = {Oldfield, Ron A.},
  title       = {Lightweight Storage and Overlay Networks for Fault Tolerance},
  year        = {2010},
  month       = jan,
  number      = {SAND2010-0040},
  institution = {Sandia National Laboratories},
  address     = {Albuquerque, NM},
  note        = {LDRD Final Report},
  keywords    = {checkpoint, overlay network, analytic modeling, LWFS, pario-bib},
  abstract    = {The next generation of capability-class, massively parallel
    processing (MPP) systems is expected to have hundreds of thousands to
    millions of processors. In such environments, it is critical to have
    fault-tolerance mechanisms, including checkpoint/restart, that scale with
    the size of applications and the percentage of the system on which the
    applications execute. For application-driven, periodic checkpoint
    operations, the state-of-the-art does not provide a scalable solution. For
    example, on today's massive-scale systems that execute applications which
    consume most of the memory of the employed compute nodes, checkpoint
    operations generate I/O that consumes nearly 80\% of the total I/O usage.
    Motivated by this observation, this project aims to improve I/O
    performance for application-directed checkpoints through the use of
    lightweight storage architectures and overlay networks. Lightweight
    storage provide direct access to underlying storage devices. Overlay
    networks provide caching and processing capabilities in the compute-node
    fabric. The combination has potential to significantly reduce I/O overhead
    for large-scale applications. This report describes our combined efforts
    to model and understand overheads for application-directed checkpoints, as
    well as implementation and performance analysis of a checkpoint service
    that uses available compute nodes as a network cache for checkpoint
    operations.},
}