% Sandia National Laboratories technical report SAND2009-6753 (October 2009)
% describing a redundant MPI layer library that runs redundant computations
% on additional nodes to reduce application interrupts.
@TechReport{ferreira:2009:rmpi-tr,
  author      = {Ferreira, Kurt and Riesen, Rolf and Oldfield, Ron and
  Stearley, Jon and Laros, James and Pedretti, Kevin and Brightwell, Ron and
  Kordenbrock, Todd},
  title       = {Increasing Fault Resiliency in a Message-Passing Environment},
  institution = {Sandia National Laboratories},
  number      = {SAND2009-6753},
  year        = {2009},
  month       = oct,
  keywords    = {redundancy, resilience},
  abstract    = {Petaflops systems will have tens to hundreds of thousands of
  compute nodes which increases the likelihood of faults. Applications use
  checkpoint/restart to recover from these faults, but even under ideal
  conditions, applications running on more than 30,000 nodes will likely spend
  more than half of their total run time saving checkpoints, restarting, and
  redoing work that was lost. We created a library that performs redundant
  computations on additional nodes allocated to the application. An active node
  and its redundant partner form a node bundle which will only fail, and cause
  an application restart, when both nodes in the bundle fail. The goal of this
  library is to learn whether this can be done entirely at the user level, what
  requirements this library places on a Reliability, Availability, and
  Serviceability (RAS) system, and what its impact on performance and run time
  is. We find that our redundant MPI layer library imposes a relatively modest
  performance penalty for applications, but that it greatly reduces the number
  of applications interrupts. This reduction in interrupts leads to huge
  savings in restart and rework time. For large-scale applications the savings
  compensate for the performance loss and the additional nodes required for
  redundant computations.}
}