@InProceedings{oldfield:2007:modeling_checkpoints,
  author = {Ron A. Oldfield and Sarala Arunagiri and Patricia J. Teller and
  Seetharami Seelam and Rolf Riesen and Maria Ruiz Varela and Philip C. Roth},
  title = {Modeling the Impact of Checkpoints on Next-Generation Systems},
  booktitle = {Proceedings of the 24th IEEE Conference on Mass Storage Systems
  and Technologies},
  year = {2007},
  month = {September},
  address = {San Diego, CA},
  doi = {10.1109/MSST.2007.4367962},
  url = {https://doi.org/10.1109/MSST.2007.4367962},
  keywords = {performance modeling, optimal checkpoint interval, I/O
  performance, fault-tolerance, checkpointing, LWFS, pario-bib},
  abstract = {The next generation of capability-class, massively parallel
  processing (MPP) systems is expected to have hundreds of thousands of
  processors. For application-driven, periodic checkpoint operations, the
  state-of-the-art does not provide a solution that scales to next-generation
  systems. We demonstrate this by using mathematical modeling to compute a
  lower bound on the impact of these approaches on the performance of
  applications executed on three massive-scale, in-production DOE systems and
  a theoretical petaflop system. We also adapt the model to investigate a
  proposed optimization that makes use of ``lightweight'' storage architectures
  and overlay networks to overcome the storage system bottleneck. Our results
  indicate that (1) as we approach the scale of next-generation systems,
  traditional checkpoint/restart approaches will increasingly impact
  application performance, accounting for over 50\% of total application
  execution time; (2) although our alternative approach improves performance,
  it has limitations of its own; and (3) there is a critical need for new
  approaches to checkpoint/restart that allow continuous computing with minimal
  impact on the scalability of applications.}
}
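
For context on the abstract's claim that checkpointing can consume over half of
application execution time, the following is a minimal sketch of the standard
first-order periodic-checkpoint model (Young's approximation); the paper's own
model and parameter values may differ, and the symbols below are assumptions
introduced here for illustration:
\[
\tau_{\mathrm{opt}} \approx \sqrt{2\,\delta\,M}, \qquad
\text{overhead fraction} \;\approx\; \frac{\delta}{\tau} + \frac{\tau}{2M},
\]
where $\delta$ is the assumed time to write one checkpoint, $M$ the assumed
system mean time between failures, and $\tau$ the checkpoint interval. At the
optimal interval the overhead fraction is roughly $\sqrt{2\delta/M}$, so as
checkpoint cost grows relative to the (shrinking) system MTBF on larger
machines, the fraction climbs toward the 50\%-plus regime the abstract
describes.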