% Sandia technical report on ADIOS-style advanced I/O (Lofstead et al., 2009).
% The first author's generational suffix is written in "Last, Jr, First" form
% so that BibTeX parses "Lofstead" (not "II") as the surname.
@TechReport{lofstead:2009:adios-tr,
  author = {Lofstead, II, Gerald F. and Schwan, Karsten and Klasky, Scott and
  Oldfield, Ron A.},
  title = {Advanced {I/O} for Large-Scale Scientific Applications},
  year = {2009},
  month = dec,
  number = {SAND2009-7763},
  institution = {Sandia National Laboratories},
  address = {Albuquerque, NM},
  keywords = {ADIOS, performance analysis, application programmer interface,
  pario-bib},
  abstract = {As scientific simulations scale to use petascale machines and
  beyond, the data volumes generated pose a dual problem. First, with
  increasing machine sizes, the careful tuning of IO routines becomes more and
  more important to keep the time spent in IO acceptable. It is not uncommon,
  for instance, to have 20\% of an application's runtime spent performing IO in
  a `tuned' system. Careful management of the IO routines can move that to 5\%
  or even less in some cases. Second, the data volumes are so large, on the
  order of 10s to 100s of TB, that trying to discover the scientifically valid
  contributions requires assistance at runtime to both organize and annotate
  the data. Waiting for offline processing is not feasible due both to the
  impact on the IO system and the time required. To reduce this load and
  improve the ability of scientists to use the large amounts of data being
  produced, new techniques for data management are required. First, there is a
  need for techniques for efficient movement of data from the compute space to
  storage. These techniques should understand the underlying system
  infrastructure and adapt to changing system conditions. Technologies include
  aggregation networks, data staging nodes for a closer parity to the IO
  subsystem, and autonomic IO routines that can detect system bottlenecks and
  choose different approaches, such as splitting the output into multiple
  targets, staggering output processes. Such methods must be end-to-end,
  meaning that even with properly managed asynchronous techniques, it is still
  essential to properly manage the later synchronous interaction with the
  storage system to maintain acceptable performance. Second, for the data being
  generated, annotations and other metadata must be incorporated to help the
  scientist understand output data for the simulation run as a whole, to select
  data and data features without concern for what files or other storage
  technologies were employed. All of these features should be attained while
  maintaining a simple deployment for the science code and eliminating the need
  for allocation of additional computational resources.}
}