@Preamble{
"\ifx \undefined \circled \def \circled #1{(#1)}\fi" #
"\ifx \undefined \reg \def \reg {\circled{R}}\fi" #
"\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi"
}
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|http://www.math.utah.edu/~beebe/|"}
@String{pub-ACM = "ACM Press"}
@String{pub-ACM:adr = "New York, NY 10036, USA"}
@String{pub-IEEE = "IEEE Computer Society Press"}
@String{pub-IEEE:adr = "1109 Spring Street, Suite 300,
Silver Spring, MD 20910, USA"}
@InProceedings{Chhugani:2012:BPS,
author = "Jatin Chhugani and Changkyu Kim and Hemant Shukla and
Jongsoo Park and Pradeep Dubey and John Shalf and Horst
D. Simon",
title = "Billion-particle {SIMD}-friendly two-point correlation
on large-scale {HPC} cluster systems",
crossref = "Hollingsworth:2012:SPI",
pages = "1:1--1:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a002.pdf",
abstract = "Two-point Correlation Function (TPCF) is widely used
in astronomy to characterize the distribution of
matter/energy in the Universe, and help derive the
physics that can trace back to the creation of the
universe. However, it is prohibitively slow for current
sized datasets, and would continue to be a critical
bottleneck with the trend of increasing dataset sizes
to billions of particles and more, which makes TPCF a
compelling benchmark application for future exa-scale
architectures. State-of-the-art TPCF implementations do
not map well to the underlying SIMD hardware, and also
suffer from load-imbalance for large core counts. In
this paper, we present a novel SIMD-friendly histogram
update algorithm that exploits the spatial locality of
histogram updates to achieve near-linear SIMD scaling.
We also present a load-balancing scheme that combines
domain-specific initial static division of work and
dynamic task migration across nodes to effectively
balance computation across nodes. Using the Zin
supercomputer at Lawrence Livermore National Laboratory
(25,600 cores of Intel\reg{} Xeon\reg{} E5-2670, each
with 256-bit SIMD), we achieve 90\% parallel efficiency
and 96\% SIMD efficiency, and perform TPCF computation
on a 1.7 billion particle dataset in 5.3 hours (at
least 35x faster than previous approaches). In terms
of cost per performance (measured in flops/\$), we
achieve at least an order-of-magnitude (11.1x) higher
flops/\$ compared to the best known results [1].
Consequently, we now have line-of-sight to achieving
the processing power needed to run correlation
computations on billion+ particle telescopic datasets.",
acknowledgement = ack-nhfb,
articleno = "1",
}
@InProceedings{Mirin:2012:TRT,
author = "Arthur A. Mirin and David F. Richards and James N.
Glosli and Erik W. Draeger and Bor Chan and Jean-luc
Fattebert and William D. Krauss and Tomas Oppelstrup
and John Jeremy Rice and John A. Gunnels and
Viatcheslav Gurev and Changhoan Kim and John Magerlein
and Matthias Reumann and Hui-Fang Wen",
title = "Toward real-time modeling of human heart ventricles at
cellular resolution: simulation of drug-induced
arrhythmias",
crossref = "Hollingsworth:2012:SPI",
pages = "2:1--2:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a001.pdf",
abstract = "We have developed a highly efficient and scalable
cardiac electrophysiology simulation capability that
supports groundbreaking resolution and detail to
elucidate the mechanisms of sudden cardiac death from
arrhythmia. We can simulate thousands of heartbeats at
a resolution of 0.1 mm, comparable to the size of
cardiac cells, thereby enabling scientific inquiry not
previously possible. Based on scaling results from the
partially deployed Sequoia IBM Blue Gene/Q machine at
Lawrence Livermore National Laboratory and planned
optimizations, we estimate that by SC12 we will
simulate 8--10 heartbeats per minute --- a
time-to-solution 400--500 times faster than the
state-of-the-art. Performance between 8 and 11 PFlop/s
on the full 1,572,864 cores is anticipated,
representing 40--55 percent of peak. The power of the
model is demonstrated by illuminating the subtle
arrhythmogenic mechanisms of anti-arrhythmic drugs that
paradoxically increase arrhythmias in some patient
populations.",
acknowledgement = ack-nhfb,
articleno = "2",
}
@InProceedings{Bui-Thanh:2012:ESU,
author = "Tan Bui-Thanh and Carsten Burstedde and Omar Ghattas
and James Martin and Georg Stadler and Lucas C.
Wilcox",
title = "Extreme-scale {UQ} for {Bayesian} inverse problems
governed by {PDEs}",
crossref = "Hollingsworth:2012:SPI",
pages = "3:1--3:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a003.pdf",
abstract = "Quantifying uncertainties in large-scale simulations
has emerged as the central challenge facing CS{\&}E.
When the simulations require supercomputers, and
uncertain parameter dimensions are large, conventional
UQ methods fail. Here we address uncertainty
quantification for large-scale inverse problems in a
Bayesian inference framework: given data and model
uncertainties, find the probability density function (pdf) describing parameter
uncertainties. To overcome the curse of dimensionality
of conventional methods, we exploit the fact that the
data are typically informative about low-dimensional
manifolds of parameter space to construct low-rank
approximations of the covariance matrix of the
posterior pdf via a matrix-free randomized method. We
obtain a method that scales independently of the
forward problem dimension, the uncertain parameter
dimension, the data dimension, and the number of cores.
We apply the method to the Bayesian solution of an
inverse problem in 3D global seismic wave propagation
with over one million uncertain earth model parameters,
630 million wave propagation unknowns, on up to 262K
cores, for which we obtain a factor of over 2000
reduction in problem dimension. This makes UQ tractable
for the inverse problem.",
acknowledgement = ack-nhfb,
articleno = "3",
}
@InProceedings{Habib:2012:UES,
author = "Salman Habib and Vitali Morozov and Hal Finkel and
Adrian Pope and Katrin Heitmann and Kalyan Kumaran and
Tom Peterka and Joe Insley and David Daniel and
Patricia Fasel and Nicholas Frontiere and Zarija
Luki{\'c}",
title = "The universe at extreme scale: multi-petaflop sky
simulation on the {BG/Q}",
crossref = "Hollingsworth:2012:SPI",
pages = "4:1--4:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a004.pdf",
abstract = "Remarkable observational advances have established a
compelling cross-validated model of the Universe. Yet,
two key pillars of this model --- dark matter and dark
energy --- remain mysterious. Next-generation sky
surveys will map billions of galaxies to explore the
physics of the 'Dark Universe'. Science requirements
for these surveys demand simulations at extreme scales;
these will be delivered by the HACC (Hybrid/Hardware
Accelerated Cosmology Code) framework. HACC's novel
algorithmic structure allows tuning across diverse
architectures, including accelerated and multi-core
systems. On the IBM BG/Q, HACC attains unprecedented
scalable performance --- currently 6.23 PFlops at 62\%
of peak and 92\% parallel efficiency on 786,432 cores
(48 racks) --- at extreme problem sizes with up to
almost two trillion particles, larger than any
cosmological simulation yet performed. HACC simulations
at these scales will for the first time enable tracking
individual galaxies over the entire volume of a
cosmological survey.",
acknowledgement = ack-nhfb,
articleno = "4",
}
@InProceedings{Ishiyama:2012:PAB,
author = "Tomoaki Ishiyama and Keigo Nitadori and Junichiro
Makino",
title = "4.45 Pflops astrophysical {$N$}-body simulation on {K}
computer: the gravitational trillion-body problem",
crossref = "Hollingsworth:2012:SPI",
pages = "5:1--5:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a005.pdf",
abstract = "As an entry for the 2012 Gordon-Bell performance
prize, we report performance results of astrophysical
$N$-body simulations of one trillion particles
performed on the full system of the K computer. This is the first
gravitational trillion-body simulation in the world. We
describe the scientific motivation, the numerical
algorithm, the parallelization strategy, and the
performance analysis. Unlike many previous Gordon-Bell
prize winners that used the tree algorithm for
astrophysical $N$-body simulations, we used the hybrid
TreePM method, which achieves a similar level of accuracy:
the short-range force is calculated by the tree
algorithm, and the long-range force is solved by the
particle-mesh algorithm. We developed a highly tuned
gravity kernel for short-range forces, and a novel
communication algorithm for long-range forces. The
average performance on 24,576 and 82,944 nodes of the K
computer is 1.53 and 4.45 Pflops, respectively, which
corresponds to 49\% and 42\% of the peak speed.",
acknowledgement = ack-nhfb,
articleno = "5",
}
@InProceedings{Henschel:2012:DLW,
author = "Robert Henschel and Stephen Simms and David Hancock
and Scott Michael and Tom Johnson and Nathan Heald and
Thomas William and Donald Berry and Matt Allen and
Richard Knepper and Matthew Davy and Matthew Link and
Craig A. Stewart",
title = "Demonstrating {Lustre} over a {100Gbps} wide area
network of 3,500km",
crossref = "Hollingsworth:2012:SPI",
pages = "6:1--6:8",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a006.pdf",
abstract = "As part of the SCinet Research Sandbox at the
Supercomputing 2011 conference, Indiana University (IU)
demonstrated use of the Lustre high performance
parallel file system over a dedicated 100 Gbps wide
area network (WAN) spanning more than 3,500 km (2,175
mi). This demonstration functioned as a proof of
concept and provided an opportunity to study Lustre's
performance over a 100 Gbps WAN. To characterize the
performance of the network and file system, low-level
iperf network tests, file system tests with the IOR
benchmark, and a suite of real-world applications
reading and writing to the file system were run over a
latency of 50.5 ms. In this article we describe the
configuration and constraints of the demonstration and
outline key findings.",
acknowledgement = ack-nhfb,
articleno = "6",
}
@InProceedings{Meister:2012:SDD,
author = "Dirk Meister and J{\"u}rgen Kaiser and Andre Brinkmann
and Toni Cortes and Michael Kuhn and Julian Kunkel",
title = "A study on data deduplication in {HPC} storage
systems",
crossref = "Hollingsworth:2012:SPI",
pages = "7:1--7:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a008.pdf",
abstract = "Deduplication is a storage saving technique that is
highly successful in enterprise backup environments. On
a file system, a single data block might be stored
multiple times across different files, for example,
multiple versions of a file might exist that are mostly
identical. With deduplication, this data replication is
localized and redundancy is removed --- by storing data
just once, all files that use identical regions refer
to the same unique data. The most common approach
splits file data into chunks and calculates a
cryptographic fingerprint for each chunk. By checking
if the fingerprint has already been stored, a chunk is
classified as redundant or unique. Only unique chunks
are stored. This paper presents the first study on the
potential of data deduplication in HPC centers, which
are among the most demanding storage producers. We have
quantitatively assessed this potential for capacity
reduction for 4 data centers (BSC, DKRZ, RENCI, RWTH).
In contrast to previous deduplication studies focusing
mostly on backup data, we have analyzed over one PB
(1212 TB) of online file system data. The evaluation
shows that typically 20\% to 30\% of this online data
can be removed by applying data deduplication
techniques, peaking at up to 70\% for some data sets. This
reduction can only be achieved by a subfile
deduplication approach, while approaches based on
whole-file comparisons only lead to small capacity
savings.",
acknowledgement = ack-nhfb,
articleno = "7",
}
@InProceedings{Xie:2012:COB,
author = "Bing Xie and Jeffrey Chase and David Dillow and Oleg
Drokin and Scott Klasky and Sarp Oral and Norbert
Podhorszki",
title = "Characterizing output bottlenecks in a supercomputer",
crossref = "Hollingsworth:2012:SPI",
pages = "8:1--8:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a007.pdf",
abstract = "Supercomputer I/O loads are often dominated by writes.
HPC (High Performance Computing) file systems are
designed to absorb these bursty outputs at high
bandwidth through massive parallelism. However, the
delivered write bandwidth often falls well below the
peak. This paper characterizes the data absorption
behavior of a center-wide shared Lustre parallel file
system on the Jaguar supercomputer. We use a
statistical methodology to address the challenges of
accurately measuring a shared machine under production
load and to obtain the distribution of bandwidth across
samples of compute nodes, storage targets, and time
intervals. We observe and quantify limitations from
competing traffic, contention on storage servers and
I/O routers, concurrency limitations in the client
compute node operating systems, and the impact of
variance (stragglers) on coupled output such as
striping. We then examine the implications of our
results for application performance and the design of
I/O middleware systems on shared supercomputers.",
acknowledgement = ack-nhfb,
articleno = "8",
}
@InProceedings{Mustafa:2012:PSL,
author = "Dheya Mustafa and Rudolf Eigenmann",
title = "Portable section-level tuning of compiler parallelized
applications",
crossref = "Hollingsworth:2012:SPI",
pages = "9:1--9:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a012.pdf",
abstract = "Automatic parallelization of sequential programs
combined with tuning is an alternative to manual
parallelization. This method has the potential to
substantially increase productivity and is thus of
critical importance for exploiting the increased
computational power of today's multicores. A key
difficulty is that parallelizing compilers are
generally unable to estimate the performance impact of
an optimization on a whole program or a program section
at compile time; hence, the ultimate performance
decision today rests with the developer. Building an
autotuning system to remedy this situation is not a
trivial task. This work presents a portable empirical
autotuning system that operates at program-section
granularity and partitions the compiler options into
groups that can be tuned independently. To our
knowledge, this is the first approach delivering an
autoparallelization system that ensures performance
improvements for nearly all programs, eliminating the
users' need to experiment with such tools to strive for
the highest application performance.",
acknowledgement = ack-nhfb,
articleno = "9",
}
@InProceedings{Jordan:2012:MOA,
author = "Herbert Jordan and Peter Thoman and Juan J. Durillo
and Simone Pellegrini and Philipp Gschwandtner and
Thomas Fahringer and Hans Moritsch",
title = "A multi-objective auto-tuning framework for parallel
codes",
crossref = "Hollingsworth:2012:SPI",
pages = "10:1--10:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a013.pdf",
abstract = "In this paper we introduce a multi-objective
auto-tuning framework comprising compiler and runtime
components. Focusing on individual code regions, our
compiler uses a novel search technique to compute a set
of optimal solutions, which are encoded into a
multi-versioned executable. This enables the runtime
system to choose specifically tuned code versions when
dynamically adjusting to changing circumstances. We
demonstrate our method by tuning loop tiling in
cache-sensitive parallel programs, optimizing for both
runtime and efficiency. Our static optimizer finds
solutions matching or surpassing those determined by
exhaustively sampling the search space on a regular
grid, while using less than 4\% of the computational
effort on average. Additionally, we show that
parallelism-aware multi-versioning approaches like our
own gain a performance improvement of up to 70\% over
solutions tuned for only one specific number of
threads.",
acknowledgement = ack-nhfb,
articleno = "10",
}
@InProceedings{Christen:2012:PCH,
author = "Matthias Christen and Olaf Schenk and Yifeng Cui",
title = "{Patus} for convenient high-performance stencils:
evaluation in earthquake simulations",
crossref = "Hollingsworth:2012:SPI",
pages = "11:1--11:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a014.pdf",
abstract = "Patus is a code generation and auto-tuning framework
for stencil computations targeting modern multi and
many-core processors. The goals of the framework are
productivity and portability for achieving high
performance on the target platform. Its stencil
specification language allows the programmer to express
the computation in a concise way independently of
hardware architecture-specific details. Thus, it
increases programmer productivity by removing the
need for manual low-level tuning. We illustrate the
impact of the stencil code generation in seismic
applications, for which both weak and strong scaling
are important. We evaluate the performance by focusing
on a scalable discretization of the wave equation and
testing complex simulation types of the AWP-ODC code to
aim at excellent parallel efficiency, preparing for
petascale 3-D earthquake calculations.",
acknowledgement = ack-nhfb,
articleno = "11",
}
@InProceedings{Beamer:2012:DOB,
author = "Scott Beamer and Krste Asanovi{\'c} and David
Patterson",
title = "Direction-optimizing breadth-first search",
crossref = "Hollingsworth:2012:SPI",
pages = "12:1--12:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a019.pdf",
abstract = "Breadth-First Search is an important kernel used by
many graph-processing applications. In many of these
emerging applications of BFS, such as analyzing social
networks, the input graphs are low-diameter and
scale-free. We propose a hybrid approach that is
advantageous for low-diameter graphs, which combines a
conventional top-down algorithm along with a novel
bottom-up algorithm. The bottom-up algorithm can
dramatically reduce the number of edges examined, which
in turn accelerates the search as a whole. On a
multi-socket server, our hybrid approach demonstrates
speedups of 3.3--7.8 on a range of standard synthetic
graphs and speedups of 2.4--4.6 on graphs from real
social networks when compared to a strong baseline. We
also typically double the performance of prior leading
shared memory (multicore and GPU) implementations.",
acknowledgement = ack-nhfb,
articleno = "12",
}
@InProceedings{Checconi:2012:BSS,
author = "Fabio Checconi and Fabrizio Petrini and Jeremiah
Willcock and Andrew Lumsdaine and Anamitra Roy
Choudhury and Yogish Sabharwal",
title = "Breaking the speed and scalability barriers for graph
exploration on distributed-memory machines",
crossref = "Hollingsworth:2012:SPI",
pages = "13:1--13:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a020.pdf",
abstract = "In this paper, we describe the challenges involved in
designing a family of highly-efficient Breadth-First
Search (BFS) algorithms and in optimizing these
algorithms on the latest two generations of Blue Gene
machines, Blue Gene/P and Blue Gene/Q. With our recent
winning Graph 500 submissions in November 2010, June
2011, and November 2011, we have achieved unprecedented
scalability results in both space and size. On Blue
Gene/P, we have been able to parallelize a scale 38
problem with 2$^{38}$ vertices and 2$^{42}$ edges on
131,072 processing cores. Using only four racks of an
experimental configuration of Blue Gene/Q, we have
achieved a processing rate of 254 billion edges per
second on 65,536 processing cores. This paper describes
the algorithmic design and the main classes of
optimizations that we have used to achieve these
results.",
acknowledgement = ack-nhfb,
articleno = "13",
}
@InProceedings{Satish:2012:LSE,
author = "Nadathur Satish and Changkyu Kim and Jatin Chhugani
and Pradeep Dubey",
title = "Large-scale energy-efficient graph traversal: a path
to efficient data-intensive supercomputing",
crossref = "Hollingsworth:2012:SPI",
pages = "14:1--14:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a018.pdf",
abstract = "Graph traversal is a widely used algorithm in a
variety of fields, including social networks, business
analytics, and high-performance computing among others.
There has been a push for HPC machines to be rated not
just in Petaflops, but also in ``GigaTEPS'' (billions
of traversed edges per second), and the Graph500
benchmark has been established for this purpose. Graph
traversal on single nodes has been well studied and
optimized on modern CPU architectures. However, current
cluster implementations suffer from high-latency data
communication with large volumes of transfers across
nodes, leading to inefficiency in performance and
energy consumption. In this work, we show that we can
overcome these constraints using a combination of
efficient low-overhead data compression techniques to
reduce transfer volumes along with latency-hiding
techniques. Using an optimized single node graph
traversal algorithm [1], our novel cluster
optimizations result in over 6.6X performance
improvements over state-of-the-art data transfer
techniques, and almost an order of magnitude in energy
savings. Our resulting implementation of the Graph500
benchmark achieves 115 GigaTEPS on a 320-node/5120 core
Intel\reg{} Endeavor cluster with E5-2700 Sandybridge
nodes, which matches the second ranked result in the
most recent November 2011 Graph500 list [2] with about
5.6X fewer nodes. Our cluster optimizations incur only a
1.8X overhead in overall performance relative to the
performance of the optimized single-node
implementation, and allow for near-linear scaling with
the number of nodes. Our algorithm on 1024 nodes based on the
Intel\reg{} Xeon\reg{} X5670 Westmere processor (with
lower per-node performance) for a large multi-Terabyte
graph attained 195 GigaTEPS in performance, proving the
high scalability of our algorithm. Our per-node
performance is the highest in the top 10 of the Nov
2011 Graph500 list.",
acknowledgement = ack-nhfb,
articleno = "14",
}
@InProceedings{Levesque:2012:HEA,
author = "John M. Levesque and Ramanan Sankaran and Ray Grout",
title = "Hybridizing {S3D} into an exascale application using
{OpenACC}: an approach for moving to multi-petaflops
and beyond",
crossref = "Hollingsworth:2012:SPI",
pages = "15:1--15:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a040.pdf",
abstract = "Hybridization is the process of converting an
application with a single level of parallelism to an
application with multiple levels of parallelism. Over
the past 15 years a majority of the applications that
run on High Performance Computing systems have employed
MPI for all of the parallelism within the application.
In the Peta-Exascale computing regime, effective
utilization of the hardware requires multiple levels of
parallelism matched to the macro architecture of the
system to achieve good performance. A hybridized code
base is performance portable when sufficient
parallelism is expressed in an architecture agnostic
form to achieve good performance on a range of
available systems. The hybridized S3D code is
performance portable across today's leading many core
and GPU accelerated systems. The OpenACC framework
allows a unified code base to be deployed on either
manycore CPU or manycore CPU+GPU systems, while
permitting architecture-specific optimizations that
expose new dimensions of parallelism.",
acknowledgement = ack-nhfb,
articleno = "15",
}
@InProceedings{Hejazialhosseini:2012:HTS,
author = "Babak Hejazialhosseini and Diego Rossinelli and
Christian Conti and Petros Koumoutsakos",
title = "High throughput software for direct numerical
simulations of compressible two-phase flows",
crossref = "Hollingsworth:2012:SPI",
pages = "16:1--16:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a039.pdf",
abstract = "We present an open source, object-oriented software
for high throughput Direct Numerical Simulations of
compressible, two-phase flows. The Navier--Stokes
equations are discretized on uniform grids using high
order finite volume methods. The software exploits
recent CPU micro-architectures by explicit
vectorization and adopts NUMA-aware techniques as well
as data and computation reordering. We report a
compressible flow solver with unprecedented fractions
of peak performance: 45\% of the peak for a single node
(nominal performance of 840 GFLOP/s) and 30\% for a
cluster of 47,000 cores (nominal performance of 0.8
PFLOP/s). We suggest that the present work may serve as
a performance upper bound, regarding achievable
GFLOP/s, for two-phase flow solvers using adaptive mesh
refinement. The software enables 3D simulations of
shock-bubble interaction including, for the first time,
effects of diffusion and surface tension, by
efficiently employing two hundred billion computational
elements.",
acknowledgement = ack-nhfb,
articleno = "16",
}
@InProceedings{Islam:2012:MSC,
author = "Tanzima Zerin Islam and Kathryn Mohror and Saurabh
Bagchi and Adam Moody and Bronis R. de Supinski and
Rudolf Eigenmann",
title = "{McrEngine}: a scalable checkpointing system using
data-aware aggregation and compression",
crossref = "Hollingsworth:2012:SPI",
pages = "17:1--17:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a023.pdf",
abstract = "High performance computing (HPC) systems use
checkpoint-restart to tolerate failures. Typically,
applications store their states in checkpoints on a
parallel file system (PFS). As applications scale up,
checkpoint-restart incurs high overheads due to
contention for PFS resources. The high overheads force
large-scale applications to reduce checkpoint
frequency, which means more compute time is lost in the
event of failure. We alleviate this problem through a
scalable checkpoint-restart system, mcrEngine.
mcrEngine aggregates checkpoints from multiple
application processes with knowledge of the data
semantics available through widely-used I/O libraries,
e.g., HDF5 and netCDF, and compresses them. Our novel
scheme improves compressibility of checkpoints up to
115\% over simple concatenation and compression. Our
evaluation with large-scale application checkpoints
shows that mcrEngine reduces checkpointing overhead by
up to 87\% and restart overhead by up to 62\% over a
baseline with no aggregation or compression.",
acknowledgement = ack-nhfb,
articleno = "17",
}
@InProceedings{Riesen:2012:ASI,
author = "Rolf Riesen and Kurt Ferreira and Dilma {Da Silva} and
Pierre Lemarinier and Dorian Arnold and Patrick G.
Bridges",
title = "Alleviating scalability issues of checkpointing
protocols",
crossref = "Hollingsworth:2012:SPI",
pages = "18:1--18:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a021.pdf",
abstract = "Current fault tolerance protocols are not sufficiently
scalable for the exascale era. The most widely used
method, coordinated checkpointing, places enormous
demands on the I/O subsystem and imposes frequent
synchronizations. Uncoordinated protocols use message
logging which introduces message rate limitations or
undesired memory and storage requirements to hold
payload and event logs. In this paper we propose a
combination of several techniques, namely coordinated
checkpointing, optimistic message logging, and a
protocol that glues them together. This combination
eliminates some of the drawbacks of each individual
approach and proves to be an alternative for many types
of exascale applications. We evaluate performance and
scaling characteristics of this combination using
simulation and a partial implementation. While not a
universal solution, the combined protocol is suitable
for a large range of existing and future applications
that use coordinated checkpointing and enhances their
scalability.",
acknowledgement = ack-nhfb,
articleno = "18",
}
@InProceedings{Sato:2012:DMN,
author = "Kento Sato and Naoya Maruyama and Kathryn Mohror and
Adam Moody and Todd Gamblin and Bronis R. de Supinski
and Satoshi Matsuoka",
title = "Design and modeling of a non-blocking checkpointing
system",
crossref = "Hollingsworth:2012:SPI",
pages = "19:1--19:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a022.pdf",
abstract = "As the capability and component count of systems
increase, the MTBF decreases. Typically, applications
tolerate failures with checkpoint/restart to a parallel
file system (PFS). While simple, this approach can
suffer from contention for PFS resources. Multi-level
checkpointing is a promising solution. However, while
multi-level checkpointing is successful on today's
machines, it is not expected to be sufficient for
exascale class machines, which are predicted to have
orders of magnitude larger memory sizes and failure
rates. Our solution combines the benefits of
non-blocking and multi-level checkpointing. In this
paper, we present the design of our system and model
its performance. Our experiments show that our system
can improve efficiency by 1.1 to 2.0x on future
machines. Additionally, applications using our
checkpointing system can achieve high efficiency even
when using a PFS with lower bandwidth.",
acknowledgement = ack-nhfb,
articleno = "19",
}
@InProceedings{Papaioannou:2012:SAS,
author = "Thanasis G. Papaioannou and Nicolas Bonvin and Karl
Aberer",
title = "{Scalia}: an adaptive scheme for efficient multi-cloud
storage",
crossref = "Hollingsworth:2012:SPI",
pages = "20:1--20:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a026.pdf",
abstract = "A growing amount of data is produced daily resulting
in a growing demand for storage solutions. While cloud
storage providers offer a virtually infinite storage
capacity, data owners seek geographical and provider
diversity in data placement, in order to avoid vendor
lock-in and to increase availability and durability.
Moreover, depending on the customer data access
pattern, a certain cloud provider may be cheaper than
another. In this paper, we introduce Scalia, a cloud
storage brokerage solution that continuously adapts the
placement of data based on its access pattern and
subject to optimization objectives, such as storage
costs. Scalia efficiently considers repositioning of
only selected objects that may significantly lower the
storage cost. By extensive simulation experiments, we
prove the cost-effectiveness of Scalia against static
placements and its proximity to the ideal data
placement in various scenarios of data access patterns,
of available cloud storage solutions and of failures.",
acknowledgement = ack-nhfb,
articleno = "20",
}
@InProceedings{Di:2012:HLP,
author = "Sheng Di and Derrick Kondo and Walfredo Cirne",
title = "Host load prediction in a {Google} compute cloud with
a {Bayesian} model",
crossref = "Hollingsworth:2012:SPI",
pages = "21:1--21:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a025.pdf",
abstract = "Prediction of host load in Cloud systems is critical
for achieving service-level agreements. However,
accurate prediction of host load in Clouds is extremely
challenging because it fluctuates drastically at small
timescales. We design a prediction method based on a
Bayes model to predict the mean load over a long-term
time interval, as well as the mean load in consecutive
future time intervals. We identify novel predictive
features of host load that capture the expectation,
predictability, trends and patterns of host load. We
also determine the most effective combinations of these
features for prediction. We evaluate our method using a
detailed one-month trace of a Google data center with
thousands of machines. Experiments show that the Bayes
method achieves high accuracy with a mean squared error
of 0.0014. Moreover, the Bayes method improves the load
prediction accuracy by 5.6--50\% compared to other
state-of-the-art methods based on moving averages,
auto-regression, and/or noise filters.",
acknowledgement = ack-nhfb,
articleno = "21",
}
@InProceedings{Malawski:2012:CDC,
author = "Maciej Malawski and Gideon Juve and Ewa Deelman and
Jarek Nabrzyski",
title = "Cost- and deadline-constrained provisioning for
scientific workflow ensembles in {IaaS} clouds",
crossref = "Hollingsworth:2012:SPI",
pages = "22:1--22:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a024.pdf",
abstract = "Large-scale applications expressed as scientific
workflows are often grouped into ensembles of
inter-related workflows. In this paper, we address a
new and important problem concerning the efficient
management of such ensembles under budget and deadline
constraints on Infrastructure-as-a-Service (IaaS)
clouds. We discuss, develop, and assess algorithms
based on static and dynamic strategies for both task
scheduling and resource provisioning. We perform the
evaluation via simulation using a set of scientific
workflow ensembles with a broad range of budget and
deadline parameters, taking into account uncertainties
in task runtime estimations, provisioning delays, and
failures. We find that the key factor determining the
performance of an algorithm is its ability to decide
which workflows in an ensemble to admit or reject for
execution. Our results show that an admission procedure
based on workflow structure and estimates of task
runtimes can significantly improve the quality of
solutions.",
acknowledgement = ack-nhfb,
articleno = "22",
}
@InProceedings{Lee:2012:EED,
author = "Seyong Lee and Jeffrey S. Vetter",
title = "Early evaluation of directive-based {GPU} programming
models for productive exascale computing",
crossref = "Hollingsworth:2012:SPI",
pages = "23:1--23:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a051.pdf",
abstract = "Graphics Processing Unit (GPU)-based parallel computer
architectures have shown increased popularity as a
building block for high performance computing, and
possibly for future Exascale computing. However, their
programming complexity remains a major hurdle for
their widespread adoption. To provide better
abstractions for programming GPU architectures,
researchers and vendors have proposed several
directive-based GPU programming models. These
directive-based models provide different levels of
abstraction, and require different levels of
programming effort to port and optimize applications.
Understanding the differences among these new models
provides valuable insights on their applicability and
performance potential. In this paper, we evaluate
existing directive-based models by porting thirteen
application kernels from various scientific domains to
use CUDA GPUs, which, in turn, allows us to identify
important issues in the functionality, scalability,
tunability, and debuggability of the existing models.
Our evaluation shows that directive-based models can
achieve reasonable performance, compared to
hand-written GPU codes.",
acknowledgement = ack-nhfb,
articleno = "23",
}
@InProceedings{Pienaar:2012:AGS,
author = "Jacques A. Pienaar and Srimat Chakradhar and Anand
Raghunathan",
title = "Automatic generation of software pipelines for
heterogeneous parallel systems",
crossref = "Hollingsworth:2012:SPI",
pages = "24:1--24:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a049.pdf",
abstract = "Pipelining is a well-known approach to increasing
parallelism and performance. We address the problem of
software pipelining for heterogeneous parallel
platforms that consist of different multi-core and
many-core processing units. In this context, pipelining
involves two key steps---partitioning an application
into stages and mapping and scheduling the stages onto
the processing units of the heterogeneous platform. We
show that the inter-dependency between these steps is a
critical challenge that must be addressed in order to
achieve high performance. We propose an Automatic
Heterogeneous Pipelining framework (AHP) that generates
an optimized pipelined implementation of a program from
an annotated unpipelined specification. Across three
complex applications (image classification, object
detection, and document retrieval) and two
heterogeneous platforms (Intel Xeon multi-core CPUs
with Intel MIC and NVIDIA GPGPU accelerators), AHP
achieves a throughput improvement of up to 1.53x (1.37x
on average) over a heterogeneous baseline that exploits
data and task parallelism.",
acknowledgement = ack-nhfb,
articleno = "24",
}
@InProceedings{Chen:2012:AMC,
author = "Linchuan Chen and Xin Huo and Gagan Agrawal",
title = "Accelerating {MapReduce} on a coupled {CPU--GPU}
architecture",
crossref = "Hollingsworth:2012:SPI",
pages = "25:1--25:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a050.pdf",
abstract = "The work presented here is driven by two observations.
First, heterogeneous architectures that integrate a CPU
and a GPU on the same chip are emerging, and hold much
promise for supporting power-efficient and scalable
high performance computing. Second, MapReduce has
emerged as a suitable framework for simplified parallel
application development for many classes of
applications, including data mining and machine
learning applications that benefit from accelerators.
This paper focuses on the challenge of scaling a
MapReduce application using the CPU and GPU together in
an integrated architecture. We develop different
methods for dividing the work, which are the
map-dividing scheme, where map tasks are divided
between both devices, and the pipelining scheme, which
pipelines the map and the reduce stages on different
devices. We develop dynamic work distribution schemes
for both approaches. To achieve high load balance
while keeping scheduling costs low, we use a runtime
tuning method to adjust task block sizes for the
map-dividing scheme. Our implementation of MapReduce is
based on a continuous reduction method, which avoids
the memory overheads of storing key-value pairs. We
have evaluated the different design decisions using 5
popular MapReduce applications. For 4 of the
applications, our system achieves 1.21 to 2.1 speedup
over the better of the CPU-only and GPU-only versions.
The speedups over a single CPU core execution range
from 3.25 to 28.68. The runtime tuning method we have
developed achieves very low load imbalance, while
keeping scheduling overheads low. Though our current
work is specific to MapReduce, many underlying ideas
are also applicable towards intra-node acceleration of
other applications on integrated CPU-GPU nodes.",
acknowledgement = ack-nhfb,
articleno = "25",
}
@InProceedings{Igual:2012:UHP,
author = "Francisco D. Igual and Murtaza Ali and Arnon Friedmann
and Eric Stotzer and Timothy Wentz and Robert A. van de
Geijn",
title = "Unleashing the high performance and low power of
multi-core {DSPs} for general-purpose {HPC}",
crossref = "Hollingsworth:2012:SPI",
pages = "26:1--26:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a070.pdf",
abstract = "Take a multicore Digital Signal Processor (DSP) chip
designed for cellular base stations and radio network
controllers, add floating-point capabilities to support
4G networks, and out of thin air an HPC engine is born.
The potential for HPC is clear: It promises 128 GFLOPS
(single precision) for 10 Watts; It is used in millions
of network related devices and hence benefits from
economies of scale; It should be simpler to program
than a GPU. Simply put, it is fast, green, and cheap.
But is it easy to use? In this paper, we show how this
potential can be applied to general-purpose high
performance computing, more specifically to dense
matrix computations, without major changes in existing
codes and methodologies, and with excellent performance
and power consumption numbers.",
acknowledgement = ack-nhfb,
articleno = "26",
}
@InProceedings{Chang:2012:SNS,
author = "Li-Wen Chang and John A. Stratton and Hee-Seok Kim and
Wen-Mei W. Hwu",
title = "A scalable, numerically stable, high-performance
tridiagonal solver using {GPUs}",
crossref = "Hollingsworth:2012:SPI",
pages = "27:1--27:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a071.pdf",
abstract = "In this paper, we present a scalable, numerically
stable, high-performance tridiagonal solver. The solver
is based on the SPIKE algorithm for partitioning a
large matrix into small independent matrices, which can
be solved in parallel. For each small matrix, our
solver applies a general 1-by-1 or 2-by-2 diagonal
pivoting algorithm, which is also known to be
numerically stable. Our paper makes two major
contributions. First, our solver is the first
numerically stable tridiagonal solver for GPUs. Our
solver provides comparable quality of stable solutions
to Intel MKL and Matlab, at speed comparable to the GPU
tridiagonal solvers in existing packages like CUSPARSE.
It is also scalable to multiple GPUs and CPUs. Second,
we present and analyze two key optimization strategies
for our solver: a high-throughput data layout
transformation for memory efficiency, and a dynamic
tiling approach for reducing the memory access
footprint caused by branch divergence.",
acknowledgement = ack-nhfb,
articleno = "27",
}
@InProceedings{Park:2012:EBB,
author = "Jongsoo Park and Ping Tak Peter Tang and Mikhail
Smelyanskiy and Daehyun Kim and Thomas Benson",
title = "Efficient backprojection-based synthetic aperture
radar computation with many-core processors",
crossref = "Hollingsworth:2012:SPI",
pages = "28:1--28:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a072.pdf",
abstract = "Tackling computationally challenging problems with
high efficiency often requires the combination of
algorithmic innovation, advanced architecture, and
thorough exploitation of parallelism. We demonstrate
this synergy through synthetic aperture radar (SAR) via
backprojection, an image reconstruction method that can
require hundreds of TFLOPS. Computation cost is
significantly reduced by our new algorithm of
approximate strength reduction; data movement cost is
economized by software locality optimizations
facilitated by advanced architecture support;
parallelism is fully harnessed in various patterns and
granularities. We deliver over 35 billion
backprojections per second throughput per compute node
on an Intel\reg{} Xeon\reg{} E5-2670-based cluster,
equipped with Intel\reg{} Xeon Phi\TM{} coprocessors. This
corresponds to processing a 3K x 3K image within a
second using a single node. Our study can be extended
to other settings: backprojection is applicable
elsewhere including medical imaging, approximate
strength reduction is a general code transformation
technique, and many-core processors are emerging as a
solution to energy-efficient computing.",
acknowledgement = ack-nhfb,
articleno = "28",
}
@InProceedings{Li:2012:PFA,
author = "Peng Li and Guodong Li and Ganesh Gopalakrishnan",
title = "Parametric flows: automated behavior equivalencing for
symbolic analysis of races in {CUDA} programs",
crossref = "Hollingsworth:2012:SPI",
pages = "29:1--29:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a009.pdf",
abstract = "The growing scale of concurrency requires automated
abstraction techniques to cut down the effort in
concurrent system analysis. In this paper, we show that
the high degree of behavioral symmetry present in GPU
programs allows CUDA race detection to be dramatically
simplified through abstraction. Our abstraction
technique is that of automatically creating parametric
flows --- control-flow equivalence classes of threads
that diverge in the same manner --- and checking for data
races only across a pair of threads per parametric
flow. We have implemented this approach as an extension
of our recently proposed GKLEE symbolic analysis
framework and show that all our previous results are
dramatically improved in that (i) the parametric
flow-based analysis takes far less time, and (ii)
because of the much higher scalability of the analysis,
we can detect even more data race situations that were
previously missed by GKLEE because it was forced to
downscale examples to limit analysis complexity.
Moreover, the parametric flow-based analysis is
applicable to other programs with SPMD models.",
acknowledgement = ack-nhfb,
articleno = "29",
}
@InProceedings{Hilbrich:2012:MRE,
author = "Tobias Hilbrich and Joachim Protze and Martin Schulz
and Bronis R. de Supinski and Matthias S. M{\"u}ller",
title = "{MPI} runtime error detection with {MUST}: advances in
deadlock detection",
crossref = "Hollingsworth:2012:SPI",
pages = "30:1--30:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a010.pdf",
abstract = "The widely used Message Passing Interface (MPI) is
complex and rich. As a result, application developers
require automated tools to avoid and to detect MPI
programming errors. We present the Marmot Umpire
Scalable Tool (MUST) that detects such errors with
significantly increased scalability. We present
improvements to our graph-based deadlock detection
approach for MPI, which cover future MPI extensions.
Our enhancements also check complex MPI constructs that
no previous graph-based detection approach handled
correctly. Finally, we present optimizations for the
processing of MPI operations that reduce runtime
deadlock detection overheads. Existing approaches often
require $O(p)$ analysis time per MPI operation, for $p$
processes. We empirically observe that our improvements
lead to sub-linear or better analysis time per
operation for a wide range of real world
applications.",
acknowledgement = ack-nhfb,
articleno = "30",
}
@InProceedings{Bhatele:2012:NVP,
author = "Abhinav Bhatele and Todd Gamblin and Katherine E.
Isaacs and Brian T. N. Gunney and Martin Schulz and
Peer-Timo Bremer and Bernd Hamann",
title = "Novel views of performance data to analyze large-scale
adaptive applications",
crossref = "Hollingsworth:2012:SPI",
pages = "31:1--31:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a011.pdf",
abstract = "Performance analysis of parallel scientific codes is
becoming increasingly difficult due to the rapidly
growing complexity of applications and architectures.
Existing tools fall short in providing intuitive views
that facilitate the process of performance debugging
and tuning. In this paper, we extend recent ideas of
projecting and visualizing performance data for faster,
more intuitive analysis of applications. We collect
detailed per-level and per-phase measurements for a
dynamically load-balanced, structured AMR library and
project per-core data collected in the hardware domain
on to the application's communication topology. We show
how our projections and visualizations lead to a rapid
diagnosis of and mitigation strategy for a previously
elusive scaling bottleneck in the library that is hard
to detect using conventional tools. Our new insights
have resulted in a 22\% performance improvement for a
65,536-core run of the AMR library on an IBM Blue
Gene/P system.",
acknowledgement = ack-nhfb,
articleno = "31",
}
@InProceedings{Wu:2012:RRA,
author = "Donghong Wu and Bingsheng He and Xueyan Tang and
Jianliang Xu and Minyi Guo",
title = "{RAMZzz}: rank-aware {DRAM} power management with
dynamic migrations and demotions",
crossref = "Hollingsworth:2012:SPI",
pages = "32:1--32:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a042.pdf",
abstract = "Main memory is a significant energy consumer which may
contribute to over 40\% of the total system power, and
will become more significant for server machines with
more main memory. In this paper, we propose a novel
memory system design named RAMZzz with rank-aware
energy saving optimizations. Specifically, we rely on a
memory controller to monitor the memory access
locality, and group the pages with similar access
locality into the same rank. We further develop dynamic
page migrations to adapt to data access patterns, and a
prediction model to estimate the demotion time for
accurate control on power state transitions. We
experimentally compare our algorithm with other energy
saving policies with cycle-accurate simulation.
Experiments with benchmark workloads show that RAMZzz
achieves significant improvement on energy-delay and
energy consumption over other power saving
techniques.",
acknowledgement = ack-nhfb,
articleno = "32",
}
@InProceedings{Li:2012:MAG,
author = "Sheng Li and Doe Hyun Yoon and Ke Chen and Jishen Zhao
and Jung Ho Ahn and Jay B. Brockman and Yuan Xie and
Norman P. Jouppi",
title = "{MAGE}: adaptive granularity and {ECC} for resilient
and power efficient memory systems",
crossref = "Hollingsworth:2012:SPI",
pages = "33:1--33:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a041.pdf",
abstract = "Resiliency is one of the toughest challenges in
high-performance computing, and memory accounts for a
significant fraction of errors. Providing strong error
tolerance in memory usually requires a wide memory
channel that incurs a large access granularity (hence,
a large cache line). Unfortunately, applications with
limited spatial locality waste memory power and
bandwidth on systems with a large access granularity.
Thus, careful design considerations must be made to
balance memory system performance, power efficiency,
and resiliency. In this paper, we propose MAGE, a
Memory system with Adaptive Granularity and ECC, to
achieve high performance, power efficiency, and
resiliency. MAGE can adapt memory access granularities
and ECC schemes to applications with different memory
behaviors. Our experiments show that MAGE achieves more
than a 28\% energy-delay product improvement, compared
to the best existing systems with static granularity
and ECC.",
acknowledgement = ack-nhfb,
articleno = "33",
}
@InProceedings{Ren:2012:PWA,
author = "Yufei Ren and Tan Li and Dantong Yu and Shudong Jin
and Thomas Robertazzi and Brian L. Tierney and Eric
Pouyoul",
title = "Protocols for wide-area data-intensive applications:
design and performance issues",
crossref = "Hollingsworth:2012:SPI",
pages = "34:1--34:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a059.pdf",
abstract = "Providing high-speed data transfer is vital to various
data-intensive applications. While there have been
remarkable technology advances to provide
ultra-high-speed network bandwidth, existing protocols
and applications may not be able to fully utilize the
bare-metal bandwidth due to their inefficient design.
We identify that the same problem remains in the field of
Remote Direct Memory Access (RDMA) networks. RDMA
offloads TCP/IP protocols to hardware devices. However,
its benefits have not been fully exploited due to the
lack of efficient software and application protocols,
in particular in wide-area networks. In this paper, we
address the design choices to develop such protocols.
We describe a protocol implemented as part of a
communication middleware. The protocol has its own flow
control, connection management, and task
synchronization. It maximizes the parallelism of RDMA
operations. We demonstrate its performance benefit on
various local and wide-area testbeds, including the DOE
ANI testbed with RoCE links and InfiniBand links.",
acknowledgement = ack-nhfb,
articleno = "34",
}
@InProceedings{Islam:2012:HPR,
author = "N. S. Islam and M. W. Rahman and J. Jose and R.
Rajachandrasekar and H. Wang and H. Subramoni and C.
Murthy and D. K. Panda",
title = "High performance {RDMA}-based design of {HDFS} over
{InfiniBand}",
crossref = "Hollingsworth:2012:SPI",
pages = "35:1--35:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a058.pdf",
abstract = "Hadoop Distributed File System (HDFS) acts as the
primary storage of Hadoop and has been adopted by
reputed organizations (Facebook, Yahoo! etc.) due to
its portability and fault-tolerance. The existing
implementation of HDFS uses the Java socket interface for
communication, which delivers suboptimal performance in
terms of latency and throughput. For data-intensive
applications, network performance becomes a key component
as the amount of data being stored and replicated to
HDFS increases. In this paper, we present a novel
design of HDFS using Remote Direct Memory Access (RDMA)
over InfiniBand via JNI interfaces. Experimental
results show that, for 5GB HDFS file writes, the new
design reduces the communication time by 87\% and 30\%
over 1Gigabit Ethernet (1GigE) and IP-over-InfiniBand
(IPoIB), respectively, on a QDR platform (32 Gbps). For
HBase, the Put operation performance is improved by
26\% with our design. To the best of our knowledge,
this is the first design of HDFS over InfiniBand
networks.",
acknowledgement = ack-nhfb,
articleno = "35",
}
@InProceedings{Dichev:2012:ERN,
author = "Kiril Dichev and Fergal Reid and Alexey Lastovetsky",
title = "Efficient and reliable network tomography in
heterogeneous networks using {BitTorrent} broadcasts
and clustering algorithms",
crossref = "Hollingsworth:2012:SPI",
pages = "36:1--36:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a060.pdf",
abstract = "In the area of network performance and discovery,
network tomography focuses on reconstructing network
properties using only end-to-end measurements at the
application layer. One challenging problem in network
tomography is reconstructing available bandwidth along
all links during multiple source/multiple destination
transmissions. The traditional measurement procedures
used for bandwidth tomography are extremely time
consuming. We propose a novel solution to this problem.
Our method counts the fragments exchanged during a
BitTorrent broadcast. While this measurement has a high
level of randomness, it can be obtained very
efficiently, and aggregated into a reliable metric.
This data is then analyzed with state-of-the-art
algorithms, which correctly reconstruct logical
clusters of nodes interconnected by high bandwidth, as
well as bottlenecks between these logical clusters. Our
experiments demonstrate that the proposed two-phase
approach efficiently solves the presented problem for a
number of settings on a complex grid infrastructure.",
acknowledgement = ack-nhfb,
articleno = "36",
}
@InProceedings{Malakar:2012:DCS,
author = "Preeti Malakar and Thomas George and Sameer Kumar and
Rashmi Mittal and Vijay Natarajan and Yogish Sabharwal
and Vaibhav Saxena and Sathish S. Vadhiyar",
title = "A divide and conquer strategy for scaling weather
simulations with multiple regions of interest",
crossref = "Hollingsworth:2012:SPI",
pages = "37:1--37:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a105.pdf",
abstract = "Accurate and timely prediction of weather phenomena,
such as hurricanes and flash floods, requires
high-fidelity, compute-intensive simulations of multiple
finer regions of interest within a coarse simulation
domain. Current weather applications execute these
nested simulations sequentially using all the available
processors, which is sub-optimal due to their
sub-linear scalability. In this work, we present a
strategy for parallel execution of multiple nested
domain simulations based on partitioning the 2-D
processor grid into disjoint rectangular regions
associated with each domain. We propose a novel
combination of performance prediction, processor
allocation methods and topology-aware mapping of the
regions on torus interconnects. Experiments on IBM Blue
Gene systems using WRF show that the proposed
strategies result in performance improvement of up to
33\% with topology-oblivious mapping and up to
an additional 7\% with topology-aware mapping over the
default sequential strategy.",
acknowledgement = ack-nhfb,
articleno = "37",
}
@InProceedings{Rietmann:2012:FAS,
author = "Max Rietmann and Peter Messmer and Tarje Nissen-Meyer
and Daniel Peter and Piero Basini and Dimitri
Komatitsch and Olaf Schenk and Jeroen Tromp and Lapo
Boschi and Domenico Giardini",
title = "Forward and adjoint simulations of seismic wave
propagation on emerging large-scale {GPU}
architectures",
crossref = "Hollingsworth:2012:SPI",
pages = "38:1--38:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a104.pdf",
abstract = "Computational seismology is an area of wide
sociological and economic impact, ranging from
earthquake risk assessment to subsurface imaging and
oil and gas exploration. At the core of these
simulations is the modeling of wave propagation in a
complex medium. Here we report on the extension of the
high-order finite-element seismic wave simulation
package SPECFEM3D to support the largest scale hybrid
and homogeneous supercomputers. Starting from an
existing highly tuned MPI code, we migrated to a CUDA
version. In order to be of immediate impact to the
science mission of computational seismologists, we had
to port the entire production package, rather than just
individual kernels. One of the challenges in
parallelizing finite element codes is the potential for
race conditions during the assembly phase. We therefore
investigated different methods such as mesh coloring or
atomic updates on the GPU. In order to achieve strong
scaling, we needed to ensure good overlap of data
motion at all levels, including internode and
host-accelerator transfers. Finally we carefully tuned
the GPU implementation. The new MPI/CUDA solver
exhibits excellent scalability and achieves speedup on
a node-to-node basis over the carefully tuned
equivalent multi-core MPI solver. To demonstrate the
performance of both the forward and adjoint
functionality, we present two case studies run on the
Cray XE6 CPU and Cray XK6 GPU architectures up to 896
nodes: (1) focusing on most commonly used forward
simulations, we simulate seismic wave propagation
generated by earthquakes in Turkey, and (2) testing the
most complex seismic inversion type of the package, we
use ambient seismic noise to image 3-D crust and mantle
structure beneath western Europe.",
acknowledgement = ack-nhfb,
articleno = "38",
}
@InProceedings{Nguyen:2012:BTM,
author = "Tan Nguyen and Pietro Cicotti and Eric Bylaska and Dan
Quinlan and Scott B. Baden",
title = "{Bamboo}: translating {MPI} applications to a
latency-tolerant, data-driven form",
crossref = "Hollingsworth:2012:SPI",
pages = "39:1--39:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a032.pdf",
abstract = "We present Bamboo, a custom source-to-source
translator that transforms MPI C source into a
data-driven form that automatically overlaps
communication with available computation. Running on up
to 98304 processors of NERSC's Hopper system, we
observe that Bamboo's overlap capability speeds up MPI
implementations of a 3D Jacobi iterative solver and
Cannon's matrix multiplication. Bamboo's generated code
meets or exceeds the performance of hand optimized MPI,
which includes split-phase coding, the method
classically employed to hide communication. We achieved
our results with only modest amounts of programmer
annotation and no intrusive reprogramming of the
original application source.",
acknowledgement = ack-nhfb,
articleno = "39",
}
@InProceedings{Bandishti:2012:TSC,
author = "Vinayaka Bandishti and Irshad Pananilath and Uday
Bondhugula",
title = "Tiling stencil computations to maximize parallelism",
crossref = "Hollingsworth:2012:SPI",
pages = "40:1--40:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a031.pdf",
abstract = "Most stencil computations allow tile-wise concurrent
start, i.e., there always exists a face of the
iteration space and a set of tiling hyperplanes such
that all tiles along that face can be started
concurrently. This provides load balance and maximizes
parallelism. However, existing automatic tiling
frameworks often choose hyperplanes that lead to
pipelined start-up and load imbalance. We address this
issue with a new tiling technique that ensures
concurrent start-up as well as perfect load-balance
whenever possible. We first provide necessary and
sufficient conditions on tiling hyperplanes to enable
concurrent start for programs with affine data
accesses. We then provide an approach to find such
hyperplanes. Experimental evaluation on a 12-core Intel
Westmere shows that our code is able to outperform a
tuned domain-specific stencil code generator by 4\% to
27\%, and previous compiler techniques by a factor of
2x to 10.14x.",
acknowledgement = ack-nhfb,
articleno = "40",
}
@InProceedings{Ding:2012:CDF,
author = "Wei Ding and Yuanrui Zhang and Mahmut Kandemir and
Seung Woo Son",
title = "Compiler-directed file layout optimization for
hierarchical storage systems",
crossref = "Hollingsworth:2012:SPI",
pages = "41:1--41:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a030.pdf",
abstract = "File layout of array data is a critical factor that
affects the behavior of storage caches, and has so far
received little attention in the context of hierarchical
storage systems. The main contribution of this paper is
a compiler-driven file layout optimization scheme for
hierarchical storage caches. This approach, fully
automated within an optimizing compiler, analyzes a
multi-threaded application code and determines a file
layout for each disk-resident array referenced by the
code, such that the performance of the target storage
cache hierarchy is maximized. We tested our approach
using 16 I/O intensive application programs and
compared its performance against two previously
proposed approaches under different cache space
management schemes. Our experimental results show that
the proposed approach improves the execution time of
these parallel applications by 23.7\% on average.",
acknowledgement = ack-nhfb,
articleno = "41",
}
@InProceedings{Tang:2012:FLC,
author = "Ping Tak Peter Tang and Jongsoo Park and Daehyun Kim
and Vladimir Petrov",
title = "A framework for low-communication {$1$-D} {FFT}",
crossref = "Hollingsworth:2012:SPI",
pages = "42:1--42:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a043.pdf",
abstract = "In high-performance computing on distributed-memory
systems, communication often represents a significant
part of the overall execution time. The relative cost
of communication will certainly continue to rise as
compute-density growth follows the current technology
and industry trends. Design of lower-communication
alternatives to fundamental computational algorithms
has become an important field of research. For
distributed 1-D FFT, communication cost has hitherto
remained high as all industry-standard implementations
perform three all-to-all internode data exchanges (also
called global transposes). These communication steps
indeed dominate execution time. In this paper, we
present a mathematical framework from which many
single-all-to-all and easy-to-implement 1-D FFT
algorithms can be derived. For large-scale problems,
our implementation can be twice as fast as leading FFT
libraries on state-of-the-art computer clusters.
Moreover, our framework allows tradeoff between
accuracy and performance, further boosting performance
if reduced accuracy is acceptable.",
acknowledgement = ack-nhfb,
articleno = "42",
}
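For context, the three global transposes mentioned in this abstract come from the classical Cooley--Tukey factorization of a length-$N$ DFT; the following is a sketch of that standard decomposition (not the paper's single-all-to-all framework). With $N = N_1 N_2$, input index $n = N_2 n_1 + n_2$, and output index $k = k_1 + N_1 k_2$,
\[
X_{k_1 + N_1 k_2} \;=\; \sum_{n_2=0}^{N_2-1} e^{-2\pi i\, n_2 k_2 / N_2}
\left( e^{-2\pi i\, n_2 k_1 / N} \sum_{n_1=0}^{N_1-1} x_{N_2 n_1 + n_2}\, e^{-2\pi i\, n_1 k_1 / N_1} \right),
\]
so a distributed implementation performs $N_2$ FFTs of size $N_1$, a twiddle-factor scaling, and $N_1$ FFTs of size $N_2$; the data redistributions between these phases are the global transposes whose count the paper's framework reduces to one.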
@InProceedings{Sundar:2012:PGA,
author = "Hari Sundar and George Biros and Carsten Burstedde and
Johann Rudi and Omar Ghattas and Georg Stadler",
title = "Parallel geometric-algebraic multigrid on unstructured
forests of octrees",
crossref = "Hollingsworth:2012:SPI",
pages = "43:1--43:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a045.pdf",
abstract = "We present a parallel multigrid method for solving
variable-coefficient elliptic partial differential
equations on arbitrary geometries using highly adapted
meshes. Our method is designed for meshes that are
built from an unstructured hexahedral macro mesh, in
which each macro element is adaptively refined as an
octree. This forest-of-octrees approach enables us to
generate meshes for complex geometries with arbitrary
levels of local refinement. We use geometric multigrid
(GMG) for each of the octrees and algebraic multigrid
(AMG) as the coarse grid solver. We designed our GMG
sweeps to entirely avoid collectives, thus minimizing
communication cost. We present weak and strong scaling
results for the 3D variable-coefficient Poisson problem
that demonstrate high parallel scalability. As a
highlight, the largest problem we solve is on a
non-uniform mesh with 100 billion unknowns on 262,144
cores of NCCS's Cray XK6 ``Jaguar''; in this solve we
sustain 272 Tflop/s.",
acknowledgement = ack-nhfb,
articleno = "43",
}
@InProceedings{Nukada:2012:SMG,
author = "Akira Nukada and Kento Sato and Satoshi Matsuoka",
title = "Scalable multi-{GPU} {$3$-D} {FFT} for {TSUBAME 2.0}
supercomputer",
crossref = "Hollingsworth:2012:SPI",
pages = "44:1--44:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a044.pdf",
abstract = "For scalable 3-D FFT computation using multiple GPUs,
efficient all-to-all communication between GPUs is the
most important factor in good performance.
Implementations with point-to-point MPI library
functions and CUDA memory copy APIs typically exhibit
very large overheads especially for small message sizes
in all-to-all communications between many nodes. We
propose several schemes to minimize the overheads,
including employment of lower-level API of InfiniBand
to effectively overlap intra- and inter-node
communication, as well as auto-tuning strategies to
control scheduling and determine rail assignments. As a
result we achieve very good strong scalability as well
as good performance, up to 4.8TFLOPS using 256 nodes of
TSUBAME 2.0 Supercomputer (768 GPUs) in double
precision.",
acknowledgement = ack-nhfb,
articleno = "44",
}
@InProceedings{Doi:2012:PSL,
author = "Jun Doi",
title = "Peta-scale lattice quantum chromodynamics on a {Blue
Gene/Q} supercomputer",
crossref = "Hollingsworth:2012:SPI",
pages = "45:1--45:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a068.pdf",
abstract = "Lattice Quantum Chromodynamics (QCD) is one of the
most challenging applications running on massively
parallel supercomputers. To reproduce these physical
phenomena on a supercomputer, a precise simulation is
demanded, requiring well-optimized and scalable code. We
have optimized lattice QCD programs on Blue Gene family
supercomputers and shown their strength in lattice QCD
simulation. Here we optimize for the third-generation
Blue Gene/Q supercomputer: (i) by changing the data
layout, (ii) by exploiting new SIMD instruction sets,
and (iii) by pipelining boundary data exchange to
overlap communication and calculation. The optimized
lattice QCD program shows excellent weak scalability on
the large scale Blue Gene/Q system, and with 16 racks
we sustained 1.08 Pflop/s, 32.1\% of the theoretical
peak performance, including the conjugate gradient
solver routines.",
acknowledgement = ack-nhfb,
articleno = "45",
}
@InProceedings{Sarje:2012:MPX,
author = "Abhinav Sarje and Xiaoye S. Li and Slim Chourou and
Elaine R. Chan and Alexander Hexemer",
title = "Massively parallel {X-ray} scattering simulations",
crossref = "Hollingsworth:2012:SPI",
pages = "46:1--46:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a067.pdf",
abstract = "Although present X-ray scattering techniques can
provide tremendous information on the nano-structural
properties of materials that are valuable in the design
and fabrication of energy-relevant nano-devices, a
primary challenge remains in the analyses of such data.
In this paper we describe a high-performance, flexible,
and scalable Grazing Incidence Small Angle X-ray
Scattering simulation algorithm and codes that we have
developed on multi-core/CPU and many-core/GPU clusters.
We discuss in detail our implementation, optimization
and performance on these platforms. Our results show
speedups of $\sim$125x on a Fermi GPU and $\sim$20x on a Cray XE6
24-core node, compared to a sequential CPU code, with
near linear scaling on multi-node clusters. To our
knowledge, this is the first GISAXS simulation code
that can flexibly compute scattered light intensities
in all spatial directions, allowing full reconstruction
of GISAXS patterns for any complex structure at
high resolution while reducing simulation times from
months to minutes.",
acknowledgement = ack-nhfb,
articleno = "46",
}
@InProceedings{Baker:2012:HPR,
author = "C. Baker and G. Davidson and T. M. Evans and S.
Hamilton and J. Jarrell and W. Joubert",
title = "High performance radiation transport simulations:
preparing for {Titan}",
crossref = "Hollingsworth:2012:SPI",
pages = "47:1--47:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a069.pdf",
abstract = "In this paper we describe the Denovo code system.
Denovo solves the six-dimensional, steady-state, linear
Boltzmann transport equation, of central importance to
nuclear technology applications such as reactor core
analysis (neutronics), radiation shielding, nuclear
forensics and radiation detection. The code features
multiple spatial differencing schemes, state-of-the-art
linear solvers, the Koch-Baker-Alcouffe (KBA)
parallel-wavefront sweep algorithm for inverting the
transport operator, a new multilevel energy
decomposition method scaling to hundreds of thousands
of processing cores, and a modern, novel code
architecture that supports straightforward integration
of new features. In this paper we discuss the
performance of Denovo on the 20+ petaflop ORNL
GPU-based system, Titan. We describe algorithms and
techniques used to exploit the capabilities of Titan's
heterogeneous compute node architecture and the
challenges of obtaining good parallel performance for
this sparse hyperbolic PDE solver containing inherently
sequential computations. Numerical results
demonstrating Denovo performance on early Titan
hardware are presented.",
acknowledgement = ack-nhfb,
articleno = "47",
}
@InProceedings{Jenkins:2012:BPL,
author = "John Jenkins and Eric R. Schendel and Sriram
Lakshminarasimhan and David A. {Boyuka II} and Terry
Rogers and Stephane Ethier and Robert Ross and Scott
Klasky and Nagiza F. Samatova",
title = "Byte-precision level of detail processing for variable
precision analytics",
crossref = "Hollingsworth:2012:SPI",
pages = "48:1--48:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a088.pdf",
abstract = "I/O bottlenecks in HPC applications are becoming a
more pressing problem as compute capabilities continue
to outpace I/O capabilities. While double-precision
simulation data often must be stored losslessly, the
loss of some of the fractional component may introduce
acceptably small errors to many types of scientific
analyses. Given this observation, we develop a
precision level of detail (APLOD) library, which
partitions double-precision datasets along user-defined
byte boundaries. APLOD parameterizes the analysis
accuracy-I/O performance tradeoff, bounds maximum
relative error, maintains I/O access patterns compared
to full precision, and operates with low overhead.
Using ADIOS as an I/O use-case, we show proportional
reduction in disk access time to the degree of
precision. Finally, we show the effects of partial
precision analysis on accuracy for operations such as
$k$-means and Fourier analysis, finding a strong
applicability for the use of varying degrees of
precision to reduce the cost of analyzing extreme-scale
data.",
acknowledgement = ack-nhfb,
articleno = "48",
}
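As a rough illustration of the byte-boundary partitioning idea described above (a hypothetical sketch, not the APLOD API; the helper name truncate_to_bytes is invented here), the following keeps only the k most-significant bytes of each IEEE-754 double and measures the relative error this introduces:

import numpy as np

def truncate_to_bytes(a, k):
    # Zero all but the k most-significant bytes of each float64 value.
    # On little-endian platforms byte 7 holds the sign/exponent bits, so the
    # low-order mantissa bytes are the ones dropped.
    b = np.ascontiguousarray(a, dtype=np.float64).copy().view(np.uint8).reshape(-1, 8)
    b[:, :8 - k] = 0
    return b.view(np.float64).reshape(a.shape)

data = np.random.default_rng(0).random(1_000_000) + 0.5   # keep values away from 0
for k in (2, 4, 6):
    approx = truncate_to_bytes(data, k)
    print(k, "bytes kept, max relative error", np.max(np.abs(approx - data) / data))

Keeping two bytes preserves the sign, the full exponent, and four mantissa bits, so the relative error stays below about $2^{-4}$, consistent with the bounded-relative-error property the abstract claims for byte-level truncation.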
@InProceedings{Bennett:2012:CST,
author = "Janine C. Bennett and Hasan Abbasi and Peer-Timo
Bremer and Ray Grout and Attila Gyulassy and Tong Jin
and Scott Klasky and Hemanth Kolla and Manish Parashar
and Valerio Pascucci and Philippe Pebay and David
Thompson and Hongfeng Yu and Fan Zhang and Jacqueline
Chen",
title = "Combining in-situ and in-transit processing to enable
extreme-scale scientific analysis",
crossref = "Hollingsworth:2012:SPI",
pages = "49:1--49:9",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a089.pdf",
abstract = "With the onset of extreme-scale computing, I/O
constraints make it increasingly difficult for
scientists to save a sufficient amount of raw
simulation data to persistent storage. One potential
solution is to change the data analysis pipeline from a
post-process centric to a concurrent approach based on
either in-situ or in-transit processing. In this
context computations are considered in-situ if they
utilize the primary compute resources, while in-transit
processing refers to offloading computations to a set
of secondary resources using asynchronous data
transfers. In this paper we explore the design and
implementation of three common analysis techniques
typically performed on large-scale scientific
simulations: topological analysis, descriptive
statistics, and visualization. We summarize algorithmic
developments, describe a resource scheduling system to
coordinate the execution of various analysis workflows,
and discuss our implementation using the DataSpaces and
ADIOS frameworks that support efficient data movement
between in-situ and in-transit computations. We
demonstrate the efficiency of our lightweight, flexible
framework by deploying it on the Jaguar XK6 to analyze
data generated by S3D, a massively parallel turbulent
combustion code. Our framework allows scientists
dealing with the data deluge at extreme scale to
perform analyses at increased temporal resolutions,
mitigate I/O costs, and significantly improve the time
to insight.",
acknowledgement = ack-nhfb,
articleno = "49",
}
@InProceedings{Kumar:2012:EDR,
author = "Sidharth Kumar and Venkatram Vishwanath and Philip
Carns and Joshua A. Levine and Robert Latham and
Giorgio Scorzelli and Hemanth Kolla and Ray Grout and
Robert Ross and Michael E. Papka and Jacqueline Chen
and Valerio Pascucci",
title = "Efficient data restructuring and aggregation for {I/O}
acceleration in {PIDX}",
crossref = "Hollingsworth:2012:SPI",
pages = "50:1--50:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a090.pdf",
abstract = "Hierarchical, multiresolution data representations
enable interactive analysis and visualization of
large-scale simulations. One promising application of
these techniques is to store high performance computing
simulation output in a hierarchical Z (HZ) ordering
that translates data from a Cartesian coordinate scheme
to a one-dimensional array ordered by locality at
different resolution levels. However, when the
dimensions of the simulation data are not an even power
of 2, parallel HZ ordering produces sparse memory and
network access patterns that inhibit I/O performance.
This work presents a new technique for parallel HZ
ordering of simulation datasets that restructures
simulation data into large (power of 2) blocks to
facilitate efficient I/O aggregation. We perform both
weak and strong scaling experiments using the S3D
combustion application on both Cray-XE6 (65,536 cores)
and IBM Blue Gene/P (131,072 cores) platforms. We
demonstrate that data can be written in hierarchical,
multiresolution format with performance competitive to
that of native data-ordering methods.",
acknowledgement = ack-nhfb,
articleno = "50",
}
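For readers unfamiliar with HZ indexing, the toy sketch below shows the plain Z (Morton) ordering it builds on, obtained by interleaving coordinate bits; PIDX's HZ ordering additionally regroups these indices by resolution level, which is omitted here, and the function name morton3d is an invention for this example.

def morton3d(x, y, z, bits=10):
    # Interleave `bits` low-order bits of each coordinate into one index,
    # so nearby grid points receive nearby 1-D indices.
    index = 0
    for i in range(bits):
        index |= ((x >> i) & 1) << (3 * i)
        index |= ((y >> i) & 1) << (3 * i + 1)
        index |= ((z >> i) & 1) << (3 * i + 2)
    return index

print(morton3d(3, 5, 1))   # 143
print(morton3d(3, 5, 2))   # 171

When the grid dimensions are not powers of two, the occupied indices become sparse, which is the access-pattern problem that the restructuring into power-of-two blocks described above is meant to address.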
@InProceedings{Kambadur:2012:MIB,
author = "Melanie Kambadur and Tipp Moseley and Rick Hank and
Martha A. Kim",
title = "Measuring interference between live datacenter
applications",
crossref = "Hollingsworth:2012:SPI",
pages = "51:1--51:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a036.pdf",
abstract = "Application interference is prevalent in datacenters
due to contention over shared hardware resources.
Unfortunately, understanding interference in live
datacenters is more difficult than in controlled
environments or on simpler architectures. Most
approaches to mitigating interference rely on data that
cannot be collected efficiently in a production
environment. This work exposes eight specific
complexities of live datacenters that constrain
measurement of interference. It then introduces new,
generic measurement techniques for analyzing
interference in the face of these challenges and
restrictions. We use the measurement techniques to
conduct the first large-scale study of application
interference in live production datacenter workloads.
Data is measured across 1000 12-core Google servers
observed to be running 1102 unique applications.
Finally, our work identifies several opportunities to
improve performance that use only the available data;
these opportunities are applicable to any datacenter.",
acknowledgement = ack-nhfb,
articleno = "51",
}
@InProceedings{Kaushik:2012:DCC,
author = "Rini T. Kaushik and Klara Nahrstedt",
title = "{T*}: a data-centric cooling energy costs reduction
approach for big data analytics cloud",
crossref = "Hollingsworth:2012:SPI",
pages = "52:1--52:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a037.pdf",
abstract = "Explosion in Big Data has led to a surge in extremely
large-scale Big Data analytics platforms, resulting in
burgeoning energy costs. Big Data compute model
mandates strong data-locality for computational
performance, and moves computations to data.
State-of-the-art cooling energy management techniques
rely on thermal-aware computational job
placement/migration and are inherently
data-placement-agnostic in nature. T* takes a novel,
data-centric approach to reduce cooling energy costs
and to ensure thermal-reliability of the servers. T*
is cognizant of the uneven thermal-profile and
differences in thermal-reliability-driven load
thresholds of the servers, and the differences in the
computational jobs' arrival rate, size, and evolution
life spans of the Big Data placed in the cluster. Based
on this knowledge, and coupled with its predictive file
models and insights, T* does proactive, thermal-aware
file placement, which implicitly results in
thermal-aware job placement in the Big Data analytics
compute model. Evaluation results with one-month long
real-world Big Data analytics production traces from
Yahoo! show up to 42\% reduction in the cooling energy
costs with T*, courtesy of its lower and more uniform
thermal-profile, and 9x better performance than the
state-of-the-art data-agnostic cooling techniques.",
acknowledgement = ack-nhfb,
articleno = "52",
}
@InProceedings{Ravi:2012:VVB,
author = "Vignesh T. Ravi and Michela Becchi and Gagan Agrawal
and Srimat Chakradhar",
title = "{ValuePack}: value-based scheduling framework for
{CPU--GPU} clusters",
crossref = "Hollingsworth:2012:SPI",
pages = "53:1--53:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a038.pdf",
abstract = "Heterogeneous computing nodes are becoming commonplace
today, and recent trends strongly indicate that
clusters, supercomputers, and cloud environments will
increasingly host more heterogeneous resources, with
some being massively parallel (e.g., GPU). With such
heterogeneous environments becoming common, it is
important to revisit scheduling problems for clusters
and cloud environments. In this paper, we formulate and
address the problem of value-driven scheduling of
independent jobs on heterogeneous clusters, which
captures both the urgency and relative priority of
jobs. Our overall scheduling goal is to maximize the
aggregate value or yield of all jobs. Exploiting the
portability available from the underlying programming
model, we propose four novel scheduling schemes that
can automatically and dynamically map jobs onto
heterogeneous resources. Additionally, to improve the
utilization of massively parallel resources, we also
propose heuristics to automatically decide when and
which jobs can share a single resource.",
acknowledgement = ack-nhfb,
articleno = "53",
}
@InProceedings{Preissl:2012:CSS,
author = "Robert Preissl and Theodore M. Wong and Pallab Datta
and Myron Flickner and Raghavendra Singh and Steven K.
Esser and William P. Risk and Horst D. Simon and
Dharmendra S. Modha",
title = "{Compass}: a scalable simulator for an architecture
for cognitive computing",
crossref = "Hollingsworth:2012:SPI",
pages = "54:1--54:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a085.pdf",
abstract = "Inspired by the function, power, and volume of the
organic brain, we are developing TrueNorth, a novel
modular, non-von Neumann, ultra-low power, compact
architecture. TrueNorth consists of a scalable network
of neurosynaptic cores, with each core containing
neurons, dendrites, synapses, and axons. To set sail
for TrueNorth, we developed Compass, a multi-threaded,
massively parallel functional simulator and a parallel
compiler that maps a network of long-distance pathways
in the macaque monkey brain to TrueNorth. We
demonstrate near-perfect weak scaling on a 16 rack
IBM\reg{} Blue Gene\reg{}/Q (262144 CPUs, 256 TB
memory), achieving an unprecedented scale of 256
million neurosynaptic cores containing 65 billion
neurons and 16 trillion synapses running only 388X
slower than real time with an average spiking rate of
8.1 Hz. By using emerging PGAS communication
primitives, we also demonstrate 2X better real-time
performance over MPI primitives on a 4 rack Blue Gene/P
(16384 CPUs, 16 TB memory).",
acknowledgement = ack-nhfb,
articleno = "54",
}
@InProceedings{Sun:2012:OFG,
author = "Yanhua Sun and Gengbin Zheng and Chao Mei and Eric J.
Bohm and James C. Phillips and Laxmikant V. Kal{\'e} and
Terry R. Jones",
title = "Optimizing fine-grained communication in a
biomolecular simulation application on {Cray XK6}",
crossref = "Hollingsworth:2012:SPI",
pages = "55:1--55:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a086.pdf",
abstract = "Achieving good scaling for fine-grained communication
intensive applications on modern supercomputers remains
challenging. In our previous work, we have shown that
such an application --- NAMD --- scales well on the
full Jaguar XT5 without long-range interactions; yet,
with them, the speedup falters beyond 64K cores.
Although the new Gemini interconnect on Cray XK6 has
improved network performance, the challenges remain,
and are likely to remain for other such networks as
well. We analyze communication bottlenecks in NAMD and
its CHARM++ runtime, using the Projections performance
analysis tool. Based on the analysis, we optimize the
runtime, built on the uGNI library for Gemini. We
present several techniques to improve the fine-grained
communication. Consequently, the performance of running
92,224-atom ApoA1 with GPUs on TitanDev is improved by
36\%. For 100-million-atom STMV, we improve upon the
prior Jaguar XT5 result of 26 ms/step to 13 ms/step
using 298,992 cores on Jaguar XK6.",
acknowledgement = ack-nhfb,
articleno = "55",
}
@InProceedings{Alexeev:2012:HSL,
author = "Yuri Alexeev and Ashutosh Mahajan and Sven Leyffer and
Graham Fletcher and Dmitri G. Fedorov",
title = "Heuristic static load-balancing algorithm applied to
the fragment molecular orbital method",
crossref = "Hollingsworth:2012:SPI",
pages = "56:1--56:13",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a087.pdf",
abstract = "In the era of petascale supercomputing, the importance
of load balancing is crucial. Although dynamic load
balancing is widespread, it is increasingly difficult
to implement effectively with thousands of processors
or more, prompting a second look at static
load-balancing techniques even though the optimal
allocation of tasks to processors is an NP-hard
problem. We propose a heuristic static load-balancing
algorithm, employing fitted benchmarking data, as an
alternative to dynamic load balancing. The problem of
allocating CPU cores to tasks is formulated as a
mixed-integer nonlinear optimization problem, which is
solved by using an optimization solver. On 163,840
cores of Blue Gene/P, we achieved a parallel efficiency
of 80\% for an execution of the fragment molecular
orbital method applied to model protein-ligand
complexes quantum-mechanically. The obtained allocation
is shown to outperform dynamic load balancing by at
least a factor of 2, thus motivating the use of this
approach on other coarse-grained applications.",
acknowledgement = ack-nhfb,
articleno = "56",
}
@InProceedings{Li:2012:CSE,
author = "Dong Li and Jeffrey S. Vetter and Weikuan Yu",
title = "Classifying soft error vulnerabilities in
extreme-scale scientific applications using a binary
instrumentation tool",
crossref = "Hollingsworth:2012:SPI",
pages = "57:1--57:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a097.pdf",
abstract = "Extreme-scale scientific applications are at a
significant risk of being hit by soft errors on
supercomputers as the scale of these systems and the
component density continue to increase. In order to
better understand the specific soft error
vulnerabilities in scientific applications, we have
built an empirical fault injection and consequence
analysis tool --- BIFIT --- that allows us to evaluate
how soft errors impact applications. In particular,
BIFIT is designed with capability to inject faults at
very specific targets: an arbitrarily-chosen execution
point and any specific data structure. We apply BIFIT
to three mission-critical scientific applications and
investigate the applications' vulnerability to soft
errors by performing thousands of statistical tests.
We then classify each application's individual data
structures based on their sensitivity to these
vulnerabilities, and generalize these classifications
across applications. Subsequently, these
classifications can be used to apply appropriate
resiliency solutions to each data structure within an
application. Our study reveals that these scientific
applications have a wide range of sensitivities to both
the time and the location of a soft error; yet, we are
able to identify intrinsic relationships between
application vulnerabilities and specific types of data
objects. In this regard, BIFIT enables new
opportunities for future resiliency research.",
acknowledgement = ack-nhfb,
articleno = "57",
}
@InProceedings{Chung:2012:CDS,
author = "Jinsuk Chung and Ikhwan Lee and Michael Sullivan and
Jee Ho Ryoo and Dong Wan Kim and Doe Hyun Yoon and
Larry Kaplan and Mattan Erez",
title = "Containment domains: a scalable, efficient, and
flexible resilience scheme for exascale systems",
crossref = "Hollingsworth:2012:SPI",
pages = "58:1--58:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a098.pdf",
abstract = "This paper describes and evaluates a scalable and
efficient resilience scheme based on the concept of
containment domains. Containment domains are a
programming construct that enable applications to
express resilience needs and to interact with the
system to tune and specialize error detection, state
preservation and restoration, and recovery schemes.
Containment domains have weak transactional semantics
and are nested to take advantage of the machine and
application hierarchies and to enable hierarchical
state preservation, restoration, and recovery. We
evaluate the scalability and efficiency of containment
domains using generalized trace-driven simulation and
analytical modeling and show that containment domains
are superior to both checkpoint restart and redundant
execution approaches.",
acknowledgement = ack-nhfb,
articleno = "58",
}
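As a toy illustration only (not the containment-domains API described in the paper; run_in_domain and flaky_update are invented names), the nested preserve/recover semantics sketched in the abstract can be mimicked by a retry-from-snapshot wrapper, where an error detected inside a child domain re-executes the child from its own preserved state instead of rolling back the whole computation:

import copy, random

def run_in_domain(state, body, retries=5):
    # Preserve `state`, run `body` on the copy, and re-execute from the
    # preserved copy if an error is detected; escalate after `retries` failures.
    for _ in range(retries):
        snapshot = copy.deepcopy(state)        # state preservation
        try:
            return body(snapshot)              # body may itself nest run_in_domain calls
        except RuntimeError:
            continue                           # recover locally from the snapshot
    raise RuntimeError("unrecoverable: escalate to the enclosing domain")

def flaky_update(s):
    if random.random() < 0.3:                  # simulated detected soft error
        raise RuntimeError("soft error")
    s["x"] += 1
    return s

print(run_in_domain({"x": 0}, flaky_update))   # {'x': 1} (escalates only if every retry fails)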
@InProceedings{Byna:2012:PAV,
author = "Surendra Byna and Jerry Chou and Oliver R{\"u}bel and
Prabhat and Homa Karimabadi and William S. Daughton and
Vadim Roytershteyn and E. Wes Bethel and Mark Howison
and Ke-Jou Hsu and Kuan-Wu Lin and Arie Shoshani and
Andrew Uselton and Kesheng Wu",
title = "Parallel {I/O}, analysis, and visualization of a
trillion particle simulation",
crossref = "Hollingsworth:2012:SPI",
pages = "59:1--59:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a103.pdf",
abstract = "Petascale plasma physics simulations have recently
entered the regime of simulating trillions of
particles. These unprecedented simulations generate
massive amounts of data, posing significant challenges
in storage, analysis, and visualization. In this paper,
we present parallel I/O, analysis, and visualization
results from a VPIC trillion particle simulation
running on 120,000 cores, which produces $\sim$30 TB of
data for a single timestep. We demonstrate the
successful application of H5Part, a particle data
extension of parallel HDF5, for writing the dataset at
a significant fraction of system peak I/O rates. To
enable efficient analysis, we develop hybrid parallel
FastQuery to index and query data using multi-core CPUs
on distributed memory hardware. We show good
scalability results for the FastQuery implementation
using up to 10,000 cores. Finally, we apply this
indexing/query-driven approach to facilitate the
first-ever analysis and visualization of the trillion
particle dataset.",
acknowledgement = ack-nhfb,
articleno = "59",
}
@InProceedings{Kanov:2012:DIS,
author = "Kalin Kanov and Randal Burns and Greg Eyink and
Charles Meneveau and Alexander Szalay",
title = "Data-intensive spatial filtering in large numerical
simulation datasets",
crossref = "Hollingsworth:2012:SPI",
pages = "60:1--60:9",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a101.pdf",
abstract = "We present a query processing framework for the
efficient evaluation of spatial filters on large
numerical simulation datasets stored in a
data-intensive cluster. Previously, filtering of large
numerical simulations stored in scientific databases
has been impractical owing to the immense data
requirements. Rather, filtering is done during
simulation or by loading snapshots into the aggregate
memory of an HPC cluster. Our system performs filtering
within the database and supports large filter widths.
We present two complementary methods of execution: I/O
streaming computes a batch filter query in a single
sequential pass using incremental evaluation of
decomposable kernels; summed volumes generates an
intermediate data set and evaluates each filtered value
by accessing only eight points in this dataset. We
dynamically choose between these methods depending upon
workload characteristics. The system allows us to
perform filters against large data sets with little
overhead: query performance scales with the cluster's
aggregate I/O throughput.",
acknowledgement = ack-nhfb,
articleno = "60",
}
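A minimal sketch of the summed-volumes idea described above (hypothetical code, not the paper's system): a 3-D inclusive prefix sum is computed once, after which any box sum needs only eight reads, via inclusion-exclusion.

import numpy as np

def summed_volume(a):
    # 3-D prefix sum, padded so that sv[i, j, k] = a[:i, :j, :k].sum().
    s = a.cumsum(axis=0).cumsum(axis=1).cumsum(axis=2)
    return np.pad(s, ((1, 0), (1, 0), (1, 0)))

def box_sum(sv, lo, hi):
    # Sum of a[lo0:hi0, lo1:hi1, lo2:hi2] from eight reads (inclusion-exclusion).
    (x0, y0, z0), (x1, y1, z1) = lo, hi
    return (sv[x1, y1, z1] - sv[x0, y1, z1] - sv[x1, y0, z1] - sv[x1, y1, z0]
            + sv[x0, y0, z1] + sv[x0, y1, z0] + sv[x1, y0, z0] - sv[x0, y0, z0])

a = np.random.default_rng(0).random((16, 16, 16))
sv = summed_volume(a)
assert np.isclose(box_sum(sv, (2, 3, 4), (10, 11, 12)), a[2:10, 3:11, 4:12].sum())

A box filter of any width then costs a constant number of reads per output value, at the cost of building and storing the intermediate prefix-sum dataset; that is the tradeoff against the single-pass I/O streaming method the abstract describes.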
@InProceedings{Nouanesengsy:2012:PPA,
author = "Boonthanome Nouanesengsy and Teng-Yok Lee and Kewei Lu
and Han-Wei Shen and Tom Peterka",
title = "Parallel particle advection and {FTLE} computation for
time-varying flow fields",
crossref = "Hollingsworth:2012:SPI",
pages = "61:1--61:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a102.pdf",
abstract = "Flow fields are an important product of scientific
simulations. One popular flow visualization technique
is particle advection, in which seeds are traced
through the flow field. One use of these traces is to
compute a powerful analysis tool called the Finite-Time
Lyapunov Exponent (FTLE) field, but no existing
particle tracing algorithms scale to the particle
injection frequency required for high-resolution FTLE
analysis. In this paper, a framework to trace the
massive number of particles necessary for FTLE
computation is presented. A new approach is explored,
in which processes are divided into groups, and are
responsible for mutually exclusive spans of time. This
pipelining over time intervals reduces overall idle
time of processes and decreases I/O overhead. Our
parallel FTLE framework is capable of advecting
hundreds of millions of particles at once, with
performance scaling up to tens of thousands of
processes.",
acknowledgement = ack-nhfb,
articleno = "61",
}
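For reference, the quantity computed from those particle traces is the standard finite-time Lyapunov exponent: if $\Phi_{t_0}^{t_0+T}(\mathbf{x})$ denotes the flow map obtained by advecting a particle seeded at $\mathbf{x}$ over the interval $[t_0, t_0+T]$, then
\[
\sigma_{t_0}^{T}(\mathbf{x}) \;=\; \frac{1}{|T|}\,
\ln \sqrt{\lambda_{\max}\!\left( \bigl(\nabla \Phi_{t_0}^{t_0+T}\bigr)^{\!\top} \nabla \Phi_{t_0}^{t_0+T} \right)}.
\]
The flow-map gradient $\nabla\Phi$ is typically estimated by finite differences between the end positions of neighboring seeds, which is why a dense grid of seeds, and hence the massive particle counts and injection frequencies discussed above, is required for high-resolution FTLE fields.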
@InProceedings{Patwary:2012:NSP,
author = "Mostofa Ali Patwary and Diana Palsetia and Ankit
Agrawal and Wei-keng Liao and Fredrik Manne and Alok
Choudhary",
title = "A new scalable parallel {DBSCAN} algorithm using the
disjoint-set data structure",
crossref = "Hollingsworth:2012:SPI",
pages = "62:1--62:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a053.pdf",
abstract = "DBSCAN is a well-known density based clustering
algorithm capable of discovering arbitrary shaped
clusters and eliminating noise data. However,
parallelization of DBSCAN is challenging as it exhibits
an inherent sequential data access order. Moreover,
existing parallel implementations adopt a master-slave
strategy which can easily cause an unbalanced workload
and hence result in low parallel efficiency. We present
a new parallel DBSCAN algorithm (PDSDBSCAN) using graph
algorithmic concepts. More specifically, we employ the
disjoint-set data structure to break the access
sequentiality of DBSCAN. In addition, we use a
tree-based bottom-up approach to construct the
clusters. This yields a better-balanced workload
distribution. We implement the algorithm both for
shared and for distributed memory. Using data sets
containing up to several hundred million
high-dimensional points, we show that PDSDBSCAN
significantly outperforms the master-slave approach,
achieving speedups of up to 25.97 using 40 cores on a
shared-memory architecture, and speedups of up to 5,765 using
8,192 cores on a distributed-memory architecture.",
acknowledgement = ack-nhfb,
articleno = "62",
}
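A minimal union-find sketch of the disjoint-set structure the abstract refers to (illustrative only, not the PDSDBSCAN code): unioning each core point with its eps-neighbors lets clusters emerge as trees, and because tree merges can be performed in any order, this removes the sequential region-growing order of classic DBSCAN.

class DisjointSet:
    def __init__(self, n):
        self.parent = list(range(n))

    def find(self, x):
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]   # path halving
            x = self.parent[x]
        return x

    def union(self, x, y):
        rx, ry = self.find(x), self.find(y)
        if rx != ry:
            self.parent[max(rx, ry)] = min(rx, ry)

# Hypothetical usage: five points, eps-neighbor pairs among core points.
ds = DisjointSet(5)
for u, v in [(0, 1), (1, 2), (3, 4)]:
    ds.union(u, v)
print([ds.find(i) for i in range(5)])   # two clusters: [0, 0, 0, 3, 3]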
@InProceedings{Nikolova:2012:PBN,
author = "Olga Nikolova and Srinivas Aluru",
title = "Parallel {Bayesian} network structure learning with
application to gene networks",
crossref = "Hollingsworth:2012:SPI",
pages = "63:1--63:9",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a052.pdf",
abstract = "Bayesian networks (BN) are probabilistic graphical
models which are widely utilized in various research
areas, including modeling complex biological
interactions in the cell. Learning the structure of a
BN is an NP-hard problem and exact solutions are
limited to a few tens of variables. In this work, we
present a parallel BN structure learning algorithm that
combines principles of both heuristic and exact
approaches and facilitates learning of larger networks.
We demonstrate the applicability of our approach by an
implementation on a Cray AMD cluster, and present
experimental results for the problem of inferring gene
networks. Our approach is work-optimal and achieves
nearly perfect scaling.",
acknowledgement = ack-nhfb,
articleno = "63",
}
@InProceedings{Khan:2012:MAN,
author = "Arif M. Khan and David F. Gleich and Alex Pothen and
Mahantesh Halappanavar",
title = "A multithreaded algorithm for network alignment via
approximate matching",
crossref = "Hollingsworth:2012:SPI",
pages = "64:1--64:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a054.pdf",
abstract = "Network alignment is an optimization problem to find
the best one-to-one map between the vertices of a pair
of graphs that overlaps as many edges as possible. It
is a relaxation of the graph isomorphism problem and is
closely related to the subgraph isomorphism problem.
The best current approaches are entirely heuristic and
iterative in nature. They generate real-valued
heuristic weights that must be rounded to find integer
solutions. This rounding requires solving a bipartite
maximum weight matching problem at each iteration in
order to avoid missing high quality solutions. We
investigate substituting a parallel, half-approximation
for maximum weight matching instead of an exact
computation. Our experiments show that the resulting
difference in solution quality is negligible. We
demonstrate almost a 20-fold speedup using 40 threads
on an 8-processor Intel Xeon E7-8870 system and now
solve real-world problems in 36 seconds instead of 10
minutes.",
acknowledgement = ack-nhfb,
articleno = "64",
}
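The half-approximation substituted for exact matching above can be illustrated with the classical sequential greedy algorithm, which also carries a 1/2 guarantee; the paper itself uses a parallel, locally-dominant-edge variant, so the sketch below (with an invented function name) only shows what "half-approximation" means.

def greedy_half_approx_matching(edges):
    # edges: iterable of (weight, u, v); take edges in order of decreasing
    # weight, keeping one only if both endpoints are still unmatched.
    matched = set()
    result = []
    for w, u, v in sorted(edges, reverse=True):
        if u not in matched and v not in matched:
            matched.update((u, v))
            result.append((u, v))
    return result

edges = [(5.0, 'a', 'b'), (4.0, 'b', 'c'), (3.0, 'c', 'd')]
print(greedy_half_approx_matching(edges))   # [('a', 'b'), ('c', 'd')]

The total weight of such a matching is always at least half of the maximum-weight matching, which is why the paper's experiments can substitute it for an exact bipartite matching solve at each iteration with negligible loss in alignment quality.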
@InProceedings{Olivier:2012:CMW,
author = "Stephen L. Olivier and Bronis R. de Supinski and
Martin Schulz and Jan F. Prins",
title = "Characterizing and mitigating work time inflation in
task parallel programs",
crossref = "Hollingsworth:2012:SPI",
pages = "65:1--65:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a066.pdf",
abstract = "Task parallelism raises the level of abstraction in
shared memory parallel programming to simplify the
development of complex applications. However, task
parallel applications can exhibit poor performance due
to thread idleness, scheduling overheads, and work time
inflation --- additional time spent by threads in a
multithreaded computation beyond the time required to
perform the same work in a sequential computation. We
identify the contributions of each factor to lost
efficiency in various task parallel OpenMP applications
and diagnose the causes of work time inflation in those
applications. Increased data access latency can cause
significant work time inflation in NUMA systems. Our
locality framework for task parallel OpenMP programs
mitigates this cause of work time inflation. Our
extensions to the Qthreads library demonstrate that
locality-aware scheduling can improve performance up to
3X compared to the Intel OpenMP task scheduler.",
acknowledgement = ack-nhfb,
articleno = "65",
}
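One natural way to make the abstract's notion of work time inflation concrete (the paper may normalize it differently) is
\[
\text{inflation} \;=\; \sum_{p=1}^{P} t^{\text{work}}_{p} \;-\; T^{\text{work}}_{\text{seq}},
\]
where $t^{\text{work}}_{p}$ is the time thread $p$ spends executing work, excluding idle and scheduling time, and $T^{\text{work}}_{\text{seq}}$ is the time the same work takes in a sequential execution; locality-aware scheduling attacks the first term by reducing remote-memory access latency on NUMA machines.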
@InProceedings{Bauer:2012:LEL,
author = "Michael Bauer and Sean Treichler and Elliott Slaughter
and Alex Aiken",
title = "{Legion}: expressing locality and independence with
logical regions",
crossref = "Hollingsworth:2012:SPI",
pages = "66:1--66:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a065.pdf",
abstract = "Modern parallel architectures have both heterogeneous
processors and deep, complex memory hierarchies. We
present Legion, a programming model and runtime system
for achieving high performance on these machines.
Legion is organized around logical regions, which
express both locality and independence of program data,
and tasks, functions that perform computations on
regions. We describe a runtime system that dynamically
extracts parallelism from Legion programs, using a
distributed, parallel scheduling algorithm that
identifies both independent tasks and nested
parallelism. Legion also enables explicit, programmer
controlled movement of data through the memory
hierarchy and placement of tasks based on locality
information via a novel mapping interface. We evaluate
our Legion implementation on three applications:
fluid-flow on a regular grid, a three-level AMR code
solving a heat diffusion equation, and a circuit
simulation.",
acknowledgement = ack-nhfb,
articleno = "66",
}
@InProceedings{Garland:2012:DUP,
author = "Michael Garland and Manjunath Kudlur and Yili Zheng",
title = "Designing a unified programming model for
heterogeneous machines",
crossref = "Hollingsworth:2012:SPI",
pages = "67:1--67:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a064.pdf",
abstract = "While high-efficiency machines are increasingly
embracing heterogeneous architectures and massive
multithreading, contemporary mainstream programming
languages reflect a mental model in which processing
elements are homogeneous, concurrency is limited, and
memory is a flat undifferentiated pool of storage.
Moreover, the current state of the art in programming
heterogeneous machines tends towards using separate
programming models, such as OpenMP and CUDA, for
different portions of the machine. Both of these
factors make programming emerging heterogeneous
machines unnecessarily difficult. We describe the
design of the Phalanx programming model, which seeks to
provide a unified programming model for heterogeneous
machines. It provides constructs for bulk parallelism,
synchronization, and data placement which operate
across the entire machine. Our prototype implementation
is able to launch and coordinate work on both CPU and
GPU processors within a single node, and by leveraging
the GASNet runtime, is able to run across all the nodes
of a distributed-memory machine.",
acknowledgement = ack-nhfb,
articleno = "67",
}
@InProceedings{Sharma:2012:DII,
author = "Sushant Sharma and Dimitrios Katramatos and Dantong Yu
and Li Shi",
title = "Design and implementation of an intelligent end-to-end
network {QoS} system",
crossref = "Hollingsworth:2012:SPI",
pages = "68:1--68:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a077.pdf",
abstract = "End-to-End guaranteed network QoS is a requirement for
predictable data transfers between geographically
distant end-hosts. Existing QoS systems, however, do
not have the capability/intelligence to decide what
resources to reserve and which paths to choose when
there are multiple and flexible resource reservation
requests. In this paper, we design and implement an
intelligent system that can guarantee end-to-end
network QoS for multiple flexible reservation requests.
At the heart of this system is a polynomial time
algorithm called resource reservation and path
construction (RRPC). The RRPC algorithm schedules
multiple flexible end-to-end data transfer requests by
jointly optimizing the path construction and bandwidth
reservation along these paths. We show that
constructing such schedules is NP-hard. We implement
our intelligent QoS system, and present the results of
deployment on real world production networks (ESnet and
Internet2). Our implementation does not require
modifications or new software to be deployed on the
routers within the network.",
acknowledgement = ack-nhfb,
articleno = "68",
}
@InProceedings{Chen:2012:LUH,
author = "Dong Chen and Noel Eisley and Philip Heidelberger and
Sameer Kumar and Amith Mamidala and Fabrizio Petrini
and Robert Senger and Yutaka Sugawara and Robert Walkup
and Burkhard Steinmacher-Burow and Anamitra Choudhury
and Yogish Sabharwal and Swati Singhal and Jeffrey J.
Parker",
title = "Looking under the hood of the {IBM Blue Gene/Q}
network",
crossref = "Hollingsworth:2012:SPI",
pages = "69:1--69:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a078.pdf",
abstract = "This paper explores the performance and optimization
of the IBM Blue Gene/Q (BG/Q) five dimensional torus
network on up to 16K nodes. The BG/Q hardware supports
multiple dynamic routing algorithms and different
traffic patterns may require different algorithms to
achieve the best performance. Between 85\% and 95\% of peak
network performance is achieved for all-to-all traffic,
while over 85\% of peak is obtained for challenging
bisection pairings. A new software-controlled algorithm
is developed for bisection traffic that selects which
hardware algorithm to employ and achieves better
performance than any individual hardware algorithm. The
benefit of dynamic routing is shown for a highly
non-uniform ``transpose'' traffic pattern. To evaluate
memory and network performance, the HPCC Random Access
benchmark was tuned for BG/Q and achieved 858 Giga
Updates per Second (GUPS) on 16K nodes. To further
accelerate message processing, the message libraries on
BG/Q enable the offloading of messaging overhead onto
dedicated communication threads. Several applications,
including Algebraic Multigrid (AMG), exhibit from 3 to
20\% gain using communication threads.",
acknowledgement = ack-nhfb,
articleno = "69",
}
@InProceedings{Subramoni:2012:DSI,
author = "H. Subramoni and S. Potluri and K. Kandalla and B.
Barth and J. Vienne and J. Keasler and K. Tomko and K.
Schulz and A. Moody and D. K. Panda",
title = "Design of a scalable {InfiniBand} topology service to
enable network-topology-aware placement of processes",
crossref = "Hollingsworth:2012:SPI",
pages = "70:1--70:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a076.pdf",
abstract = "Over the last decade, InfiniBand has become an
increasingly popular interconnect for deploying modern
super-computing systems. However, there exists no
detection service that can discover the underlying
network topology in a scalable manner and expose this
information to runtime libraries and users of the high
performance computing systems in a convenient way. In
this paper, we design a novel and scalable method to
detect the InfiniBand network topology by using
Neighbor-Joining techniques (NJ). To the best of our
knowledge, this is the first instance where the
neighbor joining algorithm has been applied to solve
the problem of detecting InfiniBand network topology.
We also design a network-topology-aware MPI library
that takes advantage of the network topology service.
The library places processes taking part in the MPI job
in a network-topology-aware manner with the dual aim of
increasing intra-node communication and reducing the
long-distance inter-node communication across the
InfiniBand fabric.",
acknowledgement = ack-nhfb,
articleno = "70",
}
@InProceedings{Chen:2012:CLA,
author = "Guancheng Chen and Per Stenstrom",
title = "Critical lock analysis: diagnosing critical section
bottlenecks in multithreaded applications",
crossref = "Hollingsworth:2012:SPI",
pages = "71:1--71:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a099.pdf",
abstract = "Critical sections are well known potential performance
bottlenecks in multithreaded applications and
identifying the ones that inhibit scalability is
important for performance optimizations. While previous
approaches use idle time as a key measure, we show such
a measure is not reliable. The reason is that idleness
does not necessarily mean the critical section is on
the critical path. We introduce critical lock analysis,
a new method for diagnosing critical section
bottlenecks in multithreaded applications. Our method
first identifies the critical sections appearing on
the critical path, and then quantifies the impact of
such critical sections on the overall performance by
using quantitative performance metrics. Case studies
show that our method can successfully identify critical
sections that are most beneficial for improving overall
performance as well as quantify their performance
impact on the critical path, which results in a more
reliable establishment of the inherent critical section
bottlenecks than previous approaches.",
acknowledgement = ack-nhfb,
articleno = "71",
}
@InProceedings{Ravishankar:2012:CGP,
author = "Mahesh Ravishankar and John Eisenlohr and
Louis-No{\"e}l Pouchet and J. Ramanujam and Atanas
Rountev and P. Sadayappan",
title = "Code generation for parallel execution of a class of
irregular loops on distributed memory systems",
crossref = "Hollingsworth:2012:SPI",
pages = "72:1--72:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a100.pdf",
abstract = "Parallelization and locality optimization of affine
loop nests have been successfully addressed for
shared-memory machines. However, many large-scale
simulation applications must be executed in a
distributed-memory environment, and use
irregular/sparse computations where the control-flow
and array-access patterns are data-dependent. In this
paper, we propose an approach for effective parallel
execution of a class of irregular loop computations in
a distributed-memory environment, using a combination
of static and runtime analysis. We discuss algorithms
that analyze sequential code to generate an inspector
and an executor. The inspector captures the
data-dependent behavior of the computation in parallel
and without requiring complete replication of any of
the data structures used in the original computation.
The executor performs the computation in parallel. The
effectiveness of the framework is demonstrated on
several benchmarks and a climate modeling
application.",
acknowledgement = ack-nhfb,
articleno = "72",
}
@InProceedings{Alimi:2012:FEF,
author = "Jean-Michel Alimi and Vincent Bouillot and Yann Rasera
and Vincent Reverdy and Pier-Stefano Corasaniti and
Ir{\`e}ne Balm{\`e}s and St{\'e}phane Requena and
Xavier Delaruelle and Jean-Noel Richet",
title = "First-ever full observable universe simulation",
crossref = "Hollingsworth:2012:SPI",
pages = "73:1--73:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a034.pdf",
abstract = "We have performed the first-ever numerical N-body
simulation of the full observable universe (DEUS ``Dark
Energy Universe Simulation'' FUR ``Full Universe
Run''). This has evolved 550 billion particles on an
Adaptive Mesh Refinement grid with more than two
trillion computing points along the entire evolutionary
history of the universe and across six orders of
magnitude in length scale, from the size of the Milky
Way to that of the whole observable Universe. To date,
this is the largest and most advanced cosmological
simulation ever run. It provides unique information on
the formation and evolution of the largest structure in
the universe and an exceptional support to future
observational programs dedicated to mapping the
distribution of matter and galaxies in the universe.
The simulation has run on 4752 (of 5040) thin nodes of
the BULL supercomputer CURIE, using more than 300 TB of
memory for 10 million hours of computing time. About 50
PBytes of data were generated throughout the run. Using
an advanced and innovative reduction workflow, the
amount of useful stored data has been reduced to 500
TBytes.",
acknowledgement = ack-nhfb,
articleno = "73",
}
@InProceedings{March:2012:OCP,
author = "William B. March and Kenneth Czechowski and Marat
Dukhan and Thomas Benson and Dongryeol Lee and Andrew
J. Connolly and Richard Vuduc and Edmond Chow and
Alexander G. Gray",
title = "Optimizing the computation of $n$-point correlations
on large-scale astronomical data",
crossref = "Hollingsworth:2012:SPI",
pages = "74:1--74:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a033.pdf",
abstract = "The n-point correlation functions (npcf) are powerful
statistics that are widely used for data analyses in
astronomy and other fields. These statistics have
played a crucial role in fundamental physical
breakthroughs, including the discovery of dark energy.
Unfortunately, directly computing the npcf at a single
value requires O(N$^n$) time for N points and values
of n of 2, 3, 4, or even larger. Astronomical data sets
can contain billions of points, and the next generation
of surveys will generate terabytes of data per night.
To meet these computational demands, we present a
highly tuned npcf computation code that shows an
order-of-magnitude speedup over the current
state-of-the-art. This enables a much larger 3-point
correlation computation on the galaxy distribution than
was previously possible. We show a detailed performance
evaluation on many different architectures.",
acknowledgement = ack-nhfb,
articleno = "74",
}
@InProceedings{Wu:2012:HTM,
author = "Jingjin Wu and Zhiling Lan and Xuanxing Xiong and
Nickolay Y. Gnedin and Andrey V. Kravtsov",
title = "Hierarchical task mapping of cell-based {AMR}
cosmology simulations",
crossref = "Hollingsworth:2012:SPI",
pages = "75:1--75:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a035.pdf",
abstract = "Cosmology simulations are highly
communication-intensive; thus, it is critical to exploit
topology-aware task mapping techniques for performance
optimization. To exploit the architectural properties
of multiprocessor clusters (the performance gap between
inter-node and intra-node communication as well as the
gap between inter-socket and intra-socket
communication), we design and develop a hierarchical
task mapping scheme for cell-based AMR (Adaptive Mesh
Refinement) cosmology simulations, in particular, the
ART application. Our scheme consists of two parts: (1)
an inter-node mapping to map application processes onto
nodes with the objective of minimizing network traffic
among nodes and (2) an intra-node mapping within each
node to minimize the maximum size of messages
transmitted between CPU sockets. Experiments on
production supercomputers with 3D torus and fat-tree
topologies show that our scheme can significantly
reduce application communication cost by up to 50\%.
More importantly, our scheme is generic and can be
extended to many other applications.",
acknowledgement = ack-nhfb,
articleno = "75",
}
@InProceedings{Sridharan:2012:SDF,
author = "Vilas Sridharan and Dean Liberty",
title = "A study of {DRAM} failures in the field",
crossref = "Hollingsworth:2012:SPI",
pages = "76:1--76:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a047.pdf",
abstract = "Most modern computer systems use dynamic random access
memory (DRAM) as a main memory store. Recent
publications have confirmed that DRAM errors are a
common source of failures in the field. Therefore,
further attention to the faults experienced by DRAM
sub-systems is warranted. In this paper, we present a
study of 11 months of DRAM errors in a large
high-performance computing cluster. Our goal is to
understand the failure modes, rates, and fault types
experienced by DRAM in production settings. We identify
several unique DRAM failure modes, including
single-bit, multi-bit, and multi-chip failures. We also
provide a deterministic bound on the rate of transient
faults in the DRAM array, by exploiting the presence of
a hardware scrubber on our nodes. We draw several
conclusions from our study. First, DRAM failures are
dominated by permanent, rather than transient, faults,
although not to the extent found by previous
publications. Second, DRAMs are susceptible to large
multi-bit failures, such as failures that affect an
entire DRAM row or column, indicating faults in shared
internal circuitry. Third, we identify a DRAM failure
mode that disrupts access to other DRAM devices that
share the same board-level circuitry. Finally, we find
that chipkill error-correcting codes (ECC) are
extremely effective, reducing the node failure rate
from uncorrected DRAM errors by 42x compared to
single-error correct/double-error detect (SEC-DED)
ECC.",
acknowledgement = ack-nhfb,
articleno = "76",
}
@InProceedings{Gainaru:2012:FPU,
author = "Ana Gainaru and Franck Cappello and Marc Snir and
William Kramer",
title = "Fault prediction under the microscope: a closer look
into {HPC} systems",
crossref = "Hollingsworth:2012:SPI",
pages = "77:1--77:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a048.pdf",
abstract = "A large percentage of computing capacity in today's
large high-performance computing systems is wasted
because of failures. Consequently current research is
focusing on providing fault tolerance strategies that
aim to minimize the effects of faults on applications. By far
the most popular technique is the checkpoint-restart
strategy. A complement to this classical approach is
failure avoidance, by which the occurrence of a fault
is predicted and preventive measures are taken. This
requires a reliable prediction system to anticipate
failures and their locations. Thus far, research in
this field has used ideal predictors that were not
implemented in real HPC systems. In this paper, we
merge signal analysis concepts with data mining
techniques to extend the ELSA (Event Log Signal
Analyzer) toolkit and offer an adaptive and more
efficient prediction module. Our goal is to provide
models that characterize the normal behavior of a
system and the way faults affect it. Being able to
detect deviations from normality quickly is the
foundation of accurate fault prediction. However, this
is challenging because component failure dynamics are
heterogeneous in space and time. To this end, a large
part of the paper is focused on a detailed analysis of
the prediction method, by applying it to two
large-scale systems and by investigating the
characteristics and bottlenecks of each step of the
prediction process. Furthermore, we analyze the
prediction's precision and recall impact on current
checkpointing strategies and highlight future
improvements and directions for research in this
field.",
acknowledgement = ack-nhfb,
articleno = "77",
}
@InProceedings{Fiala:2012:DCS,
author = "David Fiala and Frank Mueller and Christian Engelmann
and Rolf Riesen and Kurt Ferreira and Ron Brightwell",
title = "Detection and correction of silent data corruption for
large-scale high-performance computing",
crossref = "Hollingsworth:2012:SPI",
pages = "78:1--78:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a046.pdf",
abstract = "Faults have become the norm rather than the exception
for high-end computing clusters. Exacerbating this
situation, some of these faults remain undetected,
manifesting themselves as silent errors that allow
applications to compute incorrect results. This paper
studies the potential for redundancy to detect and
correct soft errors in MPI message-passing applications
while investigating the challenges inherent to
detecting soft errors within MPI applications by
providing transparent MPI redundancy. By assuming a
model wherein corruption in application data manifests
itself by producing differing MPI messages between
replicas, we study the best suited protocols for
detecting and correcting corrupted MPI messages. Using
our fault injector, we observe that even a single error
can have profound effects on applications by causing a
cascading pattern of corruption which in most cases
spreads to all other processes. Results indicate that
our consistency protocols can successfully protect
applications experiencing even high rates of silent
data corruption.",
acknowledgement = ack-nhfb,
articleno = "78",
}
@InProceedings{Karpenko:2012:AGW,
author = "Dmytro Karpenko and Roman Vitenberg and Alexander L.
Read",
title = "{ATLAS} grid workload on {NDGF} resources: analysis,
modeling, and workload generation",
crossref = "Hollingsworth:2012:SPI",
pages = "79:1--79:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a055.pdf",
abstract = "Evaluating new ideas for job scheduling or data
transfer algorithms in large-scale grid systems is
known to be notoriously challenging. Existing grid
simulators expect to receive a realistic workload as an
input. Such input is difficult to provide in the absence of
an in-depth study of representative grid workloads. In
this work, we analyze the ATLAS workload processed on
the resources of NDGF. ATLAS is one of the
biggest grid technology users, with extreme demands for
CPU power and bandwidth. The analysis is based on the
data sample with ~1.6 million jobs, 1,723 TB of data
transfer, and 873 years of processor time. Our
additional contributions are (a) scalable workload
models that can be used to generate a synthetic
workload for a given number of jobs, (b) an open-source
workload generator software integrated with existing
grid simulators, and (c) suggestions for grid system
designers based on the insights of data analysis.",
acknowledgement = ack-nhfb,
articleno = "79",
}
@InProceedings{Estrada:2012:EAA,
author = "Trilce Estrada and Michela Taufer",
title = "On the effectiveness of application-aware
self-management for scientific discovery in volunteer
computing systems",
crossref = "Hollingsworth:2012:SPI",
pages = "80:1--80:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a057.pdf",
abstract = "An important challenge faced by high-throughput,
multiscale applications is that human intervention has
a central role in driving their success. However,
manual intervention is inefficient and error-prone, and
promotes resource waste. This paper presents an
application-aware modular framework that provides
self-management for computational multiscale
applications in volunteer computing (VC). Our framework
consists of a learning engine and three modules that
can be easily adapted to different distributed systems.
The learning engine of this framework is based on our
novel tree-like structure called KOTree. KOTree is a
fully automatic method that organizes statistical
information in a multidimensional structure that can be
efficiently searched and updated at runtime. Our
empirical evaluation shows that our framework can
effectively provide application-aware self-management
in VC systems. Additionally, we observed that our
KOTree algorithm is able to accurately predict the
expected length of new jobs, resulting in an average
throughput increase of 85\% with respect to other
algorithms.",
acknowledgement = ack-nhfb,
articleno = "80",
}
@InProceedings{Liu:2012:UVC,
author = "Z. Liu and M. Veeraraghavan and Z. Yan and C. Tracy
and J. Tie and I. Foster and J. Dennis and J. Hick and
Y. Li and W. Yang",
title = "On using virtual circuits for {GridFTP} transfers",
crossref = "Hollingsworth:2012:SPI",
pages = "81:1--81:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a056.pdf",
abstract = "The goal of this work is to characterize scientific
data transfers and to determine the suitability of
dynamic virtual circuit service for these transfers
instead of the currently used IP-routed service.
Specifically, logs collected by servers executing a
commonly used scientific data transfer application,
GridFTP, are obtained from three US
super-computing/scientific research centers, NERSC,
SLAC, and NCAR, and analyzed. Dynamic virtual circuit
(VC) service, a relatively new offering from providers
such as ESnet and Internet2, allows for the selection
of a path on which a rate-guaranteed connection is
established prior to data transfer. Given VC setup
overhead, the first analysis of the GridFTP transfer
logs characterizes the duration of sessions, where a
session consists of multiple back-to-back transfers
executed in batch mode between the same two GridFTP
servers. Of the NCAR-NICS sessions analyzed, 56\% of
all sessions (90\% of all transfers) would have been
long enough to be served with dynamic VC service. An
analysis of transfer logs across four paths, NCAR-NICS,
SLAC-BNL, NERSC-ORNL and NERSC-ANL, shows significant
throughput variance, where NICS, BNL, ORNL, and ANL are
other US national laboratories. For example, on the
NERSC-ORNL path, the inter-quartile range was 695 Mbps,
with a maximum value of 3.64 Gbps and a minimum value
of 758 Mbps. An analysis of the impact of various
factors that are potential causes of this variance is
also presented.",
acknowledgement = ack-nhfb,
articleno = "81",
}
@InProceedings{Meng:2012:DDG,
author = "Jiayuan Meng and Vitali A. Morozov and Venkatram
Vishwanath and Kalyan Kumaran",
title = "Dataflow-driven {GPU} performance projection for
multi-kernel transformations",
crossref = "Hollingsworth:2012:SPI",
pages = "82:1--82:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a092.pdf",
abstract = "Applications often have a sequence of parallel
operations to be offloaded to graphics processors; each
operation can become an individual GPU kernel.
Developers typically explore a variety of
transformations for each kernel. Furthermore, it is
well known that efficient data management is critical
in achieving high GPU performance and that ``fusing''
multiple kernels into one may greatly improve data
locality. Doing so, however, requires transformations
across multiple, potentially nested, parallel loops; at
the same time, the original code semantics and data
dependency must be preserved. Since each kernel may
have distinct data access patterns, their combined
dataflow can be nontrivial. As a result, the complexity
of multi-kernel transformations often leads to
significant effort with no guarantee of performance
benefits. This paper proposes a dataflow-driven
analytical framework to project GPU performance for a
sequence of parallel operations. Users need only
provide CPU code skeletons for a sequence of parallel
loops. The framework can then automatically identify
opportunities for multi-kernel transformations and data
management. It is also able to project the overall
performance without implementing GPU code or using
physical hardware.",
acknowledgement = ack-nhfb,
articleno = "82",
}
@InProceedings{Dwyer:2012:PME,
author = "Tyler Dwyer and Alexandra Fedorova and Sergey
Blagodurov and Mark Roth and Fabien Gaud and Jian Pei",
title = "A practical method for estimating performance
degradation on multicore processors, and its
application to {HPC} workloads",
crossref = "Hollingsworth:2012:SPI",
pages = "83:1--83:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a093.pdf",
abstract = "When multiple threads or processes run on a multi-core
CPU they compete for shared resources, such as caches
and memory controllers, and can suffer performance
degradation as high as 200\%. We design and evaluate a
new machine learning model that estimates this
degradation online, on previously unseen workloads, and
without perturbing the execution. Our motivation is to
help data center and HPC cluster operators effectively
use workload consolidation. Data center consolidation
is about placing many applications on the same server
to maximize hardware utilization. In HPC clusters,
processes of the same distributed applications run on
the same machine. Consolidation improves hardware
utilization, but may sacrifice performance as processes
compete for resources. Our model helps determine when
consolidation is overly harmful to performance. Our
work is the first to apply machine learning to this
problem domain, and we report on our experience reaping
the advantages of machine learning while navigating
around its limitations. We demonstrate how the model
can be used to improve performance fidelity and save
energy for HPC workloads.",
acknowledgement = ack-nhfb,
articleno = "83",
}
@InProceedings{Spafford:2012:ADS,
author = "Kyle L. Spafford and Jeffrey S. Vetter",
title = "{Aspen}: a domain specific language for performance
modeling",
crossref = "Hollingsworth:2012:SPI",
pages = "84:1--84:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a091.pdf",
abstract = "We present a new approach to analytical performance
modeling using Aspen, a domain specific language. Aspen
(Abstract Scalable Performance Engineering Notation)
fills an important gap in existing performance modeling
techniques and is designed to enable rapid exploration
of new algorithms and architectures. It includes a
formal specification of an application's performance
behavior and an abstract machine model. We provide an
overview of Aspen's features and demonstrate how it can
be used to express a performance model for a
three-dimensional Fast Fourier Transform. We then demonstrate
the composability and modularity of Aspen by importing
and reusing the FFT model in a molecular dynamics
model. We have also created a number of tools that
allow scientists to balance application and system
factors quickly and accurately.",
acknowledgement = ack-nhfb,
articleno = "84",
}
@InProceedings{Zhang:2012:DAD,
author = "Zhao Zhang and Daniel S. Katz and Justin M. Wozniak
and Allan Espinosa and Ian Foster",
title = "Design and analysis of data management in scalable
parallel scripting",
crossref = "Hollingsworth:2012:SPI",
pages = "85:1--85:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a016.pdf",
abstract = "We seek to enable efficient large-scale parallel
execution of applications in which a shared filesystem
abstraction is used to couple many tasks. Such parallel
scripting (many-task computing, MTC) applications
suffer poor performance and utilization on large
parallel computers because of the volume of filesystem
I/O and a lack of appropriate optimizations in the
shared filesystem. Thus, we design and implement a
scalable MTC data management system that uses
aggregated compute node local storage for more
efficient data movement strategies. We co-design the
data management system with the data-aware scheduler to
enable dataflow pattern identification and automatic
optimization. The framework reduces the time to
solution of parallel stages of an astronomy data
analysis application, Montage, by 83.2\% on 512 cores;
decreases the time to solution of a seismology
application, CyberShake, by 7.9\% on 2,048 cores; and
delivers BLAST performance better than mpiBLAST at
various scales up to 32,768 cores, while preserving the
flexibility of the original BLAST application.",
acknowledgement = ack-nhfb,
articleno = "85",
}
@InProceedings{Adams:2012:UBL,
author = "Ian F. Adams and Brian A. Madden and Joel C. Frank and
Mark W. Storer and Ethan L. Miller and Gene Harano",
title = "Usage behavior of a large-scale scientific archive",
crossref = "Hollingsworth:2012:SPI",
pages = "86:1--86:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a017.pdf",
abstract = "Archival storage systems for scientific data have been
growing in both size and relevance over the past two
decades, yet researchers and system designers alike
must rely on limited and obsolete knowledge to guide
archival management and design. To address this issue,
we analyzed three years of file-level activities from
the NCAR mass storage system, providing valuable
insight into a large-scale scientific archive with over
1600 users, tens of millions of files, and petabytes of
data. Our examination of system usage showed that,
while a subset of users were responsible for most of
the activity, this activity was widely distributed at
the file level. We also show that the physical grouping
of files and directories on media can improve archival
storage system performance. Based on our observations,
we provide suggestions and guidance for both future
scientific archival system designs as well as improved
tracing of archival activity.",
acknowledgement = ack-nhfb,
articleno = "86",
}
@InProceedings{LaFon:2012:DFT,
author = "Jharrod LaFon and Satyajayant Misra and Jon
Bringhurst",
title = "On distributed file tree walk of parallel file
systems",
crossref = "Hollingsworth:2012:SPI",
pages = "87:1--87:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a015.pdf",
abstract = "Supercomputers generate vast amounts of data,
typically organized into large directory hierarchies on
parallel file systems. While the supercomputing
applications are parallel, the tools used to process
them, which require complete directory traversal, are
typically serial. We present an algorithm framework and
three fully distributed algorithms for traversing large
parallel file systems, and performing file operations
in parallel. The first algorithm introduces a
randomized work-stealing scheduler; the second improves
the first with proximity-awareness; and the third
improves upon the second by using a hybrid approach. We
have tested our implementation on Cielo, a 1.37
petaflop supercomputer at the Los Alamos National
Laboratory and its 7 petabyte file system. Test results
show that our algorithms execute orders of magnitude
faster than state-of-the-art algorithms while achieving
ideal load balancing and low communication cost. We
present performance insights from the use of our
algorithms in production systems at LANL, performing
daily file system operations.",
acknowledgement = ack-nhfb,
articleno = "87",
}
@InProceedings{Chung:2012:ADP,
author = "I-Hsin Chung and Changhoan Kim and Hui-Fang Wen and
Guojing Cong",
title = "Application data prefetching on the {IBM Blue Gene/Q}
supercomputer",
crossref = "Hollingsworth:2012:SPI",
pages = "88:1--88:8",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a075.pdf",
abstract = "Memory access latency is often a crucial performance
limitation for high performance computing. Prefetching
is one of the strategies used by system designers to
bridge the processor-memory gap. This paper describes a
new innovative list prefetching feature introduced in
the IBM Blue Gene/Q supercomputer. The list prefetcher
records the L1 cache miss addresses and prefetches them
in the next iteration. The evaluation shows this list
prefetching mechanism reduces data fetching time when
L1 cache misses happen and improves the performance for
high performance computing applications with repeating
non-uniform memory access patterns. Its performance is
comparable to that of the classic stream prefetcher when properly
configured.",
acknowledgement = ack-nhfb,
articleno = "88",
}
@InProceedings{Alvarez:2012:HSC,
author = "Lluc Alvarez and Llu{\'\i}s Vilanova and Marc Gonzalez
and Xavier Martorell and Nacho Navarro and Eduard
Ayguade",
title = "Hardware-software coherence protocol for the
coexistence of caches and local memories",
crossref = "Hollingsworth:2012:SPI",
pages = "89:1--89:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a074.pdf",
abstract = "Cache coherence protocols limit the scalability of
chip multiprocessors. One solution is to introduce a
local memory alongside the cache hierarchy, forming a
hybrid memory system. Local memories are more
power-efficient than caches and they do not generate
coherence traffic but they suffer from poor
programmability. When non-predictable memory access
patterns are found, compilers do not succeed in
generating code because of the incoherency between the
two storages. This paper proposes a coherence protocol
for hybrid memory systems that allows the compiler to
generate code even in the presence of memory aliasing
problems. Coherency is ensured by a simple
software/hardware co-design where the compiler
identifies potentially incoherent memory accesses and
the hardware diverts them to the correct copy of the
data. The coherence protocol introduces overheads of
0.24\% in execution time and of 1.06\% in energy
consumption to enable the usage of the hybrid memory
system.",
acknowledgement = ack-nhfb,
articleno = "89",
}
@InProceedings{Schindewolf:2012:WSA,
author = "Martin Schindewolf and Barna Bihari and John
Gyllenhaal and Martin Schulz and Amy Wang and Wolfgang
Karl",
title = "What scientific applications can benefit from hardware
transactional memory?",
crossref = "Hollingsworth:2012:SPI",
pages = "90:1--90:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a073.pdf",
abstract = "Achieving efficient and correct synchronization of
multiple threads is a difficult and error-prone task at
small scale and, as we march towards extreme scale
computing, will be even more challenging when the
resulting application is supposed to utilize millions
of cores efficiently. Transactional Memory (TM) is a
promising technique to ease the burden on the
programmer, but has only recently become available on
commercial hardware in the new Blue Gene/Q system and
hence the real benefit for realistic applications has
not been studied yet. This paper presents the first
performance results of TM embedded into OpenMP on a
prototype system of BG/Q and characterizes code
properties that will likely lead to benefits when
augmented with TM primitives. We first study the
influence of thread count, environment variables and
memory layout on TM performance and identify code
properties that will yield performance gains with TM.
Second, we evaluate the combination of OpenMP with
multiple synchronization primitives on top of MPI to
determine suitable task to thread ratios per node.
Finally, we condense our findings into a set of best
practices. These are applied to a Monte Carlo Benchmark
and a Smoothed Particle Hydrodynamics method. In both
cases an optimized TM version, executed with 64 threads
on one node, outperforms a simple TM implementation.
MCB with optimized TM yields a speedup of 27.45 over
baseline.",
acknowledgement = ack-nhfb,
articleno = "90",
}
@InProceedings{Grigori:2012:PTL,
author = "Laura Grigori and Radek Stompor and Mikolaj
Szydlarski",
title = "A parallel two-level preconditioner for cosmic
microwave background map-making",
crossref = "Hollingsworth:2012:SPI",
pages = "91:1--91:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a084.pdf",
abstract = "Generalized least square problems with non-diagonal
weights arise frequently in the estimation of
two-dimensional images from data of cosmological as well
as astro- or geophysical observations. As the
observational data sets keep growing at Moore's rate,
with their volumes exceeding tens and hundreds of billions
of samples, the need for fast and efficiently
parallelizable iterative solvers is generally
recognized. In this work we propose a new iterative
algorithm for solving generalized least square systems
with weights given by a block-diagonal matrix with
Toeplitz blocks. Such cases are physically well
motivated and correspond to measurement noise being
piece-wise stationary --- a common occurrence in many
actual observations. Our iterative algorithm is based
on the conjugate gradient method and includes a
parallel two-level preconditioner (2lvl-PCG)
constructed from a limited number of sparse vectors
estimated from the coefficients of the initial linear
system. Our prototypical application is the map-making
problem in the Cosmic Microwave Background data
analysis. We show experimentally that our parallel
implementation of 2lvl-PCG outperforms by a factor of
up to 6 the standard one-level PCG in terms of both the
convergence rate and the time to solution on up to
12,228 cores of NERSC's Cray XE6 (Hopper) system
displaying nearly perfect strong and weak scaling
behavior in this regime.",
acknowledgement = ack-nhfb,
articleno = "91",
}
@InProceedings{Speck:2012:MST,
author = "R. Speck and D. Ruprecht and R. Krause and M. Emmett
and M. Minion and M. Winkel and P. Gibbon",
title = "A massively space-time parallel {$N$}-body solver",
crossref = "Hollingsworth:2012:SPI",
pages = "92:1--92:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a083.pdf",
abstract = "We present a novel space-time parallel version of the
Barnes--Hut tree code pepc using pfasst, the Parallel
Full Approximation Scheme in Space and Time. The naive
use of increasingly more processors for a fixed-size
N-body problem is prone to saturate as soon as the
number of unknowns per core becomes too small. To
overcome this intrinsic strong-scaling limit, we
introduce temporal parallelism on top of pepc's
existing hybrid MPI/PThreads spatial decomposition.
Here, we use pfasst which is based on a combination of
the iterations of the parallel-in-time algorithm
parareal with the sweeps of spectral deferred
correction (SDC) schemes. By combining these sweeps
with multiple space-time discretization levels, pfasst
relaxes the theoretical bound on parallel efficiency in
parareal. We present results from runs on up to 262,144
cores on the IBM Blue Gene/P installation JUGENE,
demonstrating that the space-time parallel code
provides speedup beyond the saturation of the purely
space-parallel approach.",
acknowledgement = ack-nhfb,
articleno = "92",
}
@InProceedings{Fujisawa:2012:HPG,
author = "Katsuki Fujisawa and Hitoshi Sato and Satoshi Matsuoka
and Toshio Endo and Makoto Yamashita and Maho Nakata",
title = "High-performance general solver for extremely
large-scale semidefinite programming problems",
crossref = "Hollingsworth:2012:SPI",
pages = "93:1--93:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a082.pdf",
abstract = "Semidefinite programming (SDP) is one of the most
important problems among optimization problems at
present. It is relevant to a wide range of fields such
as combinatorial optimization, structural optimization,
control theory, economics, quantum chemistry, sensor
network location and data mining. The capability to
solve extremely large-scale SDP problems will have a
significant effect on the current and future
applications of SDP. In 1995, Fujisawa et al. started
the SDPA (Semidefinite programming algorithm) Project
aimed at solving large-scale SDP problems with high
numerical stability and accuracy. SDPA is one of the
main codes to solve general SDPs. SDPARA is a parallel
version of SDPA on multiple processors with distributed
memory, and it replaces two major bottleneck parts (the
generation of the Schur complement matrix and its
Cholesky factorization) of SDPA by their parallel
implementation. In particular, it has been successfully
applied to combinatorial optimization and truss
topology optimization. The new version of SDPARA
(7.5.0-G) on a large-scale supercomputer called TSUBAME
2.0 at the Tokyo Institute of Technology has
successfully been used to solve the largest SDP problem
(which has over 1.48 million constraints), and created
a new world record. Our implementation has also
achieved 533 TFlops in double precision for large-scale
Cholesky factorization using 2,720 CPUs and 4,080
GPUs.",
acknowledgement = ack-nhfb,
articleno = "93",
}
@InProceedings{VanderWijngaart:2012:EBN,
author = "Rob F. {Van der Wijngaart} and Srinivas Sridharan and
Victor W. Lee",
title = "Extending the {BT NAS} parallel benchmark to exascale
computing",
crossref = "Hollingsworth:2012:SPI",
pages = "94:1--94:9",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a094.pdf",
abstract = "The NAS Parallel Benchmarks (NPB) are a well-known
suite of benchmarks that proxy scientific computing
applications. They specify several problem sizes that
represent how such applications may run on different
sizes of HPC systems. However, even the largest problem
(class F) is still far too small to exercise properly a
petascale supercomputer. Our work shows how one may
scale the Block Tridiagonal (BT) NPB from today's
published size to petascale and exascale computing
systems. In this paper we discuss the pros and cons of
various ways of scaling. We discuss how scaling BT
would impact computation, memory access, and
communications, and highlight the expected bottleneck,
which turns out to be not memory or communication
bandwidth, but network latency. Two complementary ways
are presented to overcome latency obstacles. We also
describe a practical method to gather approximate
performance data for BT at exascale on actual hardware,
without requiring an exascale system.",
acknowledgement = ack-nhfb,
articleno = "94",
}
@InProceedings{Frasca:2012:NAG,
author = "Michael Frasca and Kamesh Madduri and Padma Raghavan",
title = "{NUMA}-aware graph mining techniques for performance
and energy efficiency",
crossref = "Hollingsworth:2012:SPI",
pages = "95:1--95:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a096.pdf",
abstract = "We investigate dynamic methods to improve the power
and performance profiles of large irregular
applications on modern multi-core systems. In this
context, we study a large sparse graph application,
Betweenness Centrality, and focus on memory behavior as
core count scales. We introduce new techniques to
efficiently map the computational demands onto
non-uniform memory architectures (NUMA). Our dynamic
design adapts to hardware topology and dramatically
improves both energy and performance. These gains are
more significant at higher core counts. We implement a
scheme for adaptive data layout, which reorganizes the
graph after observing parallel access patterns, and a
dynamic task scheduler that encourages shared data
between neighboring cores. We measure performance and
energy consumption on a modern multi-core machine and
observe that mean execution time is reduced by 51.2\%
and energy is reduced by 52.4\%.",
acknowledgement = ack-nhfb,
articleno = "95",
}
@InProceedings{Williams:2012:OGM,
author = "Samuel Williams and Dhiraj D. Kalamkar and Amik Singh
and Anand M. Deshpande and Brian {Van Straalen} and
Mikhail Smelyanskiy and Ann Almgren and Pradeep Dubey
and John Shalf and Leonid Oliker",
title = "Optimization of geometric multigrid for emerging
multi- and manycore processors",
crossref = "Hollingsworth:2012:SPI",
pages = "96:1--96:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a095.pdf",
abstract = "Multigrid methods are widely used to accelerate the
convergence of iterative solvers for linear systems
used in a number of different application areas. In
this paper, we explore optimization techniques for
geometric multigrid on existing and emerging multicore
systems including the Opteron-based Cray XE6,
Intel\reg{} Xeon\reg{} E5-2670 and X5550
processor-based InfiniBand clusters, as well as the new
Intel\reg{} Xeon Phi\TM{} coprocessor (Knights Corner).
Our work examines a variety of novel techniques
including communication-aggregation, threaded
wavefront-based DRAM communication-avoiding, dynamic
threading decisions, SIMDization, and fusion of
operators. We quantify performance through each phase
of the V-cycle for both single-node and
distributed-memory experiments and provide detailed
analysis for each class of optimization. Results show
our optimizations yield significant speedups across a
variety of subdomain sizes while simultaneously
demonstrating the potential of multi- and manycore
processors to dramatically accelerate single-node
performance. However, our analysis also indicates that
improvements in networks and communication will be
essential to reap the potential of manycore processors
in large-scale multigrid calculations.",
acknowledgement = ack-nhfb,
articleno = "96",
}
@InProceedings{Bhatele:2012:MAC,
author = "Abhinav Bhatele and Todd Gamblin and Steven H. Langer
and Peer-Timo Bremer and Erik W. Draeger and Bernd
Hamann and Katherine E. Isaacs and Aaditya G. Landge
and Joshua A. Levine and Valerio Pascucci and Martin
Schulz and Charles H. Still",
title = "Mapping applications with collectives over
sub-communicators on torus networks",
crossref = "Hollingsworth:2012:SPI",
pages = "97:1--97:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a027.pdf",
abstract = "The placement of tasks in a parallel application on
specific nodes of a supercomputer can significantly
impact performance. Traditionally, this task mapping
has focused on reducing the distance between
communicating tasks on the physical network. This
minimizes the number of hops that point-to-point
messages travel and thus reduces link sharing between
messages and contention. However, for applications that
use collectives over sub-communicators, this heuristic
may not be optimal. Many collectives can benefit from
an increase in bandwidth even at the cost of an
increase in hop count, especially when sending large
messages. For example, placing communicating tasks in a
cube configuration rather than a plane or a line on a
torus network increases the number of possible paths
messages might take. This increases the available
bandwidth which can lead to significant performance
gains. We have developed Rubik, a tool that provides a
simple and intuitive interface to create a wide variety
of mappings for structured communication patterns.
Rubik supports a number of elementary operations such
as splits, tilts, or shifts, that can be combined into
a large number of unique patterns. Each operation can
be applied to disjoint groups of processes involved in
collectives to increase the effective bandwidth. We
demonstrate the use of Rubik for improving performance
of two parallel codes, pF3D and Qbox, which use
collectives over sub-communicators.",
acknowledgement = ack-nhfb,
articleno = "97",
}
@InProceedings{Hoefler:2012:OPC,
author = "Torsten Hoefler and Timo Schneider",
title = "Optimization principles for collective neighborhood
communications",
crossref = "Hollingsworth:2012:SPI",
pages = "98:1--98:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a028.pdf",
abstract = "Many scientific applications operate in a
bulk-synchronous mode of iterative communication and
computation steps. Even though the communication steps
happen at the same logical time, important patterns
such as stencil computations cannot be expressed as
collective communications in MPI. We demonstrate how
neighborhood collective operations allow to specify
arbitrary collective communication relations during
run-time and enable optimizations similar to
traditional collective calls. We show a number of
optimization opportunities and algorithms for different
communication scenarios. We also show how users can
assert constraints that provide additional optimization
opportunities in a portable way. We demonstrate the
utility of all described optimizations in a highly
optimized implementation of neighborhood collective
operations. Our communication and protocol
optimizations result in a performance improvement of up
to a factor of two for small stencil communications. We
found that, for some patterns, our optimization
heuristics automatically generate communication
schedules that are comparable to hand-tuned
collectives. With those optimizations in place, we are
able to accelerate arbitrary collective communication
patterns, such as regular and irregular stencils with
optimization methods for collective communications. We
expect that our methods will influence the design of
future MPI libraries and provide a significant
performance benefit on large-scale systems.",
acknowledgement = ack-nhfb,
articleno = "98",
}
@InProceedings{Cui:2012:OOB,
author = "Zheng Cui and Lei Xia and Patrick G. Bridges and Peter
A. Dinda and John R. Lange",
title = "Optimizing overlay-based virtual networking through
optimistic interrupts and cut-through forwarding",
crossref = "Hollingsworth:2012:SPI",
pages = "99:1--99:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a029.pdf",
abstract = "Overlay-based virtual networking provides a powerful
model for realizing virtual distributed and parallel
computing systems with strong isolation, portability,
and recoverability properties. However, in extremely
high throughput and low latency networks, such overlays
can suffer from bandwidth and latency limitations,
which is of particular concern if we want to apply the
model in HPC environments. Through careful study of an
existing very high performance overlay-based virtual
network system, we have identified two core issues
limiting performance: delayed and/or excessive virtual
interrupt delivery into guests, and copies between host
and guest data buffers done during encapsulation. We
respond with two novel optimizations: optimistic,
timer-free virtual interrupt injection, and zero-copy
cut-through data forwarding. These optimizations
improve the latency and bandwidth of the overlay
network on 10 Gbps interconnects, resulting in
near-native performance for a wide range of
microbenchmarks and MPI application benchmarks.",
acknowledgement = ack-nhfb,
articleno = "99",
}
@InProceedings{Georganas:2012:CAO,
author = "Evangelos Georganas and Jorge
Gonz{\'a}lez-Dom{\'\i}nguez and Edgar Solomonik and
Yili Zheng and Juan Touri{\~n}o and Katherine Yelick",
title = "Communication avoiding and overlapping for numerical
linear algebra",
crossref = "Hollingsworth:2012:SPI",
pages = "100:1--100:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a061.pdf",
abstract = "To efficiently scale dense linear algebra problems to
future exascale systems, communication cost must be
avoided or overlapped. Communication-avoiding 2.5D
algorithms improve scalability by reducing
inter-processor data transfer volume at the cost of
extra memory usage. Communication overlap attempts to
hide messaging latency by pipelining messages and
overlapping with computational work. We study the
interaction and compatibility of these two techniques
for two matrix multiplication algorithms (Cannon and
SUMMA), triangular solve, and Cholesky factorization.
For each algorithm, we construct a detailed performance
model that considers both critical path dependencies
and idle time. We give novel implementations of 2.5D
algorithms with overlap for each of these problems. Our
software employs UPC, a partitioned global address
space (PGAS) language that provides fast one-sided
communication. We show communication avoidance and
overlap provide a cumulative benefit as core counts
scale, including results using over 24K cores of a Cray
XE6 system.",
acknowledgement = ack-nhfb,
articleno = "100",
}
@InProceedings{Lipshitz:2012:CAP,
author = "Benjamin Lipshitz and Grey Ballard and James Demmel
and Oded Schwartz",
title = "Communication-avoiding parallel {Strassen}:
implementation and performance",
crossref = "Hollingsworth:2012:SPI",
pages = "101:1--101:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a063.pdf",
abstract = "Matrix multiplication is a fundamental kernel of many
high performance and scientific computing applications.
Most parallel implementations use classical O(n$^3$)
matrix multiplication, even though there exist
algorithms with lower arithmetic complexity. We
recently presented a new Communication-Avoiding
Parallel Strassen algorithm (CAPS), based on Strassen's
fast matrix multiplication, that minimizes
communication (SPAA '12). It communicates
asymptotically less than all classical and all previous
Strassen-based algorithms, and it attains theoretical
lower bounds. In this paper we show that CAPS is also
faster in practice. We benchmark and compare its
performance to previous algorithms on Hopper (Cray
XE6), Intrepid (IBM BG/P), and Franklin (Cray XT4). We
demonstrate significant speedups over previous
algorithms both for large matrices and for small
matrices on large numbers of processors. We model and
analyze the performance of CAPS and predict its
performance on future exascale platforms.",
acknowledgement = ack-nhfb,
articleno = "101",
}
@InProceedings{Avron:2012:MDM,
author = "Haim Avron and Anshul Gupta",
title = "Managing data-movement for effective shared-memory
parallelization of out-of-core sparse solvers",
crossref = "Hollingsworth:2012:SPI",
pages = "102:1--102:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a062.pdf",
abstract = "Direct methods for solving sparse linear systems are
robust and typically exhibit good performance, but
often require large amounts of memory due to fill-in.
Many industrial applications use out-of-core techniques
to mitigate this problem. However, parallelizing sparse
out-of-core solvers poses some unique challenges
because accessing secondary storage introduces
serialization and I/O overhead. We analyze the
data-movement costs and memory versus parallelism
trade-offs in a shared-memory parallel out-of-core
linear solver for sparse symmetric systems. We propose
an algorithm that uses a novel memory management scheme
and adaptive task parallelism to reduce the
data-movement costs. We present experiments to show
that our solver is faster than existing out-of-core
sparse solvers on a single core, and is more scalable
than the only other known shared-memory parallel
out-of-core solver. This work is also directly
applicable at the node level in a distributed-memory
parallel scenario.",
acknowledgement = ack-nhfb,
articleno = "102",
}
@InProceedings{Faanes:2012:CCS,
author = "Greg Faanes and Abdulla Bataineh and Duncan Roweth and
Tom Court and Edwin Froese and Bob Alverson and Tim
Johnson and Joe Kopnick and Mike Higgins and James
Reinhard",
title = "{Cray Cascade}: a scalable {HPC} system based on a
{Dragonfly} network",
crossref = "Hollingsworth:2012:SPI",
pages = "103:1--103:9",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a079.pdf",
abstract = "Higher global bandwidth requirements for many
applications and lower network costs have motivated the
use of the Dragonfly network topology for high
performance computing systems. In this paper we present
the architecture of the Cray Cascade system, a
distributed memory system based on the Dragonfly [1]
network topology. We describe the structure of the
system, its Dragonfly network and the routing
algorithms. We describe a set of advanced features
supporting both mainstream high performance computing
applications and emerging global address space
programming models. We present a combination of
performance results from prototype systems and
simulation data for large systems. We demonstrate the
value of the Dragonfly topology and the benefits
obtained through extensive use of adaptive routing.",
acknowledgement = ack-nhfb,
articleno = "103",
}
@InProceedings{Makino:2012:GAG,
author = "Junichiro Makino and Hiroshi Daisaka",
title = "{GRAPE-8}: an accelerator for gravitational {$N$}-body
simulation with {20.5Gflops/W} performance",
crossref = "Hollingsworth:2012:SPI",
pages = "104:1--104:10",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a081.pdf",
abstract = "In this paper, we describe the design and performance
of GRAPE-8 accelerator processor for gravitational N
-body simulations. It is designed to evaluate
gravitational interaction with cutoff between
particles. The cutoff function is useful for schemes
like TreePM or Particle-Particle Particle-Tree, in
which the gravitational force is divided into short-range and
long-range components. A single GRAPE-8 processor chip
integrates 48 pipeline processors. The effective number
of floating-point operations per interaction is around
40. Thus the peak performance of a single GRAPE-8
processor chip is 480 Gflops. A GRAPE-8 processor card
houses two GRAPE-8 chips and one FPGA chip for
PCI-Express interface. The total power consumption of
the board is 46 W. Thus, the theoretical peak performance
per watt is 20.5 Gflops/W. The effective performance of
the total system, including the host computer, is
around 5 Gflops/W. This is more than a factor of two
higher than the highest number in the current Green500
list.",
acknowledgement = ack-nhfb,
articleno = "104",
}
@InProceedings{Thorson:2012:SUF,
author = "Greg Thorson and Michael Woodacre",
title = "{SGI UV2}: a fused computation and data analysis
machine",
crossref = "Hollingsworth:2012:SPI",
pages = "105:1--105:9",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a080.pdf",
abstract = "UV2 is SGI's second generation data fusion system. UV2
was designed to meet the latest challenges facing users
in computation and data analysis. Its unique ability to
perform both functions on a single platform enables
efficient, easy to manage workflows. This platform has
a hybrid infrastructure, leveraging the latest
Intel\reg{} EP processors, providing industry-leading
computational power. Due to its high-bandwidth,
extremely low-latency NUMALink\reg{}6 (NL6)
interconnect, plus vectorized synchronization and data
movement, UV2 provides industry-leading data-intensive
capability. It supports a single operating system (OS)
image up to 64TB and 4K threads. Multiple OS images can
be deployed on a single NL6 fabric, which has a single
flat address space up to 8PB and 256K threads. These
capabilities allow for extreme performance on a broad
range of programming models and languages including
OpenMP[1], MPI, UPC[2], CAF[3] and SHMEM. The
architecture, implementation and performance of UV2 are
detailed.",
acknowledgement = ack-nhfb,
articleno = "105",
}
@Proceedings{Hollingsworth:2012:SPI,
editor = "Jeffrey Hollingsworth",
booktitle = "{SC '12: Proceedings of the International Conference
on High Performance Computing, Networking, Storage and
Analysis, Salt Lake Convention Center, Salt Lake City,
UT, USA, November 10--16, 2012}",
title = "{SC '12: Proceedings of the International Conference
on High Performance Computing, Networking, Storage and
Analysis, Salt Lake Convention Center, Salt Lake City,
UT, USA, November 10--16, 2012}",
publisher = pub-IEEE,
address = pub-IEEE:adr,
year = "2012",
ISBN = "1-4673-0804-8",
ISBN-13 = "978-1-4673-0804-5",
bibdate = "Thu Nov 15 07:35:55 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
acknowledgement = ack-nhfb,
}