%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.02", %%% date = "13 May 2011", %%% time = "18:06:48 MDT", %%% filename = "supercomputing2003.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "http://www.math.utah.edu/~beebe", %%% checksum = "46413 2188 12588 123939", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "BibTeX, bibliography, SC2003, Supercomputing %%% 2003", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a complete bibliography of papers %%% published in the proceedings of %%% Supercomputing '2003. %%% %%% The conference World-Wide Web site is %%% %%% http://www.sc-conference.org/sc2003/ %%% %%% The organizers of this conference series %%% maintain a World-Wide Web site at %%% %%% http://www.supercomp.org/ %%% %%% where pointers to Web pages for the %%% conferences from 1988 to date may be found. %%% %%% At version 1.02, the year coverage looked %%% like this: %%% %%% 2003 ( 61) %%% %%% InProceedings: 60 %%% Proceedings: 1 %%% %%% Total entries: 61 %%% %%% In this bibliography, entries are sorted in %%% order of PDF file numbers. %%% %%% The on-line electronic proceedings do not %%% contain sequential page numbers, although %%% there is an ISBN assigned for the %%% proceedings. A pagecount field is given with %%% each entry, extracted from the PDF file: some %%% of the articles lack page numbers altogether, %%% others number pages 1, 2, 3, ... %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility.", %%% } %%% ==================================================================== @Preamble{ "\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi" } %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% Publishers and their addresses: @String{pub-ACM = "ACM Press"} @String{pub-ACM:adr = "New York, NY 10036, USA"} @String{pub-IEEE = "IEEE Computer Society Press"} @String{pub-IEEE:adr = "1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA"} %%% ==================================================================== %%% Bibliography entries. 
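%%% Usage sketch: the entries below cite the proceedings volume through
%%% BibTeX's crossref mechanism (crossref = "ACM:2003:SII"); BibTeX
%%% requires such a cross-referenced @Proceedings entry to appear later
%%% in the bibliography file than the entries that reference it. A
%%% minimal LaTeX driver for this file might look as follows (a sketch,
%%% assuming the file is visible to BibTeX as supercomputing2003.bib;
%%% the citation key is one of the entries below):
%%%
%%%    \documentclass{article}
%%%    \begin{document}
%%%    Self-organizing storage clusters are discussed
%%%    in \cite{Tang:2003:EDL}.
%%%    \bibliographystyle{plain}
%%%    \bibliography{supercomputing2003}
%%%    \end{document}
%%% ====================================================================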
@InProceedings{Tang:2003:EDL, author = "Hong Tang and Tao Yang", title = "An Efficient Data Location Protocol for Self-organizing Storage Clusters", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10686#0; http://www.sc-conference.org/sc2003/paperpdfs/pap299.pdf", abstract = "Component additions and failures are common for large-scale storage clusters in production environments. To improve availability and manageability, we investigate and compare data location schemes for a large self-organizing storage cluster that can quickly adapt to the additions or departures of storage nodes. We further present an efficient location scheme that differentiates between small and large file blocks for reduced management overhead compared to uniform strategies. In our protocol, small blocks, which are typically in large quantities, are placed through consistent hashing. Large blocks, far fewer in practice, are placed through a usage-based policy, and their locations are tracked by Bloom filters. The proposed scheme results in improved storage utilization even with non-uniform cluster nodes. To achieve high scalability and fault resilience, this protocol is fully distributed, relies only on soft state, and supports data replication. We demonstrate the effectiveness and efficiency of this protocol through trace-driven simulation.", acknowledgement = ack-nhfb, }
@InProceedings{Wu:2003:HHS, author = "Changxun Wu and Randal Burns", title = "Handling Heterogeneity in Shared-Disk File Systems", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10686#1; http://www.sc-conference.org/sc2003/paperpdfs/pap132.pdf", abstract = "We develop and evaluate a system for load management in shared-disk file systems built on clusters of heterogeneous computers. The system generalizes load balancing and server provisioning. It balances file metadata workload by moving file sets among cluster server nodes. It also responds to changing server resources that arise from failure and recovery and from dynamically adding or removing servers. The system is adaptive and self-managing. It operates without any a priori knowledge of workload properties or the capabilities of the servers. Rather, it continuously tunes load placement using a technique called adaptive, non-uniform (ANU) randomization. ANU randomization realizes the scalability and metadata reduction benefits of hash-based, randomized placement techniques. It also avoids hashing's drawbacks: load skew, inability to cope with heterogeneity, and lack of tunability. Simulation results show that our load-management algorithm performs comparably to a prescient algorithm.", acknowledgement = ack-nhfb, }
@InProceedings{Nagaraja:2003:QIA, author = "Kiran Nagaraja and Neeraj Krishnan and Ricardo Bianchini and Richard P. Martin and Thu D. Nguyen", title = "Quantifying and Improving the Availability of High-Performance Cluster-Based {Internet} Services", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10686#2; http://www.sc-conference.org/sc2003/paperpdfs/pap213.pdf", abstract = "Cluster-based servers can substantially increase performance when nodes cooperate to globally manage resources.
However, in this paper we show that cooperation results in a substantial availability loss in the absence of high-availability mechanisms. Specifically, we show that a sophisticated cluster-based Web server, which gains a factor of 3 in performance through cooperation, increases service unavailability by a factor of 10 over a non-cooperative version. We then show how to augment this Web server with software components embodying a small set of high-availability techniques to regain the lost availability. Among other interesting observations, we show that the application of multiple high-availability techniques, each implemented independently in its own subsystem, can lead to inconsistent recovery actions. We also show that a novel technique called Fault Model Enforcement can be used to resolve such inconsistencies. Augmenting the server with these techniques led to a final expected availability of close to 99.99\%.", acknowledgement = ack-nhfb, }
@InProceedings{Roth:2003:MSB, author = "Philip C. Roth and Dorian C. Arnold and Barton P. Miller", title = "{MRNet}: {A} Software-Based Multicast\slash Reduction Network for Scalable Tools", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10687#0; http://www.sc-conference.org/sc2003/paperpdfs/pap192.pdf", abstract = "We present MRNet, a software-based multicast/reduction network for building scalable performance and system administration tools. MRNet supports multiple simultaneous, asynchronous collective communication operations. MRNet is flexible, allowing tool builders to tailor its process network topology to suit their tool's requirements and the underlying system's capabilities. MRNet is extensible, allowing tool builders to incorporate custom data reductions to augment its collection of built-in reductions. We evaluated MRNet in a simple test tool and also integrated it into an existing, real-world performance tool with up to 512 tool back-ends. In the real-world tool, we used MRNet not only for multicast and simple data reductions but also with custom histogram and clock skew detection reductions. In our experiments, the MRNet-based tools showed significantly better performance than the tools without MRNet for average message latency and throughput, overall tool start-up latency, and performance data processing throughput.", acknowledgement = ack-nhfb, keywords = "aggregation; scalability; tools; multicast; reduction", }
@InProceedings{Miller:2003:TDP, author = "Barton Miller and Ana Cort{\'e}s and Miquel Senar and Miron Livny", title = "The {Tool Daemon Protocol}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10687#1; http://www.sc-conference.org/sc2003/paperpdfs/pap187.pdf", abstract = "Run-time tools are crucial to program development. In our desktop computer environments, we take for granted the availability of tools for operations such as debugging, profiling, tracing, checkpointing, and visualization. When programs move into distributed or Grid environments, it is difficult to find such tools. This difficulty is caused by the complex interactions necessary between the application program, the operating system, and layers of job scheduling and process management software.
As a result, each run-time tool must be individually ported to run under a particular job management system; for $m$ tools and $n$ environments, the problem becomes an $m \times n$ effort, rather than the hoped-for $m + n$ effort. Variations in underlying operating systems can make this problem even worse. The consequence of this situation is a paucity of tools in distributed and Grid computing environments. In response to the problem, we have analyzed a variety of job scheduling environments and run-time tools to better understand their interactions. From this analysis, we isolated what we believe are the essential interactions between the run-time tool, job scheduler and resource manager, and application program. We are proposing a standard interface, called the Tool D{\ae}mon Protocol (TDP), that codifies these interactions and provides the necessary communication functions. We have implemented a pilot TDP library and experimented with Parador, a prototype that uses the Paradyn Parallel Performance tools to profile jobs running under the Condor batch-scheduling environment.", acknowledgement = ack-nhfb, }
@InProceedings{Yang:2003:CSU, author = "Lingyun Yang and Jennifer M. Schopf and Ian Foster", title = "Conservative Scheduling: Using Predicted Variance to Improve Scheduling Decisions in Dynamic Environments", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10687#2; http://www.sc-conference.org/sc2003/paperpdfs/pap236.pdf", abstract = "In heterogeneous and dynamic environments, efficient execution of parallel computations can require mappings of tasks to processors whose performance is both irregular (because of heterogeneity) and time-varying (because of dynamicity). While adaptive domain decomposition techniques have been used to address heterogeneous resource capabilities, temporal variations in those capabilities have seldom been considered. We propose a conservative scheduling policy that uses information about expected future variance in resource capabilities to produce more efficient data mapping decisions. We first present techniques, based on time series predictors that we developed in previous work, for predicting CPU load at some future time point, average CPU load for some future time interval, and variation of CPU load over some future time interval. We then present a family of stochastic scheduling algorithms that exploit such predictions of future availability and variability when making data mapping decisions. Finally, we describe experiments in which we apply our techniques to an astrophysics application. The results of these experiments demonstrate that conservative scheduling can produce execution times that are both significantly faster and less variable than other techniques.", acknowledgement = ack-nhfb, }
@InProceedings{Ding:2003:CAI, author = "Yonghua Ding and Zhiyuan Li", title = "A Compiler Analysis of Interprocedural Data Communication", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10692#0; http://www.sc-conference.org/sc2003/paperpdfs/pap137.pdf", abstract = "This paper presents a compiler analysis for data communication for the purpose of transforming ordinary programs into ones that run on distributed systems.
Such transformations have been used for process migration and computation offloading to improve the performance of mobile computing devices. In a client-server distributed environment, the efficiency of an application can be improved by careful partitioning of tasks between the server and the client. Optimal task partitioning depends on the tradeoff between the computation workload and the communication cost. Our compiler analysis, assisted by a minimum set of user assertions, estimates the amount of data communication between procedures. The paper also presents experimental results based on an implementation in the GCC compiler. The static estimates for several multimedia programs are compared against dynamic measurements performed using Shade, Sun Microsystems' instruction-level simulator. The results show high precision of the static analysis for most pairs of procedures.", acknowledgement = ack-nhfb, }
@InProceedings{Chauhan:2003:ATD, author = "Arun Chauhan and Cheryl McCosh and Ken Kennedy and Richard Hanson", title = "Automatic Type-Driven Library Generation for Telescoping Languages", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10692#1; http://www.sc-conference.org/sc2003/paperpdfs/pap296.pdf", abstract = "Telescoping languages is a strategy to automatically generate highly-optimized domain-specific libraries. The key idea is to create specialized variants of library procedures through extensive offline processing. This paper describes a telescoping system, called ARGen, which generates high-performance Fortran or C libraries from prototype Matlab code for the linear algebra library, ARPACK. ARGen uses variable types to guide procedure specializations on possible calling contexts.\par ARGen needs to infer Matlab types in order to speculate on the possible variants of library procedures, as well as to generate code. This paper shows that our type-inference system is powerful enough to generate all the variants needed for ARPACK automatically from the Matlab development code. The ideas demonstrated here provide a basis for building a more general telescoping system for Matlab.", acknowledgement = ack-nhfb, }
@InProceedings{Du:2003:CSE, author = "Wei Du and Renato Ferreira and Gagan Agrawal", title = "Compiler Support for Exploiting Coarse-Grained Pipelined Parallelism", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10692#2; http://www.sc-conference.org/sc2003/paperpdfs/pap133.pdf", abstract = "The emergence of grid computing and a new class of data-driven applications is making a new form of parallelism desirable, which we refer to as coarse-grained pipelined parallelism. This paper reports on a compilation system developed to exploit this form of parallelism. We use a dialect of Java that exposes both pipelined and data parallelism to the compiler. Our compiler is responsible for selecting a set of candidate filter boundaries, determining the volume of communication required if a particular boundary is chosen, performing the decomposition, and generating code. We have developed a one-pass algorithm for determining the required communication between consecutive filters.
We have developed a cost model for estimating the execution time for a given decomposition, and a dynamic programming algorithm for performing the decomposition. A detailed evaluation of our current compiler using four data-driven applications demonstrates the feasibility of our approach.", acknowledgement = ack-nhfb, }
@InProceedings{Lu:2003:SRC, author = "Dong Lu and Peter August Dinda", title = "Synthesizing Realistic Computational Grids", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10690#0; http://www.sc-conference.org/sc2003/paperpdfs/pap180.pdf", abstract = "Realistic workloads are essential in evaluating middleware for computational grids. One important component is the raw grid itself: a network topology graph annotated with the hardware and software available on each node and link. This paper defines our requirements for grid generation and presents GridG, our extensible generator. We describe GridG in two steps: topology generation and annotation. For topology generation, we have both model and mechanism. We extend Tiers, an existing tool from the networking community, to produce graphs that obey recently discovered power laws of Internet topology. We also contribute to network topology theory by illustrating a contradiction between two laws and proposing a new version of one of them. For annotation, GridG captures intra- and inter-host correlations between attributes using conditional probability rules. We construct a set of rules, including one based on empirical evidence of OS concentration in subnets, that produce sensible host annotations.", acknowledgement = ack-nhfb, }
@InProceedings{Liu:2003:TBL, author = "Xin Liu and Andrew A. Chien", title = "Traffic-based Load Balance for Scalable Network Emulation", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10690#1; http://www.sc-conference.org/sc2003/paperpdfs/pap260.pdf", abstract = "Load balance is critical to achieving scalability for large network emulation studies, which are of compelling interest for emerging Grid, Peer-to-Peer, and other distributed applications and middleware. Achieving load balance in emulation is difficult because of irregular network structure and unpredictable network traffic. We formulate load balance as a graph partitioning problem and apply classical graph partitioning algorithms to it. The primary challenge in this approach is how to extract useful information from the network emulation and present it to the graph partitioning algorithms in a way that reflects the load balance requirement in the original emulation problem. Using a large-scale network emulation system called MaSSF, we explore three approaches for partitioning, based on purely static topology information (TOP), combining topology and application placement information (PLACE), and combining topology and application profile data (PROFILE). These studies show that exploiting static topology and application placement information can achieve reasonable load balance, but a profile-based approach further improves load balance, even for large-scale network emulation.
In our experiments, PROFILE improves load balance by 50\% to 66\% and reduces emulation time by up to 50\% compared to purely static topology-based approaches.", acknowledgement = ack-nhfb, }
@InProceedings{Butt:2003:SOF, author = "Ali Raza Butt and Rongmei Zhang and Y. Charlie Hu", title = "A Self-Organizing Flock of {Condors}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10690#2; http://www.sc-conference.org/sc2003/paperpdfs/pap265.pdf", abstract = "Condor provides high throughput computing by leveraging idle cycles on off-the-shelf desktop machines. It also supports flocking, a mechanism for sharing resources among Condor pools. Since Condor pools distributed over a wide area can have dynamically changing availability and sharing preferences, the current flocking mechanism based on static configurations can limit the potential of sharing resources across Condor pools. This paper presents a technique for resource discovery in distributed Condor pools using peer-to-peer mechanisms that are self-organizing, fault-tolerant, scalable, and locality-aware. Locality-awareness guarantees that applications are not shipped across long distances when nearby resources are available. Measurements using a synthetic job trace show that self-organized flocking reduces the maximum job wait time in queue for a heavily loaded pool by a factor of 10 compared to the case without flocking. Simulations of 1000 Condor pools are also presented, and the results confirm that our technique discovers and utilizes nearby resources in the physical network.", acknowledgement = ack-nhfb, }
@InProceedings{Olson:2003:EEU, author = "Ryan M. Olson and Michael W. Schmidt and Mark S. Gordon and Alistair P. Rendell", title = "Enabling the Efficient Use of {SMP} Clusters: The {GAMESS\slash DDI} Model", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10691#0; http://www.sc-conference.org/sc2003/paperpdfs/pap263.pdf", abstract = "An important advance in cluster computing is the evolution from single processor clusters to multiprocessor SMP clusters. Due to the increased complexity in the memory model on SMP clusters, new approaches are needed for applications that make use of distributed-memory paradigms. This paper presents new communications software developments that are designed to take advantage of SMP cluster hardware. Although the specific focus is on the central field of computational chemistry and materials science, as embodied in the popular electronic structure package GAMESS (General Atomic and Molecular Electronic Structure System), the impact of these new developments will be far broader in scope. Following a summary of the essential features of the distributed data interface (DDI) in the current implementation of GAMESS, the new developments for SMP clusters are described.
The advantages of these new features are illustrated with timing benchmarks on several hardware platforms, using a typical computational chemistry application.", acknowledgement = ack-nhfb, }
@InProceedings{Ding:2003:RVB, author = "Jin Ding and Jian Huang and Micah Beck and Shaotao Liu and Terry Moore and Stephen Soltesz", title = "Remote Visualization by Browsing Image Based Databases with Logistical Networking", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10691#1; http://www.sc-conference.org/sc2003/paperpdfs/pap245.pdf", abstract = "The need to provide remote visualization of large datasets with adequate levels of quality and interactivity has become a major impediment to distributed collaboration in Computational Science. Although Image Based Rendering (IBR) techniques based on plenoptic functions have some important advantages over other approaches to this problem, they suffer from an inability to deal with issues of network latency and server load, due to the large size of the IBR databases they generate. Consequently, IBR techniques have been left largely unexplored for this purpose. In this paper we describe strategies for addressing these obstacles using Logistical Networking (LoN), a new and highly scalable approach to deploying storage as a shared communication resource. Leveraging LoN technology and infrastructure, we developed a remote visualization system based on concepts of light field rendering, an IBR method using a 4-D plenoptic function. Our system extends existing work on light fields by employing a modified method of parameterization and data organization that supports more efficient prefetching, caching, and lossless compression. Using this approach, we have been able to interactively browse multi-gigabyte, high-resolution light field databases across the wide area network at 30 frames per second.", acknowledgement = ack-nhfb, }
@InProceedings{Ma:2003:VVL, author = "Kwan-Liu Ma and Aleksander Stompel and Jacobo Bielak and Omar Ghattas and Eui Joong Kim", title = "Visualizing Very Large-Scale Earthquake Simulations", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10691#2; http://www.sc-conference.org/sc2003/paperpdfs/pap287.pdf", abstract = "This paper presents a parallel adaptive rendering algorithm and its performance for visualizing time-varying unstructured volume data generated from large-scale earthquake simulations. The objective is to visualize 3D seismic wave propagation generated from a 0.5 Hz simulation of the Northridge earthquake, which is the highest resolution volume visualization of an earthquake simulation performed to date. The scalable, high-fidelity visualization solution we provide allows scientists to explore the temporal, spatial, and visualization domains of their data at high resolution. This new high-resolution explorability, likely not presently available to most computational science groups, will help lead to many new insights.
The performance study we have conducted on a massively parallel computer operated at the Pittsburgh Supercomputing Center helps direct our design of a simulation-time visualization strategy for the higher-resolution 1 Hz and 2 Hz simulations.", acknowledgement = ack-nhfb, keywords = "earthquake modeling; high-performance computing; massively parallel supercomputing; scientific visualization; parallel rendering; time-varying data; unstructured grids; volume rendering; wave propagation", }
@InProceedings{Liu:2003:PCM, author = "Jiuxing Liu and Balasubramanian Chandrasekaran and Jiesheng Wu and Weihang Jiang and Sushmitha Kini and Weikuan Yu and Darius Buntinas and Pete Wyckoff and D. K. Panda", title = "Performance Comparison of {MPI} Implementations over {InfiniBand}, {Myrinet} and {Quadrics}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#0; http://www.sc-conference.org/sc2003/paperpdfs/pap310.pdf", abstract = "In this paper, we present a comprehensive performance comparison of MPI implementations over InfiniBand, Myrinet and Quadrics. Our performance evaluation consists of two major parts. The first part consists of a set of MPI level micro-benchmarks that characterize different aspects of MPI implementations. The second part of the performance evaluation consists of application level benchmarks. We have used the NAS Parallel Benchmarks and the sweep3D benchmark. We not only present the overall performance results, but also relate application communication characteristics to the information we acquired from the micro-benchmarks. Our results show that the three MPI implementations all have their advantages and disadvantages. For our 8-node cluster, InfiniBand can offer significant performance improvements for a number of applications compared with Myrinet and Quadrics when using the PCI-X bus. Even with just the PCI bus, InfiniBand can still perform better if the applications are bandwidth-bound.", acknowledgement = ack-nhfb, }
@InProceedings{Bouteiller:2003:MVF, author = "Aurelien Bouteiller and Franck Cappello and Thomas Herault and Geraud Krawezik and Pierre Lemarinier and Frederic Magniette", title = "{MPICH-V2}: a Fault Tolerant {MPI} for Volatile Nodes based on Pessimistic Sender Based Message Logging", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#1; http://www.sc-conference.org/sc2003/paperpdfs/pap209.pdf", abstract = "Execution of MPI applications on clusters and Grid deployments suffering from node and network failures motivates the use of fault tolerant MPI implementations. We present MPICH-V2 (the second protocol of the MPICH-V project), an automatic fault tolerant MPI implementation using an innovative protocol that removes the most limiting factor of the pessimistic message logging approach: reliable logging of in-transit messages. MPICH-V2 relies on uncoordinated checkpointing, sender-based message logging, and remote reliable logging of message logical clocks. This paper presents the architecture of MPICH-V2, its theoretical foundation, and the performance of the implementation. We compare MPICH-V2 to MPICH-V1 and MPICH-P4, evaluating (a) its point-to-point performance, (b) the performance for the NAS benchmarks, and (c) the application performance when many faults occur during the execution.
Experimental results demonstrate that MPICH-V2 provides performance close to MPICH-P4 for applications using large messages while dramatically reducing the number of reliable nodes compared to MPICH-V1.", acknowledgement = ack-nhfb, }
@InProceedings{Kleban:2003:HDI, author = "Stephen D. Kleban and Scott H. Clearwater", title = "Hierarchical Dynamics, Interarrival Times, and Performance", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#2; http://www.sc-conference.org/sc2003/paperpdfs/pap222.pdf", abstract = "We report on a model of the distribution of job submission interarrival times in supercomputers. Interarrival times are modeled as a consequence of a complicated set of decisions between users, the queuing algorithm, and other policies. This cascading hierarchy of decision-making processes leads to a particular kind of heavy-tailed distribution. Specifically, hierarchically constrained systems suggest that fatter tails are due to more levels coming into play in the overall decision-making process. The key contribution of this paper is that heavier tails resulting from more complex decision-making processes, that is, more hierarchical levels, lead to worse overall performance, even when the average interarrival time is the same. Finally, we offer some suggestions for how to overcome these issues and the tradeoffs involved.", acknowledgement = ack-nhfb, keywords = "hierarchy; relaxation process; interarrival; ASCI queueing; dynamics", }
@InProceedings{Adams:2003:AAM, author = "Mark F. Adams and Harun H. Bayraktar and Tony M. Keaveny and Panayiotis Papadopoulos", title = "Applications of Algebraic Multigrid to Large-Scale Finite Element Analysis of Whole Bone Micro-Mechanics on the {IBM SP}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10694#0; http://www.sc-conference.org/sc2003/paperpdfs/pap211.pdf", abstract = "Accurate micro-finite element analyses of whole bones require the solution of large sets of algebraic equations. Multigrid has proven to be an effective approach to the design of highly scalable linear solvers for solid mechanics problems. We present some of the first applications of scalable linear solvers, on massively parallel computers, to whole vertebral body structural analysis. We analyze the performance of our algebraic multigrid (AMG) methods on problems with over 237 million degrees of freedom on IBM SP parallel computers.
We demonstrate excellent parallel scalability, both in the algorithms and the implementations, and analyze the nodal performance of the important AMG kernels on the IBM Power3 and Power4 architectures.", acknowledgement = ack-nhfb, keywords = "multigrid; trabecular bone; human vertebral body; finite element method; massively parallel computing", }
@InProceedings{Wang:2003:PMS, author = "Kai Wang and Jun Zhang and Chi Shen", title = "Parallel Multilevel Sparse Approximate Inverse Preconditioners in Large Sparse Matrix Computations", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10694#1; http://www.sc-conference.org/sc2003/paperpdfs/pap109.pdf", abstract = "We investigate the use of multistep successive preconditioning strategies (MSP) to construct a class of parallel multilevel sparse approximate inverse (SAI) preconditioners. We do not use independent set ordering, but a diagonal dominance based matrix permutation to build a multilevel structure. The purpose of introducing multilevel structure into SAI is to enhance the robustness of SAI for solving difficult problems. Forward and backward preconditioning iteration and two Schur complement preconditioning strategies are proposed to improve the performance and to reduce the storage cost of the multilevel preconditioners. One version of the parallel multilevel SAI preconditioner based on the MSP strategy is implemented. Numerical experiments for solving a few sparse matrices on a distributed memory parallel computer are reported.", acknowledgement = ack-nhfb, }
@InProceedings{Qiang:2003:PPC, author = "Ji Qiang and Miguel A. Furman and Robert D. Ryne", title = "Parallel Particle-In-Cell Simulation of Colliding Beams in High Energy Accelerators", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10694#2; http://www.sc-conference.org/sc2003/paperpdfs/pap223.pdf", abstract = "In this paper we present a self-consistent simulation model of colliding beams in high energy accelerators. The model, which is based on a particle-in-cell method, uses a newly developed shifted-Green function algorithm for the efficient calculation of the beam-beam interaction. The model uses transfer maps to treat the external focusing elements and a stochastic map to treat radiation damping and quantum excitation of the beams. In the parallel implementation we studied various strategies to deal with the particular nature of the colliding beam system --- a system in which there can be significant particle movement between beam-beam collisions. We chose a particle-field decomposition approach instead of the conventional domain decomposition or particle decomposition approach. The particle-field approach leads to good load balance and reduced communication cost, and shows the best scalability on an IBM SP3 among the three parallel implementations we studied. A performance test of the beam-beam model on a Cray T3E, IBM SP3, and a PC cluster is presented.
As an application, we studied the effect of long-range collisions on antiproton lifetime in the Fermilab Tevatron.", acknowledgement = ack-nhfb, }
@InProceedings{Dinda:2003:NQR, author = "Peter Dinda and Dong Lu", title = "Nondeterministic Queries in a Relational Grid Information Service", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10695#0; http://www.sc-conference.org/sc2003/paperpdfs/pap146.pdf", abstract = "A Grid Information Service (GIS) stores information about the resources of a distributed computing environment and answers questions about it. We are developing RGIS, a GIS system based on the relational data model. RGIS users can write SQL queries that search for complex compositions of resources that meet collective requirements. Executing these queries can be very expensive, however. In response, we introduce the nondeterministic query, an extension to the SELECT statement, which allows the user (and RGIS) to trade off between the query's running time and the number of results. The results are a random sample of the deterministic results, which we argue is sufficient and appropriate. Herein we describe RGIS, the nondeterministic query extension, and its implementation. Our evaluation shows that a meaningful tradeoff between query time and results returned is achievable, and that the tradeoff can be used to keep query time largely independent of query complexity.", acknowledgement = ack-nhfb, }
@InProceedings{Kurc:2003:ORC, author = "Tahsin Kurc and Feng Lee and Gagan Agrawal and Umit Catalyurek and Renato Ferreira and Joel Saltz", title = "Optimizing Reduction Computations in a Distributed Environment", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10695#1; http://www.sc-conference.org/sc2003/paperpdfs/pap135.pdf", abstract = "We investigate runtime strategies for data-intensive applications that involve generalized reductions on large, distributed datasets. Our set of strategies includes replicated filter state, partitioned filter state, and hybrid options between these two extremes. We evaluate these strategies using emulators of three real applications, different query and output sizes, and a number of configurations. We consider execution in a homogeneous cluster and in a distributed environment where only a subset of nodes host the data. Our results show that replicating the filter state scales well and outperforms other schemes if sufficient memory is available and sufficient computation is involved to offset the cost of the global merge step. In other cases, hybrid is usually the best. Moreover, in almost all cases, the performance of the hybrid strategy is quite close to the best strategy.
Thus, we believe that hybrid is an attractive approach when the relative performance of different schemes cannot be predicted.", acknowledgement = ack-nhfb, }
@InProceedings{Shan:2003:JSA, author = "Hongzhang Shan and Leonid Oliker and Rupak Biswas", title = "Job Superscheduler Architecture and Performance in Computational Grid Environments", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10695#2; http://www.sc-conference.org/sc2003/paperpdfs/pap267.pdf", abstract = "Computational grids hold great promise in utilizing geographically separated heterogeneous resources to solve large-scale complex scientific problems. However, a number of major technical hurdles, including distributed resource management and effective job scheduling, stand in the way of realizing these gains. In this paper, we propose a novel grid superscheduler architecture and three distributed job migration algorithms. We also model the critical interaction between the superscheduler and autonomous local schedulers. Extensive performance comparisons with ideal, central, and local schemes using real workloads from leading computational centers are conducted in a simulation environment. Additionally, synthetic workloads are used to perform a detailed sensitivity analysis of our superscheduler. Several key metrics demonstrate that substantial performance gains can be achieved via smart superscheduling in distributed computational grids.", acknowledgement = ack-nhfb, }
@InProceedings{Jaganathan:2003:CNP, author = "Ranjesh G. Jaganathan and Keith D. Underwood and Ron R. Sass", title = "A Configurable Network Protocol for Cluster Based Communications using Modular Hardware Primitives on an Intelligent {NIC}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10702#0; http://www.sc-conference.org/sc2003/paperpdfs/pap199.pdf", abstract = "The high overhead of generic protocols like TCP/IP provides strong motivation for the development of a better protocol architecture for cluster-based parallel computers. Reconfigurable computing has a unique opportunity to contribute hardware-level protocol acceleration while retaining the flexibility to adapt to changing needs. Specifically, applications on a cluster have various quality-of-service needs. In addition, these applications typically run for a long time relative to the reconfiguration time of an FPGA. Thus, it is possible to provide application-specific protocol processing to improve performance and reduce space utilization. Reducing space utilization permits the use of a greater portion of the FPGA for other application-specific processing. This paper focuses on work to create a set of parameterizable components that can be put together as needed to obtain a customized protocol for each application. To study the feasibility of such an architecture, hardware components were built that can be stitched together as needed to provide the required functionality. Feasibility is demonstrated using four different protocol configurations, namely: (1) unreliable packet transfer; (2) reliable, unordered message transfer without duplicate elimination; (3) reliable, unordered message transfer with duplicate elimination; and (4) reliable, ordered message transfer with duplicate elimination.
The different configurations illustrate trade-offs between chip space and functionality.", acknowledgement = ack-nhfb, }
@InProceedings{Feng:2003:OGE, author = "Wu-chun Feng and Justin Hurwitz and Harvey B. Newman and Sylvain Ravot and Roger Les Cottrell and Olivier Martin and Fabrizio Coccetti and Cheng Jin and David Wei and Steven Low", title = "Optimizing 10-Gigabit {Ethernet} in Networks of Workstations, Clusters, and Grids: {A} Case Study", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10702#1; http://www.sc-conference.org/sc2003/paperpdfs/pap293.pdf", abstract = "This paper presents a case study of the 10-Gigabit Ethernet (10GbE) adapter from Intel. Specifically, with appropriate optimizations to the configurations of the 10GbE adapter and TCP, we demonstrate that the 10GbE adapter can perform well in local-area, storage-area, system-area, and wide-area networks. For local-area, storage-area, and system-area networks in support of networks of workstations, network-attached storage, and clusters, respectively, we can achieve over 7 Gb/s end-to-end throughput and 12 $\mu$s end-to-end latency between applications running on Linux-based PCs. For the wide-area network in support of grids, we broke the recently-set Internet2 Land Speed Record by 2.5 times by sustaining an end-to-end TCP/IP throughput of 2.38 Gb/s between Sunnyvale, California and Geneva, Switzerland (i.e., 10,037 kilometers) to move over a terabyte of data in less than an hour. Thus, the above results indicate that 10GbE may be a cost-effective solution across a multitude of computing environments.", acknowledgement = ack-nhfb, }
@InProceedings{Coll:2003:SHB, author = "Salvador Coll and Jose Duato and Fabrizio Petrini and Francisco J. Mora", title = "Scalable Hardware-Based Multicast Trees", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10702#2; http://www.sc-conference.org/sc2003/paperpdfs/pap300.pdf", abstract = "This paper presents an algorithm for implementing optimal hardware-based multicast trees, on networks that provide hardware support for collective communication. Although the proposed methodology can be generalized to a wide class of networks, we apply our methodology to the Quadrics network, a state-of-the-art network that provides hardware-based multicast communication. The proposed mechanism is intended to improve the performance of the collective communication patterns on the network, in those cases where the hardware support cannot be directly used, for instance, due to some faulty nodes. This scheme provides a significant reduction in multicast latencies compared to the original system primitives, which use multicast trees based on unicast communication. A backtracking algorithm to find the optimal solution to the problem is presented. In addition, a greedy algorithm is presented and shown to provide near optimal solutions. Finally, our experimental results show the good performance and scalability of the proposed multicast tree in comparison to the traditional unicast-based multicast trees. Our multicast mechanism doubles barrier-synchronization and broadcast performance when compared to the production-level MPI library.", acknowledgement = ack-nhfb, }
@InProceedings{Balls:2003:SHS, author = "Gregory T. Balls and Scott B. Baden and Phillip Colella", title = "{SCALLOP}: {A} Highly Scalable Parallel {Poisson} Solver in Three Dimensions", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#0; http://www.sc-conference.org/sc2003/paperpdfs/pap201.pdf", abstract = "SCALLOP is a highly scalable solver and library for elliptic partial differential equations on regular block-structured domains. SCALLOP avoids high communication overheads algorithmically by taking advantage of the locality properties inherent to solutions to elliptic PDEs. Communication costs are small, on the order of a few percent of the total running time on up to 1024 processors of NPACI's and NERSC's IBM Power-3 SP systems. SCALLOP trades off numerical overheads against communication. These numerical overheads are independent of the number of processors for a wide range of problem sizes. SCALLOP is implicitly designed for infinite domain (free space) boundary conditions, but the algorithm can be reformulated to accommodate other boundary conditions. The SCALLOP library is built on top of the KeLP programming system and runs on a variety of platforms.", acknowledgement = ack-nhfb, keywords = "computation-intensive applications; parallel and distributed algorithms; program optimization and performance programming", }
@InProceedings{Nakajima:2003:PIS, author = "Kengo Nakajima", title = "Parallel Iterative Solvers of {GeoFEM} with Selective Blocking Preconditioning for Nonlinear Contact Problems on the {Earth Simulator}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#1; http://www.sc-conference.org/sc2003/paperpdfs/pap155.pdf", abstract = "An efficient parallel iterative method with selective blocking preconditioning has been developed for symmetric multiprocessor (SMP) cluster architectures with vector processors such as the Earth Simulator. This method is based on a three-level hybrid parallel programming model, which includes message passing for inter-SMP node communication, loop directives by OpenMP for intra-SMP node parallelization, and vectorization for each processing element (PE). This method provides robust and smooth convergence and excellent vector and parallel performance in 3D geophysical simulations with contact conditions performed on the Earth Simulator. The selective blocking preconditioning is much more efficient than ILU(1) and ILU(2). Performance for the complicated Southwest Japan model with more than 23 M DOF on 10 SMP nodes (80 PEs) of the Earth Simulator was 161.7 GFLOPS (25.3\% of the peak performance) for the hybrid programming model and 190.4 GFLOPS (29.8\% of the peak performance) for flat MPI.", acknowledgement = ack-nhfb, }
@InProceedings{Karypis:2003:MCM, author = "George Karypis", title = "Multi-Constraint Mesh Partitioning for Contact\slash Impact Computations", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#2; http://www.sc-conference.org/sc2003/paperpdfs/pap302.pdf", abstract = "We present a novel approach for decomposing contact/impact computations in which the mesh elements come in contact with each other during the course of the simulation.
Effective decomposition of these computations poses a number of challenges as it needs to both balance the computations and minimize the amount of communication that is performed during the finite element and the contact search phases. Our approach achieves the first goal by partitioning the underlying mesh such that it simultaneously balances both the work that is performed during the finite element phase and that performed during the contact search phase, while producing subdomains whose boundaries consist of piecewise axis-parallel lines or planes. The second goal is achieved by using a decision tree to decompose the space into rectangular or box-shaped regions that contain contact points from a single partition. Our experimental evaluation on a sequence of 100 meshes shows that this new approach can reduce the overall communication overhead over existing algorithms.", acknowledgement = ack-nhfb, }
@InProceedings{Akcelik:2003:HRF, author = "Volkan Akcelik and Jacobo Bielak and George Biros and Ioannis Epanomeritakis and Antonio Fernandez and Omar Ghattas and Eui Joong Kim and Julio Lopez and David O'Hallaron and Tiankai Tu and John Urbanic", title = "High Resolution Forward and Inverse Earthquake Modeling on Terascale Computers", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10707#0; http://www.sc-conference.org/sc2003/paperpdfs/pap298.pdf", abstract = "For earthquake simulations to play an important role in the reduction of seismic risk, they must be capable of high resolution and high fidelity. We have developed algorithms and tools for earthquake simulation based on multiresolution hexahedral meshes. We have used this capability to carry out 1 Hz simulations of the 1994 Northridge earthquake in the LA Basin using 100 million grid points. Our wave propagation solver sustains 1.21 teraflop/s for 4 hours on 3000 AlphaServer processors at 80\% parallel efficiency. Because of uncertainties in characterizing earthquake source and basin material properties, a critical remaining challenge is to invert for source and material parameter fields for complex 3D basins from records of past earthquakes. Towards this end, we present results for material and source inversion of high-resolution models of basins undergoing antiplane motion using parallel scalable inversion algorithms that overcome many of the difficulties particular to inverse heterogeneous wave propagation problems.", acknowledgement = ack-nhfb, }
@InProceedings{Kim:2003:IHP, author = "Seung Jo Kim and Chang Sung Lee and Jeong Ho Kim and Minsu Joh and Sangsan Lee", title = "{IPSAP}: {A} High-performance Parallel Finite Element Code for Large-scale Structural Analysis Based on Domain-wise Multifrontal Technique", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10707#1; http://www.sc-conference.org/sc2003/paperpdfs/pap238.pdf", abstract = "Most research on large-scale parallel structural analysis has focused on iterative solution methods, since direct solution methods generally have many difficulties and disadvantages for large-scale problems.
However, because the numerical robustness of direct methods guarantees that the solution is obtained within an estimated time, direct methods are much more desirable for general application of large-scale structural analysis, if these difficulties and disadvantages can be overcome. In this research, we propose the domain-wise multifrontal solver as an efficient direct solver that can overcome most of these difficulties and disadvantages. Using our own structural analysis code IPSAP, which employs the proposed solver, we can solve the largest problem ever solved by direct solvers and can sustain 191 Gflop/s with 256 CPUs on our self-made cluster system, Pegasus. By implementing the block Lanczos algorithm using our solver, IPSAP can solve eigenproblems with 7 million DOFs within one hour.", acknowledgement = ack-nhfb, }
@InProceedings{Ying:2003:NPK, author = "Lexing Ying and George Biros and Denis Zorin and Harper Langston", title = "A new parallel kernel-independent fast multipole method", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10707#2; http://www.sc-conference.org/sc2003/paperpdfs/pap166.pdf", abstract = "We present a new adaptive fast multipole algorithm and its parallel implementation. The algorithm is kernel-independent in the sense that the evaluation of pairwise interactions does not rely on any analytic expansions, but only utilizes kernel evaluations. The new method provides the enabling technology for many important problems in computational science and engineering. Examples include viscous flows, fracture mechanics and screened Coulombic interactions. Our MPI-based parallel implementation logically separates the computation and communication phases to avoid synchronization in the upward and downward computation passes, and thus allows us to fully exploit computation and communication overlapping. We measure isogranular and fixed-size scalability for a variety of kernels on the Pittsburgh Supercomputing Center's TCS-1 AlphaServer on up to 3000 processors. We have solved viscous flow problems with up to 2.1 billion unknowns and we have achieved 1.6 Tflops/s peak performance and 1.13 Tflops/s sustained performance.", acknowledgement = ack-nhfb, keywords = "Fast multipole methods; adaptive algorithms; massively parallel computing; boundary integral equations; N-body problems; viscous flows", }
@InProceedings{Petrini:2003:CMS, author = "Fabrizio Petrini and Darren J. Kerbyson and Scott Pakin", title = "The Case of the Missing Supercomputer Performance: Achieving Optimal Performance on the 8,192 Processors of {ASCI Q}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10706#0; http://www.sc-conference.org/sc2003/paperpdfs/pap301.pdf", abstract = "In this paper we describe how we improved the effective performance of ASCI Q, the world's second-fastest supercomputer, to meet our expectations. Using an arsenal of performance-analysis techniques including analytical models, custom microbenchmarks, full applications, and simulators, we succeeded in observing a serious --- but previously undetected --- performance problem. We identified the source of the problem, eliminated the problem, and ``closed the loop'' by demonstrating up to a factor of 2 improvement in application performance.
We present our methodology and provide insight into performance analysis that is immediately applicable to other large-scale supercomputers.", acknowledgement = ack-nhfb, }
@InProceedings{Dunigan:2003:EEC, author = "Thomas H. {Dunigan, Jr.} and Mark R. Fahey and James B. {White III} and Patrick H. Worley", title = "Early Evaluation of the {Cray X1}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10706#1; http://www.sc-conference.org/sc2003/paperpdfs/pap183.pdf", abstract = "Oak Ridge National Laboratory installed a 32-processor Cray X1 in March 2003, and will have a 256-processor system installed by October 2003. In this paper we describe our initial evaluation of the X1 architecture, focusing on microbenchmarks, kernels, and application codes that highlight the performance characteristics of the X1 architecture and indicate how to use the system most efficiently.", acknowledgement = ack-nhfb, }
@InProceedings{Oliker:2003:ECB, author = "Leonid Oliker and Andrew Canning and Jonathan Carter and John Shalf and David Skinner and Stephane Ethier and Rupak Biswas and Jahed Djomehri and Rob Van der Wijngaart", title = "Evaluation of Cache-based Superscalar and Cacheless Vector Architectures for Scientific Computations", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10706#2; http://www.sc-conference.org/sc2003/paperpdfs/pap255.pdf", abstract = "The growing gap between sustained and peak performance for scientific applications is a well-known problem in high-end computing. The recent development of parallel vector systems offers the potential to bridge this gap for many computational science codes and deliver a substantial increase in computing capabilities. This paper examines the intranode performance of the NEC SX-6 vector processor and the cache-based IBM Power3/4 superscalar architectures across a number of scientific computing areas. First, we present the performance of a microbenchmark suite that examines low-level machine characteristics. Next, we study the behavior of the NAS Parallel Benchmarks. Finally, we evaluate the performance of several scientific computing codes. Results demonstrate that the SX-6 achieves high performance on a large fraction of our applications and often significantly outperforms the cache-based architectures. However, certain applications are not easily amenable to vectorization and would require extensive algorithm and implementation reengineering to utilize the SX-6 effectively.", acknowledgement = ack-nhfb, }
@InProceedings{Kee:2003:POP, author = "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha", title = "{ParADE}: An {OpenMP} Programming Environment for {SMP} Cluster Systems", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#0; http://www.sc-conference.org/sc2003/paperpdfs/pap130.pdf", abstract = "Demand for programming environments to exploit clusters of symmetric multiprocessors (SMPs) is increasing. In this paper, we present a new programming environment, called ParADE, to enable easy, portable, and high-performance programming on SMP clusters.
It is an OpenMP programming environment on top of a multi-threaded software distributed shared memory (SDSM) system with a variant of the home-based lazy release consistency protocol. To boost performance, the runtime system provides explicit message-passing primitives to make it a hybrid programming environment. Collective communication primitives are used for the synchronization and work-sharing directives associated with small data structures, lessening the synchronization overhead and avoiding the implicit barriers of work-sharing directives. The OpenMP translator bridges the gap between the OpenMP abstraction and the hybrid programming interfaces of the runtime system. Experiments with several NAS benchmarks and applications on a Linux-based cluster show promising results: ParADE overcomes the performance problem of the conventional SDSM-based OpenMP environment.", acknowledgement = ack-nhfb, keywords = "programming environment; SMP cluster; software distributed shared memory; hybrid programming; OpenMP; MPI", } @InProceedings{Weatherly:2003:DMS, author = "D. Brent Weatherly and David K. Lowenthal and Mario Nakazawa and Franklin Lowenthal", title = "{Dyn-MPI}: Supporting {MPI} on Non Dedicated Clusters", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#1; http://www.sc-conference.org/sc2003/paperpdfs/pap126.pdf", abstract = "Distributing data is a fundamental problem in implementing efficient distributed-memory parallel programs. The problem becomes more difficult in environments where the participating nodes are not dedicated to a parallel application. We are investigating the data distribution problem in non dedicated environments in the context of explicit message-passing programs.\par To address this problem, we have designed and implemented an extension to MPI called Dynamic MPI (Dyn-MPI). The key component of Dyn-MPI is its run-time system, which efficiently and automatically redistributes data on the fly when there are changes in the application or the underlying environment. Dyn-MPI supports efficient memory allocation, precise measurement of system load and computation time, and node removal. Performance results show that programs that use Dyn-MPI execute efficiently in non dedicated environments, including up to almost a three-fold improvement compared to programs that do not redistribute data and a 25\% improvement over standard adaptive load balancing techniques.", acknowledgement = ack-nhfb, } @InProceedings{Barker:2003:EFD, author = "Kevin J. Barker and Nikos P. Chrisochoides", title = "An Evaluation of a Framework for the Dynamic Load Balancing of Highly Adaptive and Irregular Parallel Applications", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#2; http://www.sc-conference.org/sc2003/paperpdfs/pap268.pdf", abstract = "We present an evaluation of a flexible framework and runtime software system for the dynamic load balancing of asynchronous and highly adaptive and irregular applications. These applications, which include parallel unstructured and adaptive mesh refinement, serve as building blocks for a large class of scientific applications.
Extensive study has led to the development of solutions to the dynamic load balancing problem for loosely synchronous and computation-intensive programs; however, these methods are not suitable for asynchronous and highly adaptive applications. We evaluate a new software framework which includes support for an Active Messages style communication mechanism, global name space, transparent object migration, and preemptive decision making. Our results from both a 3-dimensional parallel advancing front mesh generation program and a synthetic microbenchmark indicate that this new framework outperforms two existing general-purpose, well-known, and widely used software systems for the dynamic load balancing of adaptive and irregular parallel applications.", acknowledgement = ack-nhfb, } @InProceedings{Romein:2003:MFS, author = "John W. Romein and Jaap Heringa and Henri E. Bal", title = "A Million-Fold Speed Improvement in Genomic Repeats Detection", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10710#0; http://www.sc-conference.org/sc2003/paperpdfs/pap189.pdf", abstract = "This paper presents a novel, parallel algorithm for generating top alignments. Top alignments are used for finding internal repeats in biological sequences like proteins and genes. Our algorithm replaces an older, sequential algorithm (Repro), which was prohibitively slow for sequence lengths greater than 2000. The new algorithm is an order of magnitude faster ($O(n^3)$ rather than $O(n^4)$). The paper presents a three-level parallel implementation of the algorithm: using SIMD multimedia extensions found on present-day processors (a novel technique that can be used to parallelize any application that performs many sequence alignments), using shared-memory parallelism, and using distributed-memory parallelism. It allows processing the longest known proteins (nearly 35000 amino acids). We show exceptionally high speed improvements of between 548 and 889 on a cluster of 64 dual-processor machines, compared to the new sequential algorithm. Especially for long sequences, extreme speed improvements over the old algorithm are obtained.", acknowledgement = ack-nhfb, } @InProceedings{Chrabakh:2003:GCB, author = "Wahid Chrabakh and Rich Wolski", title = "{GridSAT}: {A} Chaff-based Distributed {SAT} Solver for the Grid", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10710#1; http://www.sc-conference.org/sc2003/paperpdfs/pap252.pdf", abstract = "We present GridSAT, a parallel and complete satisfiability solver designed to solve non-trivial SAT problem instances using a large number of widely distributed and heterogeneous resources. The GridSAT parallel algorithm uses intelligent backtracking, distributed and carefully scheduled sharing of learned clauses, and clause reduction. Our implementation focuses on dynamic resource acquisition and release to optimize application execution. We show how the large number of computational resources that are available from a Grid can be managed effectively for the application by an automatic scheduler and effective implementation. GridSAT execution speed is compared against the best sequential solver as rated by the SAT2002 competition using a wide variety of problem instances.
The results show that GridSAT delivers speed-up for all but one of the test problem instances that are of significant size. In addition, we describe how GridSAT has solved previously unsolved satisfiability problems and the domain science contribution these results make.", acknowledgement = ack-nhfb, keywords = "parallel; distributed; satisfiability; computational grid", } @InProceedings{Vogels:2003:HNC, author = "Werner Vogels", title = "{HPC.NET} --- are {CLI}-based Virtual Machines Suitable for High Performance Computing?", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10710#2; http://www.sc-conference.org/sc2003/paperpdfs/pap251.pdf", abstract = "The Common Language Infrastructure is a new, standardized virtual machine that is likely to become popular on several platforms. In this paper we review whether this technology has any future in the high-performance computing community, for example by targeting the same application space as the Java-Grande Forum. We review the technology by benchmarking three implementations of the CLI and compare those with the results on Java virtual machines.", acknowledgement = ack-nhfb, } @InProceedings{Makino:2003:PET, author = "Junichiro Makino and Eiichiro Kokubo and Toshiyuki Fukushige and Hiroshi Daisaka", title = "Performance evaluation and tuning of {GRAPE-6} --- towards 40 `real' {Tflops}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10711#0; http://www.sc-conference.org/sc2003/paperpdfs/pap116.pdf", abstract = "In this paper, we describe the performance characteristics of GRAPE-6, the sixth-generation special-purpose computer for gravitational many-body problems. GRAPE-6 consists of 2048 custom pipeline chips, each of which integrates six pipeline processors specialized for the calculation of gravitational interaction between particles. The GRAPE hardware performs the evaluation of the interaction. The frontend processors perform all other operations, such as the time integration of the orbits of particles, I/O, on-the-fly analysis etc. The theoretical peak speed of GRAPE-6 is 63.4 Tflops. We present the result of benchmark runs, and discuss the performance characteristics. We also present the measured performance for a few real scientific applications. The best performance so far achieved with real applications is 35.3 Tflops.", acknowledgement = ack-nhfb, } @InProceedings{Komatitsch:2003:BDF, author = "Dimitri Komatitsch and Seiji Tsuboi and Chen Ji and Jeroen Tromp", title = "A 14.6 billion degrees of freedom, 5 teraflops, 2.5 terabyte earthquake simulation on the {Earth Simulator}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10711#1; http://www.sc-conference.org/sc2003/paperpdfs/pap124.pdf", abstract = "We use 1944 processors of the Earth Simulator to model seismic wave propagation resulting from large earthquakes. Simulations are conducted based upon the spectral-element method, a high-degree finite-element technique with an exactly diagonal mass matrix. We use a very large mesh with 5.5 billion grid points (14.6 billion degrees of freedom). 
We include the full complexity of the Earth, i.e., a three-dimensional wave-speed and density structure, a 3-D crustal model, ellipticity as well as topography and bathymetry. A total of 2.5 terabytes of memory is needed. Our implementation is purely based upon MPI, with loop vectorization on each processor. We obtain an excellent vectorization ratio of 99.3\%, and we reach a performance of 5 teraflops (30\% of the peak performance) on 38\% of the machine. The very high resolution of the mesh allows us to perform fully three-dimensional calculations at seismic periods as low as 5 seconds.", acknowledgement = ack-nhfb, } @InProceedings{Warren:2003:SSM, author = "Michael S. Warren and Chris L. Fryer and M. Patrick Goda", title = "The {Space Simulator}: Modeling the Universe from Supernovae to Cosmology", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10711#2; http://www.sc-conference.org/sc2003/paperpdfs/pap226.pdf", abstract = "The Space Simulator is a 294-processor Beowulf cluster with theoretical peak performance just below 1.5 Teraflop/s. It is based on the Shuttle XPC SS51G mini chassis. Each node consists of a 2.53 GHz Pentium 4 processor, 1 GB of 333 MHz DDR SDRAM, an 80 Gbyte Maxtor hard drive, and a 3Com 3C996B-T Gigabit Ethernet card. The network is made up of Foundry FastIron 1500 and 800 Gigabit Ethernet switches. Each individual node cost less than \$1000, and the entire system cost under \$500,000. The cluster achieved Linpack performance of 665.1 Gflop/s on 288 processors in October 2002, making it the 85th fastest computer in the world according to the 20th TOP500 list. Performance has since improved to 757.1 Linpack Gflop/s, ranking at \#88 on the 21st TOP500 list. This is the first machine in the TOP500 to surpass Linpack price/performance of 1 dollar per Mflop/s.", acknowledgement = ack-nhfb, } @InProceedings{Dally:2003:MSS, author = "William J. Dally and Patrick Hanrahan and Mattan Erez and Timothy J. Knight and Francois Labonte and Jung-Ho Ahn and Nuwan Jayasena and Ujval J. Kapasi and Abhishek Das and Jayanth Gummaraju and Ian Buck", title = "{Merrimac}: Supercomputing with Streams", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10717#0; http://www.sc-conference.org/sc2003/paperpdfs/pap246.pdf", abstract = "Merrimac uses stream architecture and advanced interconnection networks to give an order of magnitude more performance per unit cost than cluster-based scientific computers built from the same technology. Organizing the computation into streams and exploiting the resulting locality using a register hierarchy enables a stream architecture to reduce the memory bandwidth required by representative applications by an order of magnitude or more. Hence a processing node with a fixed bandwidth (expensive) can support an order of magnitude more arithmetic units (inexpensive). This in turn allows a given level of performance to be achieved with fewer nodes (a 1-PFLOPS machine, for example, with just 8,192 nodes), resulting in greater reliability and simpler system management.
We sketch the design of Merrimac, a streaming scientific computer that can be scaled from a \$20K 2 TFLOPS workstation to a \$20M 2 PFLOPS supercomputer, and present the results of some initial application experiments on this architecture.", acknowledgement = ack-nhfb, } @InProceedings{Taiji:2003:PEP, author = "Makoto Taiji and Tetsu Narumi and Yousuke Ohno and Noriyuki Futatsugi and Atsushi Suenaga and Naoki Takada and Akihiko Konagaya", title = "{Protein Explorer}: {A} Petaflops Special-Purpose Computer System for Molecular Dynamics Simulations", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10717#1; http://www.sc-conference.org/sc2003/paperpdfs/pap168.pdf", abstract = "We are developing the `Protein Explorer' system, a petaflops special-purpose computer system for molecular dynamics simulations. The Protein Explorer is a PC cluster equipped with special-purpose engines that calculate nonbonded interactions between atoms, which is the most time-consuming part of the simulations. A dedicated LSI `MDGRAPE-3 chip' performs these force calculations at a speed of 165 gigaflops or higher. The system will have 6,144 MDGRAPE-3 chips to achieve a nominal peak performance of one petaflop. The system will be completed in 2006. In this paper, we describe the project plans and the architecture of the Protein Explorer.", acknowledgement = ack-nhfb, } @InProceedings{Anderson:2003:EES, author = "Wendell Anderson and Preston Briggs and C. Stephen Hellberg and Daryl W. Hess and Alexei Khokhlov and Marco Lanzagorta and Robert Rosenberg", title = "Early Experience with Scientific Programs on the {Cray MTA-2}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10717#2; http://www.sc-conference.org/sc2003/paperpdfs/pap271.pdf", abstract = "We describe our experiences porting and tuning three scientific programs to the Cray MTA-2, paying particular attention to the problems posed by I/O. We have measured the performance of each of the programs over many different machine configurations and we report on the scalability of each program. In addition, we compare the performance of the MTA with that of an SGI Origin running all three programs.", acknowledgement = ack-nhfb, } @InProceedings{Singh:2003:MCS, author = "Gurmeet Singh and Shishir Bharathi and Ann Chervenak and Ewa Deelman and Carl Kesselman and Mary Manohar and Sonal Patil and Laura Pearlman", title = "A Metadata Catalog Service for Data Intensive Applications", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10718#0; http://www.sc-conference.org/sc2003/paperpdfs/pap242.pdf", abstract = "Advances in computational, storage and network technologies as well as middleware such as the Globus Toolkit allow scientists to expand the sophistication and scope of data-intensive applications. These applications produce and analyze terabytes and petabytes of data that are distributed in millions of files or objects. To manage these large data sets efficiently, metadata or descriptive information about the data needs to be managed.
There are various types of metadata, and it is likely that a range of metadata services will exist in Grid environments that are specialized for particular types of metadata cataloguing and discovery. In this paper, we present the design of a Metadata Catalog Service (MCS) that provides a mechanism for storing and accessing descriptive metadata and allows users to query for data items based on desired attributes. We describe our experience in using the MCS with several applications and present a scalability study of the service.", acknowledgement = ack-nhfb, } @InProceedings{Deelman:2003:GBG, author = "Ewa Deelman and Raymond Plante and Carl Kesselman and Gurmeet Singh and Mei Su and Gretchen Greene and Robert Hanisch and Niall Gaffney and Antonio Volpicelli and James Annis and Vijay Sekhri and Tamas Budavari and Maria Nieto-Santisteban and William O'Mullane and David Bohlender and Tom McGlynn and Arnold Rots and Olga Pevunova", title = "Grid-Based Galaxy Morphology Analysis for the {National Virtual Observatory}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10718#1; http://www.sc-conference.org/sc2003/paperpdfs/pap282.pdf", abstract = "As part of the development of the National Virtual Observatory (NVO), a Data Grid for astronomy, we have developed a prototype science application to explore the dynamical history of galaxy clusters by analyzing the galaxies' morphologies. The purpose of the prototype is to investigate how Grid-based technologies can be used to provide specialized computational services within the NVO environment. In this paper we focus on the key enabling technology components, particularly Chimera and Pegasus, which are used to create and manage the computational workflow that must be present to deal with the challenging application requirements. We illustrate how the components interact with each other and can be driven from a special-purpose application portal.", acknowledgement = ack-nhfb, } @InProceedings{Allen:2003:LPB, author = "Matthew S. Allen and Rich Wolski", title = "The {Livny} and {Plank-Beck} Problems: Studies in Data Movement on the Computational Grid", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10718#2; http://www.sc-conference.org/sc2003/paperpdfs/pap266.pdf", abstract = "Research on scheduling data management has focused both on the problem of distributing the storage load among a set of servers and on replication as a way of ensuring reliability and data proximity. In order to store large data sets and keep their load balanced across many hosts, many applications choose to divide these sets into sections and distribute them. So that these files can be accessed reliably in spite of individual host failures, these sections are frequently replicated across many file servers. While the projects cited above have each explored these problems in different ways, commonalities among the various successful solutions are beginning to emerge. In this paper, we investigate two such commonalities, identified by noted researchers in the field: Dr. Miron Livny [4] from the University of Wisconsin, and Dr. James Plank [2] and Dr.
Micah Beck [3] from the University of Tennessee.", acknowledgement = ack-nhfb, } @InProceedings{Jones:2003:ISP, author = "Terry Jones and William Tuel and Larry Brenner and Jeff Fier and Patrick Caffrey and Shawn Dawson and Rob Neely and Robert Blackmore and Brian Maskell and Paul Tomlinson and Mark Roberts", title = "Improving the Scalability of Parallel Jobs by adding Parallel Awareness to the Operating System", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#0; http://www.sc-conference.org/sc2003/paperpdfs/pap136.pdf", abstract = "A parallel application benefits from scheduling policies that include a global perspective of the application's process working set. As the interactions among cooperating processes increase, mechanisms to ameliorate waiting within one or more of the processes become more important. In particular, collective operations such as barriers and reductions are extremely sensitive to even usually harmless events such as context switches among members of the process working set. For the last 18 months, we have been researching the impact of random short-lived interruptions such as timer-decrement processing and periodic daemon activity, and developing strategies to minimize their impact on large processor-count SPMD bulk-synchronous programming styles. We present a novel co-scheduling scheme for improving the performance of fine-grain collective activities such as barriers and reductions, describe an implementation consisting of operating system kernel modifications and a run-time system, and present a set of empirical results comparing the technique with traditional operating system scheduling. Our results indicate a speedup of over 300\% on synchronizing collectives.", acknowledgement = ack-nhfb, } @InProceedings{Fernandez:2003:BMN, author = "Juan Fernandez and Eitan Frachtenberg and Fabrizio Petrini", title = "{BCS-MPI}: a New Approach in the System Software Design for Large-Scale Parallel Computers", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#1; http://www.sc-conference.org/sc2003/paperpdfs/pap306.pdf", abstract = "Buffered CoScheduled MPI (BCS-MPI) introduces a new approach to designing the communication layer for large-scale parallel machines. The emphasis of BCS-MPI is on the global coordination of a large number of communicating processes rather than on the traditional optimization of the point-to-point performance. BCS-MPI delays interprocessor communication in order to schedule the communication pattern globally, and it is designed on top of a minimal set of collective communication primitives. In this paper we describe a prototype implementation of BCS-MPI and its communication protocols. Several experimental results on a set of scientific applications show that BCS-MPI can compete with a production-level MPI implementation, but is much simpler to implement, debug, and model.", acknowledgement = ack-nhfb, keywords = "MPI; buffered coscheduling; STORM; Quadrics; system software; communication protocols; cluster computing; large-scale parallel computers", } @InProceedings{Moody:2003:SNB, author = "Adam Moody and Juan Fernandez and Fabrizio Petrini and Dhabaleswar K.
Panda", title = "Scalable {NIC}-based Reduction on Large-Scale Clusters", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#2; http://www.sc-conference.org/sc2003/paperpdfs/pap316.pdf", abstract = "Many parallel algorithms require efficient reduction collectives. In response, researchers have designed algorithms considering a range of parameters including data size, system size, and communication characteristics. Throughout this past work, however, processing was limited to the host CPU. Today, modern Network Interface Cards (NICs) sport programmable processors with substantial memory, and thus introduce a fresh variable into the equation. In this paper, we investigate this new option in the context of large-scale clusters. Through experiments on the 960-node, 1920-processor ASCI Linux Cluster (ALC) at Lawrence Livermore National Laboratory, we show that NIC-based reductions outperform host-based algorithms in terms of reduced latency and increased consistency. In particular, in the largest configuration tested --- 1812 processors --- our NIC-based algorithm summed single-element vectors of 32-bit integers and 64-bit floating-point numbers in 73 $\mu$s and 118 $\mu$s, respectively. These results represent respective improvements of 121\% and 39\% over the production-level MPI library.", acknowledgement = ack-nhfb, } @InProceedings{Worringen:2003:FPN, author = "Joachim Worringen and Jesper Larson Traff and Hubert Ritzdorf", title = "Fast Parallel Non-Contiguous File Access", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#0; http://www.sc-conference.org/sc2003/paperpdfs/pap319.pdf", abstract = "Many applications of parallel I/O perform non-contiguous file accesses: instead of accessing a single (large) block of data in a file, a number of (smaller) blocks of data scattered throughout the file needs to be accessed in each logical I/O operation. However, only few file system interfaces directly support this kind of non-contiguous file access. In contrast, the most commonly used parallel programming interface, MPI, incorporates a flexible model of parallel I/O through its MPI-IO interface. With MPI-IO, arbitrary non-contiguous file accesses are supported in a uniform fashion by the use of derived MPI datatypes set up by the user to reflect the desired I/O pattern.\par Despite a considerable amount of recent work in this area, current MPI-IO implementations suffer from low performance of such non-contiguous accesses when compared to the performance of the storage system for contiguous accesses. In this paper we analyze an important bottleneck in the efficient handling of non-contiguous access patterns in current implementations of MPIIO. We present a new technique, termed listless I/O, that can be incorporated into MPI-IO implementations like the well-known ROMIO implementation, and completely eliminates this bottleneck. We have implemented the technique in MPI/SX, the MPI implementation for the NEC SX-series of parallel vector computers. 
Results with a synthetic benchmark and an application kernel show that listless I/O is able to increase the bandwidth for non-contiguous file access, sometimes by more than a factor of 500, when compared to the traditional approach.", acknowledgement = ack-nhfb, } @InProceedings{Li:2003:PNH, author = "Jianwei Li and Wei-keng Liao and Alok Choudhary and Robert Ross and Rajeev Thakur and William Gropp and Rob Latham and Andrew Siegel and Brad Gallagher and Michael Zingale", title = "{Parallel netCDF}: {A} High-Performance Scientific {I/O} Interface", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#1; http://www.sc-conference.org/sc2003/paperpdfs/pap258.pdf", abstract = "Dataset storage, exchange, and access play a critical role in scientific applications. For such purposes netCDF serves as a portable, efficient file format and programming interface, which is popular in numerous scientific application domains. However, the original interface does not provide an efficient mechanism for parallel data storage and access. In this work, we present a new parallel interface for writing and reading netCDF datasets. This interface is derived with minimal changes from the serial netCDF interface but defines semantics for parallel access and is tailored for high performance. The underlying parallel I/O is achieved through MPI-IO, allowing for substantial performance gains through the use of collective I/O optimizations. We compare the implementation strategies and performance with HDF5. Our tests indicate programming convenience and significant I/O performance improvement with this parallel netCDF (PnetCDF) interface.", acknowledgement = ack-nhfb, } @InProceedings{Klasky:2003:GBP, author = "Scott Alan Klasky and Stephane Ethier and Zhihong Lin and Kevin Martins and Doug McCune and Ravi Samtaney", title = "Grid-Based Parallel Data Streaming implemented for the Gyrokinetic Toroidal Code", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#2; http://www.sc-conference.org/sc2003/paperpdfs/pap207.pdf", abstract = "We have developed a threaded parallel data streaming approach using Globus to transfer multi-terabyte simulation data from a remote supercomputer to the scientist's home analysis/visualization cluster, as the simulation executes, with negligible overhead. Data transfer experiments show that this concurrent data transfer approach compares favorably with writing to local disk and then transferring the data to be post-processed. The present approach is conducive to using the grid to pipeline the simulation with post-processing and visualization. We have applied this method to the Gyrokinetic Toroidal Code (GTC), a 3-dimensional particle-in-cell code used to study micro-turbulence in magnetic confinement fusion from first-principles plasma theory.", acknowledgement = ack-nhfb, } @InProceedings{Wisniewski:2003:EUS, author = "Robert W.
Wisniewski and Bryan Rosenburg", title = "Efficient, Unified, and Scalable Performance Monitoring for Multiprocessor Operating Systems", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10721#0; http://www.sc-conference.org/sc2003/paperpdfs/pap121.pdf", abstract = "Programming, understanding, and tuning the performance of large multiprocessor systems is challenging. Experts have difficulty achieving good utilization for applications on large machines. The task of implementing a scalable system such as an operating system or database on large machines is even more challenging. And the importance of achieving good performance on multiprocessor machines is increasing as the number of cores per chip increases and as the size of multiprocessors increases. Crucial to achieving good performance is being able to understand the behavior of the system.\par We have developed an efficient, unified, and scalable tracing infrastructure that allows for correctness debugging, performance debugging, and performance monitoring of an operating system. The infrastructure allows variable-length events to be logged without locking and provides random access to the event stream. The infrastructure allows cheap and parallel logging of events by applications, libraries, servers, and the kernel. The infrastructure was designed for K42, a new open-source research kernel designed to scale near perfectly on large cache-coherent 64-bit multiprocessor systems. The techniques are generally applicable, and many of them have been integrated into the Linux Trace Toolkit. In this paper, we describe the implementation of the infrastructure, how we used the facility, e.g., analyzing lock contention, to understand and achieve K42's scalable performance, and the lessons we learned. The infrastructure has been invaluable to achieving great scalability.", acknowledgement = ack-nhfb, } @InProceedings{Itzkowitz:2003:MPU, author = "Marty Itzkowitz and Brian J. N. Wylie and Christopher Aoki and Nicolai Kosche", title = "Memory Profiling using Hardware Counters", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10721#1; http://www.sc-conference.org/sc2003/paperpdfs/pap182.pdf", abstract = "Although memory performance is often a limiting factor in application performance, most tools only show performance data relating to the instructions in the program, not to its data. In this paper, we describe a technique for directly measuring the memory profile of an application. We describe the tools and their user model, and then discuss a particular code, the MCF benchmark from SPEC CPU 2000. We show performance data for the data structures and elements, and discuss the use of the data to improve program performance. Finally, we discuss extensions to the work to provide feedback to the compiler for prefetching and to generate additional reports from the data.", acknowledgement = ack-nhfb, } @InProceedings{Mohan:2003:IES, author = "Tushar Mohan and Bronis R. de Supinski and Sally A. 
McKee and Frank Mueller and Andy Yoo and Martin Schulz", title = "Identifying and Exploiting Spatial Regularity in Data Memory References", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10721#2; http://www.sc-conference.org/sc2003/paperpdfs/pap290.pdf", abstract = "The growing processor/memory performance gap causes the performance of many codes to be limited by memory accesses. If known to exist in an application, strided memory accesses forming streams can be targeted by optimizations such as prefetching, relocation, remapping, and vector loads. Undetected, they can be a significant source of memory stalls in loops. Existing stream-detection mechanisms either require special hardware, which may not gather statistics for subsequent analysis, or are limited to compile-time detection of array accesses in loops. Formally, little treatment has been accorded to the subject; the concept of locality fails to capture the existence of streams in a program's memory accesses. The contributions of this paper are as follows. First, we define spatial regularity as a means to discuss the presence and effects of streams. Second, we develop measures to quantify spatial regularity, and we design and implement an on-line, parallel algorithm to detect streams -- and hence regularity -- in running applications. Third, we use examples from real codes and common benchmarks to illustrate how derived stream statistics can be used to guide the application of profile-driven optimizations. Overall, we demonstrate the benefits of our novel regularity metric as an instrument to detect potential for code optimizations affecting memory performance.", acknowledgement = ack-nhfb, } %%% ==================================================================== %%% Cross-referenced entries must come last: @Proceedings{ACM:2003:SII, editor = "{ACM}", booktitle = "SC2003: Igniting Innovation. {Phoenix, AZ, November 15--21, 2003}", title = "{SC2003}: Igniting Innovation. {Phoenix, AZ, November 15--21, 2003}", publisher = pub-ACM # " and " # pub-IEEE, address = pub-ACM:adr # " and " # pub-IEEE:adr, pages = "????", year = "2003", ISBN = "1-58113-695-1", ISBN-13 = "978-1-58113-695-1", LCCN = "????", bibdate = "Thu Feb 21 18:29:36 2003", acknowledgement = ack-nhfb, }
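%%% ====================================================================
%%% Illustrative sketches (editorial; these are not code from the
%%% cited papers and are placed here, outside any entry, so that the
%%% BibTeX above remains valid):
%%%
%%% The entry Worringen:2003:FPN concerns non-contiguous file access
%%% expressed through derived MPI datatypes in MPI-IO. The minimal C
%%% sketch below shows the access style in question: each process
%%% reads a strided series of blocks from a shared file in a single
%%% collective call. The file name, block size, and block count are
%%% illustrative assumptions, not values from the paper.
%%%
%%%   /* Strided (non-contiguous) read through MPI-IO. */
%%%   #include <mpi.h>
%%%   #include <stdlib.h>
%%%
%%%   #define BLOCK 1024    /* doubles per block (assumed)  */
%%%   #define NBLK  100     /* blocks per process (assumed) */
%%%
%%%   int main(int argc, char **argv)
%%%   {
%%%       int rank, nprocs;
%%%       MPI_File fh;
%%%       MPI_Datatype filetype;
%%%       double *buf;
%%%
%%%       MPI_Init(&argc, &argv);
%%%       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
%%%       MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
%%%       buf = malloc((size_t)NBLK * BLOCK * sizeof(double));
%%%
%%%       /* NBLK blocks of BLOCK doubles, one block every
%%%        * nprocs*BLOCK doubles: rank i sees blocks i, i+nprocs, ... */
%%%       MPI_Type_vector(NBLK, BLOCK, nprocs * BLOCK, MPI_DOUBLE,
%%%                       &filetype);
%%%       MPI_Type_commit(&filetype);
%%%
%%%       MPI_File_open(MPI_COMM_WORLD, "data.bin", MPI_MODE_RDONLY,
%%%                     MPI_INFO_NULL, &fh);
%%%       /* Each rank starts at its own block offset (in bytes). */
%%%       MPI_File_set_view(fh, (MPI_Offset)rank * BLOCK * sizeof(double),
%%%                         MPI_DOUBLE, filetype, "native", MPI_INFO_NULL);
%%%       MPI_File_read_all(fh, buf, NBLK * BLOCK, MPI_DOUBLE,
%%%                         MPI_STATUS_IGNORE);
%%%
%%%       MPI_File_close(&fh);
%%%       MPI_Type_free(&filetype);
%%%       free(buf);
%%%       MPI_Finalize();
%%%       return 0;
%%%   }
%%%
%%% A single collective call of this form is exactly the pattern whose
%%% in-library handling the paper's listless I/O technique accelerates.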
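%%% The entry Li:2003:PNH describes an interface that mirrors serial
%%% netCDF with collective, MPI-IO-backed semantics. The sketch below
%%% is written against the released PnetCDF library's ncmpi_ calls;
%%% the dimension sizes, variable name, and file name are assumptions
%%% for illustration. Each process writes one row-slab of a 2-D
%%% variable.
%%%
%%%   /* Slab-wise collective write with PnetCDF. */
%%%   #include <mpi.h>
%%%   #include <pnetcdf.h>
%%%
%%%   #define NY 64     /* rows owned by each process (assumed) */
%%%   #define NX 128    /* columns (assumed)                    */
%%%
%%%   int main(int argc, char **argv)
%%%   {
%%%       int rank, nprocs, ncid, dimids[2], varid;
%%%       MPI_Offset start[2], count[2];
%%%       static double slab[NY][NX];   /* this rank's data */
%%%
%%%       MPI_Init(&argc, &argv);
%%%       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
%%%       MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
%%%
%%%       ncmpi_create(MPI_COMM_WORLD, "field.nc", NC_CLOBBER,
%%%                    MPI_INFO_NULL, &ncid);
%%%       ncmpi_def_dim(ncid, "y", (MPI_Offset)NY * nprocs, &dimids[0]);
%%%       ncmpi_def_dim(ncid, "x", NX, &dimids[1]);
%%%       ncmpi_def_var(ncid, "field", NC_DOUBLE, 2, dimids, &varid);
%%%       ncmpi_enddef(ncid);               /* leave define mode */
%%%
%%%       start[0] = (MPI_Offset)rank * NY;  start[1] = 0;
%%%       count[0] = NY;                     count[1] = NX;
%%%       /* Collective write; the MPI-IO layer can aggregate it. */
%%%       ncmpi_put_vara_double_all(ncid, varid, start, count,
%%%                                 &slab[0][0]);
%%%
%%%       ncmpi_close(ncid);
%%%       MPI_Finalize();
%%%       return 0;
%%%   }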
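%%% The entry Moody:2003:SNB reports 73 and 118 microsecond
%%% single-element reductions against a host-based MPI baseline. The
%%% sketch below shows how such a host-based baseline is typically
%%% measured: time a tight loop of single-element MPI_Allreduce calls
%%% and report the mean. The iteration count and harness are ours,
%%% not the paper's.
%%%
%%%   /* Latency microbenchmark for host-based MPI_Allreduce. */
%%%   #include <mpi.h>
%%%   #include <stdio.h>
%%%
%%%   #define ITERS 1000
%%%
%%%   int main(int argc, char **argv)
%%%   {
%%%       int rank;
%%%       double in = 1.0, out, t0, t1;
%%%
%%%       MPI_Init(&argc, &argv);
%%%       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
%%%
%%%       /* Warm up once, then synchronize before timing. */
%%%       MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_SUM,
%%%                     MPI_COMM_WORLD);
%%%       MPI_Barrier(MPI_COMM_WORLD);
%%%
%%%       t0 = MPI_Wtime();
%%%       for (int i = 0; i < ITERS; i++)
%%%           MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_SUM,
%%%                         MPI_COMM_WORLD);
%%%       t1 = MPI_Wtime();
%%%
%%%       if (rank == 0)
%%%           printf("mean allreduce latency: %.1f us\n",
%%%                  (t1 - t0) / ITERS * 1e6);
%%%       MPI_Finalize();
%%%       return 0;
%%%   }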
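%%% The entry Mohan:2003:IES defines spatial regularity in terms of
%%% streams of constant-stride references. The toy C sketch below
%%% conveys only the core idea -- flag a stream once the same address
%%% delta repeats -- whereas the paper's detector is on-line and
%%% parallel; the threshold and trace here are invented for
%%% illustration.
%%%
%%%   /* Toy constant-stride (stream) detector over an address trace. */
%%%   #include <stdio.h>
%%%   #include <stdint.h>
%%%
%%%   struct tracker {
%%%       uintptr_t last_addr;    /* previous address seen        */
%%%       intptr_t  last_delta;   /* previous stride              */
%%%       int       run;          /* consecutive matching strides */
%%%   };
%%%
%%%   static void observe(struct tracker *t, uintptr_t addr)
%%%   {
%%%       intptr_t delta = (intptr_t)(addr - t->last_addr);
%%%       if (t->last_addr && delta == t->last_delta)
%%%           t->run++;
%%%       else
%%%           t->run = 0;
%%%       if (t->run == 3)  /* threshold: 4 equal deltas in a row */
%%%           printf("stream detected: stride %ld bytes\n",
%%%                  (long)delta);
%%%       t->last_addr = addr;
%%%       t->last_delta = delta;
%%%   }
%%%
%%%   int main(void)
%%%   {
%%%       struct tracker t = {0, 0, 0};
%%%       double a[64];
%%%       /* Simulate the reference trace of a unit-stride loop. */
%%%       for (int i = 0; i < 64; i++)
%%%           observe(&t, (uintptr_t)&a[i]);
%%%       return 0;
%%%   }
%%% ====================================================================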