%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.03", %%% date = "25 October 2010", %%% time = "17:23:38 MDT", %%% filename = "supercomputing2002.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "http://www.math.utah.edu/~beebe", %%% checksum = "15932 2233 13182 122527", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "BibTeX, bibliography, SC2002, Supercomputing %%% 2002", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a complete bibliography of papers %%% published in the proceedings of %%% Supercomputing '2002. %%% %%% The conference World-Wide Web site is %%% %%% http://www.sc-2002.org/ %%% %%% The organizers of this conference series %%% maintain a World-Wide Web site at %%% %%% http://www.supercomp.org/ %%% %%% where pointers to Web pages for the %%% conferences from 1988 to date may be found. %%% %%% At version 1.03, the year coverage looked %%% like this: %%% %%% 2002 ( 68) %%% %%% InProceedings: 67 %%% Proceedings: 1 %%% %%% Total entries: 68 %%% %%% In this bibliography, entries are sorted in %%% order of PDF file numbers. %%% %%% The on-line electronic proceedings do not %%% contain sequential page numbers, although %%% there is an ISBN assigned for the %%% proceedings. A pagecount field is given with %%% each entry, extracted from the PDF file: some %%% of the articles lack page numbers altogether, %%% others number pages 1, 2, 3, ... %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. 
This is produced by Robert %%% Solovay's checksum utility.", %%% } %%% ==================================================================== @Preamble{ "\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi" } %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% Publishers and their addresses: @String{pub-IEEE = "IEEE Computer Society Press"} @String{pub-IEEE:adr = "1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA"} %%% ==================================================================== %%% Bibliography entries. @InProceedings{DeRose:2002:SSI, author = "Luiz DeRose and K. Ekanadham and Jeffrey Hollingsworth and Simone Sbaraglia", title = "{SIGMA}: {A} Simulator Infrastructure to Guide Memory Analysis", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap191.pdf", abstract = "In this paper we present SIGMA (Simulation Infrastructure to Guide Memory Analysis), a new data collection framework and family of cache analysis tools. The SIGMA environment provides detailed cache information by gathering memory reference data using software-based instrumentation. This infrastructure can facilitate quick probing into the factors that influence the performance of an application by highlighting bottleneck scenarios including: excessive cache/TLB misses and inefficient data layouts. The tool can also assist in perturbation analysis to determine performance variations caused by changes to architecture or program. 
Our validation tests using the SPEC Swim benchmark show that most of the performance metrics obtained with SIGMA are within 1\% of the metrics obtained with hardware performance counters, with the advantage that SIGMA provides performance data on a data structure level, as specified by the programmer.", acknowledgement = ack-nhfb, } @InProceedings{Lu:2002:CAS, author = "Charng-da Lu and Daniel A. Reed", title = "Compact Application Signatures for Parallel and Distributed Scientific Codes", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap244.pdf", abstract = "Understanding the dynamic behavior of parallel programs is key to developing efficient system software and runtime environments; this is even more true on emerging computational Grids where resource availability and performance can change in unpredictable ways. Event tracing provides details on behavioral dynamics, albeit often at great cost. We describe an intermediate approach, based on curve fitting, that retains many of the advantages of event tracing but with lower overhead. These compact ``application signatures'' summarize the time-varying resource needs of scientific codes from historical trace data. We also developed a comparison scheme that measures similarity between two signatures, both across executions and across execution environments.", acknowledgement = ack-nhfb, } @InProceedings{Ahn:2002:SAT, author = "Dong H. Ahn and Jeffrey S. 
Vetter", title = "Scalable Analysis Techniques for Microprocessor Performance Counter Metrics", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap257.pdf", abstract = "Contemporary microprocessors provide a rich set of integrated performance counters that allow application developers and system architects alike the opportunity to gather important information about workload behaviors. Current techniques for analyzing data produced from these counters use raw counts, ratios, and visualization techniques help users make decisions about their application performance. While these techniques are appropriate for analyzing data from one process, they do not scale easily to new levels demanded by contemporary computing systems. Very simply, this paper addresses these concerns by evaluating several multivariate statistical techniques on these datasets. We find that several techniques, such as statistical clustering, can automatically extract important features from the data. These derived results can, in turn, be fed directly back to an application developer, or used as input to a more comprehensive performance analysis environment, such as a visualization or an expert system.", acknowledgement = ack-nhfb, } @InProceedings{Bailey:2002:HPC, author = "David H. Bailey and David Broadhurst and Yozo Hida and Xiaoye S. Li and Brandon Thompson", title = "High Performance Computing Meets Experimental Mathematics", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Fri Aug 08 11:13:32 2008", URL = "http://www.sc-2002.org/paperpdfs/pap.pap124.pdf", abstract = "In this paper we describe some novel applications of high performance computing in a discipline now known as experimental mathematics. The paper reviews some recent published work, and then presents some new results that have not yet appeared in the literature. 
A key technique involved in this research is the PSLQ integer relation algorithm (recently named one of ten algorithms of the century by Computing in Science and Engineering). This algorithm permits one to recognize a numeric constant in terms of the formula that it satisfies. We present a variant of PSLQ that is well-suited for parallel computation, and give several examples of new mathematical results that we have found using it. Two of these computations were performed on highly parallel computers, since they are not feasible on conventional systems. We also describe a new software package for performing arbitrary precision arithmetic, which is required in this research.", acknowledgement = ack-nhfb, } @InProceedings{Baumgartner:2002:HLA, author = "Gerald Baumgartner and David E. Bernholdt and Daniel Cociorva and Robert Harrison and So Hirata and Chi-Chung Lam and Marcel Nooijen and Russell Pitzer and J. Ramanujam and P. Sadayappan", title = "A High-Level Approach to Synthesis of High-Performance Codes for Quantum Chemistry", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap306.pdf", abstract = "This paper discusses an approach to the synthesis of high-performance parallel programs for a class of computations encountered in quantum chemistry and physics. These computations are expressible as a set of tensor contractions and arise in electronic structure modeling. An overview is provided of the synthesis system, that transforms a high-level specification of the computation into high-performance parallel code, tailored to the characteristics of the target architecture. An example from computational chemistry is used to illustrate how different code structures are generated under different assumptions of available memory on the target computer.", acknowledgement = ack-nhfb, } @InProceedings{Ding:2002:MOP, author = "Yun He and Chris H. Q. 
Ding", keywords = "multidimensional arrays; index reshuffle; vacancy tracking cycles; global exchange; dynamical remapping; MPI; OpenMP; hybrid MPI/OpenMP; SMP cluster.", title = "{MPI} and {OpenMP} Paradigms on Cluster of {SMP} Architectures", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap325.pdf", abstract = "We investigate remapping multi-dimensional arrays on cluster of SMP architectures under OpenMP, MPI, and hybrid paradigms. Traditional method of array transpose needs an auxiliary array of the same size and a copy back stage. We recently developed an in-place method using vacancy tracking cycles. The vacancy tracking algorithm outperforms the traditional 2-array method as demonstrated by extensive comparisons. The independence of vacancy tracking cycles allows efficient parallelization of the in-place method on SMP architectures at node level. Performance of multi-threaded parallelism using OpenMP are tested with different scheduling methods and different number of threads. The vacancy tracking method is parallelized using several parallel paradigms. At node level, pure OpenMP outperforms pure MPI by a factor of 2.76. Across entire cluster of SMP nodes, the hybrid MPI/OpenMP implementation outperforms pure MPI by a factor of 4.44, demonstrating the validity of the parallel paradigm of mixing MPI with OpenMP.", acknowledgement = ack-nhfb, } @InProceedings{Hacker:2002:ESP, author = "Thomas J. Hacker and Brian D. Noble and Brian D. Athey", title = "The Effects of Systemic Packet Loss on Aggregate {TCP} Flows", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap270.pdf", abstract = "The use of parallel TCP connections to increase throughput for bulk transfers is common practice within the high performance computing community. 
However, the effectiveness, fairness, and efficiency of data transfers across parallel connections is unclear. This paper considers the impact of systemic non-congestion related packet loss on the effectiveness, fairness, and efficiency of parallel TCP transmissions. The results indicate that parallel connections are effective at increasing aggregate throughput, and increase the overall efficiency of the network bottleneck. In the presence of congestion related losses, parallel flows steal bandwidth from other single stream flows. A simple modification is presented that reduces the fairness problems when congestion is present, but retains effectiveness and efficiency.", acknowledgement = ack-nhfb, } @InProceedings{Pradhan:2002:IEQ, author = "Prashant Pradhan and Tzi-cker Chiueh", title = "Implementation and Evaluation of a {QoS}-Capable Cluster-Based {IP} Router", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap320.pdf", abstract = "A major challenge in Internet edge router design is to support both high packet forwarding performance and versatile and efficient packet processing capabilities. The thesis of this research project is that a cluster of PCs connected by a high-speed system area network provides an effective hardware platform for building routers to be used at the edges of the Internet. This paper describes a scalable and extensible edge router architecture called Panama, which supports a novel aggregate route caching scheme, a real-time link scheduling algorithm whose performance overhead is independent of the number of real-time flows, a highly efficient kernel extension mechanism to safely load networking software extensions dynamically, and an integrated resource scheduler which ensures that real-time flows with additional packet processing requirements still meet their end-to-end performance requirements. 
This paper describes the implementation and evaluation of the first Panama prototype based on a cluster of PCs and Myrinet.", acknowledgement = ack-nhfb, } @InProceedings{Dunigan:2002:TTD, author = "Tom Dunigan and Matt Mathis and Brian Tierney", title = "A {TCP} Tuning Daemon", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap151.pdf", abstract = "Many high performance distributed applications require high network throughput but are able to achieve only a small fraction of the available bandwidth. A common cause of this problem is improperly tuned network settings. Tuning techniques, such as setting the correct TCP buffers and using parallel streams, are well known in the networking community, but outside the networking community they are infrequently applied. In this paper, we describe a tuning daemon that uses TCP instrumentation data from the Unix kernel to transparently tune TCP parameters for specified individual flows over designated paths. No modifications are required to the application, and the user does not need to understand network or TCP characteristics.", acknowledgement = ack-nhfb, keywords = "autotuning; TCP; high-performance networking; data grids", } @InProceedings{Malard:2002:DDH, author = "J. M. Malard and R. D. Stewart", title = "Distributed Dynamic Hash Tables Using {IBM LAPI}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap247.pdf", abstract = "An asynchronous communication library for accessing and managing dynamic hash tables over a network of Symmetric Multiprocessors (SMP) is presented. A blocking factor is shown experimentally to reduce the variance of the wall clock time. 
It is also shown that remote accesses to a distributed hash table can be as effective and scalable as the one-sided operations of the low-level communication middleware on an IBM SP.", acknowledgement = ack-nhfb, } @InProceedings{Swany:2002:MRP, author = "Martin Swany and Rich Wolski", title = "Multivariate Resource Performance Forecasting in the {Network Weather Service}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap292.pdf", abstract = "This paper describes a new technique in the Network Weather Service for producing multi-variate forecasts. The new technique uses the NWS's univariate forecasters and empirically gathered Cumulative Distribution Functions (CDFs) to make predictions from correlated measurement streams. Experimental results are shown in which throughput is predicted for long TCP/IP transfers from short NWS network probes.", acknowledgement = ack-nhfb, } @InProceedings{Otoo:2002:DCR, author = "Ekow J. Otoo and Frank Olken and Arie Shoshani", title = "Disk Cache Replacement Algorithm for Storage Resource Managers in Data Grids", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap322.pdf", abstract = "We address the problem of cache replacement policies for Storage Resource Managers (SRMs) that are used in Data Grids. An SRM has a disk storage of bounded capacity that retains some N objects. A replacement policy is applied to determine which object in the cache needs to be evicted when space is needed. We define a utility function for ranking the candidate objects for eviction and then describe an efficient algorithm for computing the replacement policy based on this function. This computation takes time $O(\log N)$. 
We compare our policy with traditional replacement policies such as Least Frequently Used (LFU), Least Recently Used (LRU), LRU-K, Greedy Dual Size (GDS), etc., using simulations of both synthetic and real workloads of file accesses to tertiary storage. Our simulations of replacement policies account for delays in cache space reservation, data transfer and processing. The results obtained show that our proposed method is the most cost effective cache replacement policy for Storage Resource Managers (SRM).", acknowledgement = ack-nhfb, keywords = "file caching; cache replacement algorithm; trace-driven simulation; data staging; storage resource management", } @InProceedings{Radovic:2002:ESN, author = "Zoran Radovic and Erik Hagersten", title = "Efficient Synchronization for Nonuniform Communication Architectures", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap221.pdf", abstract = "Scalable parallel computers are often nonuniform communication architectures (NUCAs), where the access time to other processor's caches vary with their physical location. Still, few attempts of exploring cache-to-cache communication locality have been made. This paper introduces a new kind of synchronization primitives (lock-unlock) that favor neighboring processors when a lock is released. This improves the lock handover time as well as access time to the shared data of the critical region. A critical section guarded by our new RH lock takes less than half the time to execute compared with the same critical section guarded by any other lock on our NUCA hardware. The execution time for Raytrace with 28 processors was improved 2.23--4.68 times, while global traffic was dramatically decreased compared with all the other locks. 
The average execution time was improved 7--24\% while the global traffic was decreased 8--28\% for an average over the seven applications studied.", acknowledgement = ack-nhfb, } @InProceedings{Sistare:2002:UHP, author = "Steven J. Sistare and Christopher J. Jackson", title = "Ultra-High Performance Communication with {MPI} and the {Sun Fire(\TM)} Link Interconnect", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap142.pdf", abstract = "We present a new low-latency system area network that provides the ultra-high bandwidth needed to fuse a collection of large SMP servers into a capability cluster. The network adapter exports a remote shared memory (RSM) model that supports low latency kernel bypass messaging. The Sun\TM{} MPI library uses the RSM interface to implement a highly efficient memory-to-memory messaging protocol in which the library directly manages buffers and data structures in remote memory. This allows flexible allocation of buffer space to active connections, while avoiding resource contention that could otherwise increase latencies. We discuss the characteristics of the interconnect, describe the MPI protocols, and measure the performance of a number of MPI benchmarks. 
Our results include MPI inter-node bandwidths of almost 3 Gigabytes per second and MPI ping-pong latencies as low as 3.7 microseconds.", acknowledgement = ack-nhfb, keywords = "interconnects; MPI; kernel bypass; remote shared memory; SAN; performance evaluation", } @InProceedings{Eberle:2002:SHB, author = "Hans Eberle and Nils Gura", title = "Separated High-bandwidth and Low-latency Communication in the Cluster Interconnect {Clint}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap259.pdf", abstract = "An interconnect for a high-performance cluster has to be optimized in respect to both high throughput and low latency. To avoid the tradeoff between throughput and latency, the cluster interconnect Clint has a segregated architecture that provides two physically separate transmission channels: A bulk channel optimized for high-bandwidth traffic and a quick channel optimized for low-latency traffic. Different scheduling strategies are applied. The bulk channel uses a scheduler that globally allocates time slots on the transmission paths before packets are sent off. This way collisions as well as blockages are avoided. In contrast, the quick channel takes a best-effort approach by sending packets whenever they are available thereby risking collisions and retransmissions.\par Simulation results clearly show the performance advantages of the segregated architecture. The carefully scheduled bulk channel can be loaded nearly to its full capacity without exhibiting head-of-line blocking that limits many networks while the quick channel provides low-latency communication even in the presence of high-bandwidth traffic.", acknowledgement = ack-nhfb, } @InProceedings{Vetter:2002:EPE, author = "Jeffrey S. 
Vetter and Andy Yoo", title = "An Empirical Performance Evaluation of Scalable Scientific Applications", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap222.pdf", abstract = "We investigate the scalability, architectural requirements, and performance characteristics of eight scalable scientific applications. Our analysis is driven by empirical measurements using statistical and tracing instrumentation for both communication and computation. Based on these measurements, we refine our analysis into precise explanations of the factors that influence performance and scalability for each application; we distill these factors into common traits and overall recommendations for both users and designers of scalable platforms. Our experiments demonstrate that some traits, such as improvements in the scaling and performance of MPI's collective operations, will benefit most applications. We also find specific characteristics of some applications that limit performance. For example, one application's intensive use of a 64-bit, floating-point divide instruction, which has high latency and is not pipelined on the POWER3, limits the performance of the application's primary computation.", acknowledgement = ack-nhfb, } @InProceedings{El-Ghazawi:2002:UPP, author = "Tarek El-Ghazawi and Fran{\c{c}}ois Cantonnet", title = "{UPC} Performance and Potential: {A} {NPB} Experimental Study", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap316.pdf", abstract = "UPC, or Unified Parallel C, is a parallel extension of ANSI C. UPC follows a distributed shared memory programming model aimed at leveraging the ease of programming of the shared memory paradigm, while enabling the exploitation of data locality. 
UPC incorporates constructs that allow placing data near the threads that manipulate them to minimize remote accesses. This paper gives an overview of the concepts and features of UPC and establishes, through extensive performance measurements of NPB workloads, the viability of the UPC programming language compared to the other popular paradigms. Further, through performance measurements we identify the challenges, the remaining steps and the priorities for UPC. It will be shown that with proper hand tuning and optimized collective operations libraries, UPC performance will be comparable to that of MPI. Furthermore, by incorporating such improvements into automatic compiler optimizations, UPC will compare quite favorably to message passing in ease of programming.", acknowledgement = ack-nhfb, keywords = "NPB (NAS Parallel Benchmark)", } @InProceedings{Worley:2002:SUC, author = "Patrick H. Worley", title = "Scaling the Unscalable: {A} Case Study on the {AlphaServer SC}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap155.pdf", abstract = "A case study of the optimization of a climate modeling application on the Compaq AlphaServer SC at the Pittsburgh Supercomputer Center is used to illustrate tools and techniques that are important to achieving good performance scaling.", acknowledgement = ack-nhfb, } @InProceedings{Schussman:2002:AVT, author = "Greg Schussman and Brett Wilson and Kwok Ko and Ji Qiang and Robert Ryne and Kwan-Liu Ma", title = "Advanced Visualization Technology for Terascale Particle Accelerator Simulations", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap224.pdf", abstract = "This paper presents two new hardware-assisted rendering techniques developed for interactive visualization of the terascale data generated from numerical modeling of next-generation 
accelerator designs. The first technique, based on a hybrid rendering approach, makes possible interactive exploration of large-scale particle data from particle beam dynamics modeling. The second technique, based on a compact texture-enhanced representation, exploits the advanced features of commodity graphics cards to achieve perceptually effective visualization of the very dense and complex electromagnetic fields produced from the modeling of reflection and transmission properties of open structures in an accelerator design. Because of the collaborative nature of the overall accelerator modeling project, the visualization technology developed is for both desktop and remote visualization settings. We have tested the techniques using both time-varying particle data sets containing up to one billion particles per time step and electromagnetic field data sets with millions of mesh elements.", acknowledgement = ack-nhfb, keywords = "hardware-assisted techniques; high-performance computing; particle accelerators; perception; point-based rendering; scientific visualization; field lines; texture mapping; time-varying data; vector field visualization; visual cues; volume rendering", } @InProceedings{Wolf:2002:SPS, author = "Matthew Wolf and Zhongtang Cai and Weiyun Huang and Karsten Schwan", title = "{SmartPointers}: Personalized Scientific Data Portals In Your Hand", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap304.pdf", abstract = "The SmartPointer system provides a paradigm for utilizing multiple light-weight client endpoints in a real-time scientific visualization infrastructure. Together, the client and server infrastructure form a new type of data portal for scientific computing. The clients can be used to personalize data for the needs of the individual scientist. 
This personalization of a shared dataset is designed to allow multiple scientists, each with their laptops or iPaqs to explore the dataset from different angles and with different personalized filters. As an example, iPaq clients can display 2D derived data functions which can be used to dynamically update and annotate the shared data space, which might be visualized separately on a large immersive display such as a CAVE. Measurements are presented for such a system, built upon the ECho middleware system developed at Georgia Tech.", acknowledgement = ack-nhfb, } @InProceedings{Snavely:2002:FPM, author = "Allan Snavely and Laura Carrington and Nicole Wolter and Jesus Labarta and Rosa Badia and Avi Purkayastha", title = "A Framework for Performance Modeling and Prediction", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap201.pdf", abstract = "Cycle-accurate simulation is far too slow for modeling the expected performance of full parallel applications on large HPC systems. And just running an application on a system and observing wallclock time tells you nothing about why the application performs as it does (and is anyway impossible on yet-to-be-built systems). 
Here we present a framework for performance modeling and prediction that is faster than cycle-accurate simulation, more informative than simple benchmarking, and is shown useful for performance investigations in several dimensions.", acknowledgement = ack-nhfb, } @InProceedings{Gopalan:2002:IRL, author = "Kartik Gopalan and Tzi-cker Chiueh", title = "Improving Route Lookup Performance Using Network Processor Cache", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap272.pdf", abstract = "Earlier research has shown that the route lookup performance of a network processor can be significantly improved by caching ranges of lookup/classification keys rather than individual keys. While the previous work focused specifically on reducing capacity misses, we address two other important aspects --- (a) reducing conflict misses and (b) cache consistency during frequent route updates. We propose two techniques to minimize conflict misses that aim to balance the number of cacheable entries mapped to each cache set. They offer different tradeoffs between performance and simplicity while improving the average route lookup time by 76\% and 45.2\% respectively. To maintain cache consistency during frequent route updates, we propose a selective cache invalidation technique that can limit the degradation in lookup latency to within 10.2\%. 
Our results indicate potentially large improvement in lookup performance for network processors used at Internet edge and motivate further research into caching at the Internet core.", acknowledgement = ack-nhfb, } @InProceedings{Athanasaki:2002:PST, author = "Maria Athanasaki and Aristidis Sotiropoulos and Georgios Tsoukalas and Nectarios Koziris", title = "Pipelined Scheduling of Tiled Nested Loops onto Clusters of {SMP}s using Memory Mapped Network Interfaces", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap132.pdf", abstract = "This paper describes the performance benefits attained using enhanced network interfaces to achieve low latency communication. We present a novel, pipelined scheduling approach which takes advantage of DMA communication mode, to send data to other nodes, while the CPUs are performing calculations. We also use zero-copy communication through pinned-down physical memory regions, provided by NIC's driver modules. Our testbed concerns the parallel execution of tiled nested loops onto a cluster of SMP nodes with single PCI-SCI NICs inside each node. In order to schedule tiles, we apply a hyperplane-based grouping transformation to the tiled space, so as to group together independent neighboring tiles and assign them to the same SMP node. 
Experimental evaluation illustrates that memory mapped NICs with enhanced communication features enable the use of a more advanced pipelined (overlapping) schedule, which considerably improves performance, compared to an ordinary blocking schedule, implemented with conventional, CPU and kernel bounded, communication primitives.", acknowledgement = ack-nhfb, keywords = "memory mapped network interfaces; DMA; pipelined schedules; tile grouping; communication overlapping; SMPs", } @InProceedings{Hiraki:2002:DRU, author = "Kei Hiraki and Mary Inaba and Junji Tamatsukuri and Ryutaro Kurusu and Yukichi Ikuta and Hisashi Koga and Akira Zinzaki", title = "Data Reservoir: Utilization of Multi-Gigabit Backbone Network for Data-Intensive Research", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap327.pdf", abstract = "We propose data sharing facility for data intensive scientific research, ``Data Reservoir''; which is optimized to transfer huge amount of data files between distant places fully utilizing multi-gigabit backbone network. In addition, ``Data Reservoir'' can be used as an ordinary UNIX server in local network without any modification of server software. We use low-level protocol and hierarchical striping to realize (1) separation of bulk data transfer and local accesses by caching, (2) file-system transparency, i.e., interoperable whatever in higher layer than disk driver, including file system. (3) scalability for network and storage. This paper shows our design, implementation using iSCSI protocol [1] and their performances for both 1Gbps model in the real network and 10Gbps model in our laboratory.", acknowledgement = ack-nhfb, } @InProceedings{Li:2002:NSA, author = "Laura Grigori and Xiaoye S. 
Li", title = "A New Scheduling Algorithm For Parallel Sparse {LU} Factorization with Static Pivoting", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap139.pdf", abstract = "In this paper we present a static scheduling algorithm for parallel sparse LU factorization with static pivoting. The algorithm is divided into mapping and scheduling phases, using the symmetric pruned graphs of $L^T$ and $U$ to represent dependencies. The scheduling algorithm is designed for driving the parallel execution of the factorization on a distributed-memory architecture. Experimental results and comparisons with SuperLU\_DIST are reported after applying this algorithm on real world application matrices on an IBM SP RS/6000 distributed memory machine.", acknowledgement = ack-nhfb, } @InProceedings{Vuduc:2002:POB, author = "Richard Vuduc and James W. Demmel and Katherine A. Yelick and Shoaib Kamil and Rajesh Nishtala and Benjamin Lee", title = "Performance Optimizations and Bounds for Sparse Matrix-Vector Multiply", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap317.pdf", abstract = "We consider performance tuning, by code and data structure reorganization, of sparse matrix-vector multiply (SpMxV), one of the most important computational kernels in scientific applications. This paper addresses the fundamental questions of what limits exist on such performance tuning, and how closely tuned code approaches these limits. Specifically, we develop upper and lower bounds on the performance (Mflop/s) of SpMxV when tuned using our previously proposed register blocking optimization. These bounds are based on the non-zero pattern in the matrix and the cost of basic memory operations, such as cache hits and misses. 
We evaluate our tuned implementations with respect to these bounds using hardware counter data on 4 different platforms and on a test set of 44 sparse matrices. We find that we can often get within 20\% of the upper bound, particularly on a class of matrices from finite element modeling (FEM) problems; on non-FEM matrices, performance improvements of $2\times$ are still possible. Lastly, we present a new heuristic that selects optimal or near-optimal register block sizes (the key tuning parameters) more accurately than our previous heuristic. Using the new heuristic, we show improvements in SpMxV performance (Mflop/s) by as much as $2.5\times$ over an untuned implementation. Collectively, our results suggest that future performance improvements, beyond those that we have already demonstrated for SpMxV, will come from two sources: (1) consideration of higher-level matrix structures (e.g., exploiting symmetry, matrix reordering, multiple register block sizes), and (2) optimizing kernels with more opportunity for data reuse (e.g., sparse matrix-multiple vector multiply, multiplication of $A^T A$ by a vector).", acknowledgement = ack-nhfb, } @InProceedings{Teranishi:2002:NDM, author = "Keita Teranishi and Padma Raghavan and Esmond Ng", title = "A New Data-Mapping Scheme For Latency-Tolerant Distributed Sparse Triangular Solution", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap238.pdf", abstract = "This paper concerns latency-tolerant schemes for the efficient parallel solution of sparse triangular linear systems on distributed memory multiprocessors. Such triangular solution is required when sparse Cholesky factors are used to solve for a sequence of right-hand-side vectors or when incomplete sparse Cholesky factors are used to precondition a Conjugate Gradients iterative solver. 
In such applications, the use of traditional distributed substitution schemes can create a performance bottleneck when the latency of interprocessor communication is large. We had earlier developed the Selective Inversion (SI) scheme to reduce communication latency costs by replacing distributed substitution by parallel matrix vector multiplication. We now present a new two-way mapping of the triangular sparse matrix to processors to improve the performance of SI by halving its communication latency costs. We provide analytic results for model sparse matrices and we report on the performance of our scheme for parallel preconditioning with incomplete sparse Cholesky factors.", acknowledgement = ack-nhfb, } @InProceedings{Traff:2002:IMP, author = "Jesper Larsson Tr{\"a}ff", title = "Implementing the {MPI} Process Topology Mechanism", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap122.pdf", abstract = "The topology functionality of the Message Passing Interface (MPI) provides a portable, architecture-independent means for adapting application programs to the communication architecture of the target hardware. However, current MPI implementations rarely go beyond the most trivial implementation, and simply perform no process remapping. We discuss the potential of the topology mechanism for systems with a hierarchical communication architecture like clusters of SMP nodes. The MPI topology functionality is a weak mechanism, and we argue about some of its shortcomings. We formulate the topology optimization problem as a graph embedding problem, and show that for hierarchical systems it can be solved by graph partitioning. We state the properties of a new heuristic for solving both the embedding problem and the ``easier'' graph partitioning problem. The graph partitioning based framework has been fully implemented in MPI/SX for the NEC SX-series of parallel vector computers. 
MPI/SX is thus one of very few MPI implementations with a non-trivial topology functionality. On a 4 node NEC SX-6 significant communication performance improvements are achieved with synthetic MPI benchmarks.", acknowledgement = ack-nhfb, } @InProceedings{Bosilca:2002:MVT, author = "George Bosilca and Aurelien Bouteiller and Franck Cappello and Samir Djilali and Gilles Fedak and Cecile Germain and Thomas Herault and Pierre Lemarinier and Oleg Lodygensky and Frederic Magniette and Vincent Neri and Anton Selikhov", title = "{MPICH-V}: Toward a Scalable Fault Tolerant {MPI} for Volatile Nodes", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap298.pdf", abstract = "Global Computing platforms, large scale clusters and future TeraGRID systems gather thousands of nodes for computing parallel scientific applications. At this scale, node failures or disconnections are frequent events. This Volatility reduces the MTBF of the whole system in the range of hours or minutes. We present MPICH-V, an automatic Volatility tolerant MPI environment based on uncoordinated checkpoint/ rollback and distributed message logging. MPICH-V architecture relies on Channel Memories, Checkpoint servers and theoretically proven protocols to execute existing or new, SPMD and Master-Worker MPI applications on volatile nodes. To evaluate its capabilities, we run MPICH-V within a framework for which the number of nodes, Channels Memories and Checkpoint Servers can be completely configured as well as the node Volatility. We present a detailed performance evaluation of every component of MPICH-V and its global performance for non-trivial parallel applications. 
Experimental results demonstrate good scalability and high tolerance to node volatility.", acknowledgement = ack-nhfb, } @InProceedings{Chiu:2002:PMM, author = "Kenneth Chiu and Madhusudhan Govindaraju and Dennis Gannon", title = "The {Proteus Multiprotocol Message Library}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap315.pdf", abstract = "Grid systems span manifold organizations and application domains. Because this diverse environment inevitably engenders multiple protocols, interoperability mechanisms are crucial to seamless, pervasive access. This paper presents the design, rationale, and implementation of the Proteus multiprotocol library for integrating multiple message protocols, such as SOAP and JMS, within one system. Proteus decouples application code from protocol code at run-time, allowing clients to incorporate separately developed protocols without recompiling or halting. Through generic serialization, which separates the transfer syntax from the message type, protocols can also be added independently of serialization routines. We also show performance-enhancing mechanisms for Grid services that examine metadata, but pass actual data through opaquely (such as adapters). The interface provided to protocol implementors is general enough to support protocols as disparate as our current implementations: SOAP, JMS, and binary. 
Proteus is written in C++; a Java port is planned.", acknowledgement = ack-nhfb, } @InProceedings{Parello:2002:IAA, author = "David Parello and Olivier Temam and Jean-Marie Verdun", title = "On Increasing Architecture Awareness in Program Optimizations to Bridge the Gap between Peak and Sustained Processor Performance -- Matrix-Multiply Revisited", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap107.pdf", abstract = "As the complexity of processor architectures increases, there is a widening gap between peak processor performance and sustained processor performance so that programs now tend to exploit only a fraction of available performance. While there is a tremendous amount of literature on program optimizations, compiler optimizations lack efficiency because they are plagued by three flaws: (1) they often implicitly use simplified, if not simplistic, models of processor architecture, (2) they usually focus on a single processor component (e.g., cache) and ignore the interactions among multiple components, (3) the most heavily investigated components (e.g., caches) sometimes have only a small impact on overall performance. Through the in-depth analysis of a simple program kernel, we want to show that understanding the complex interactions between programs and the numerous processor architecture components is both feasible and critical to design efficient program optimizations.", acknowledgement = ack-nhfb, } @InProceedings{Pike:2002:BTA, author = "Geoff Pike and Paul N. Hilfinger", title = "Better Tiling and Array Contraction for Compiling Scientific Programs", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap260.pdf", abstract = "Scientific programs often include multiple loops over the same data; interleaving parts of different loops may greatly improve performance. 
We exploit this in a compiler for Titanium, a dialect of Java. Our compiler combines reordering optimizations such as loop fusion and tiling with storage optimizations such as array contraction (eliminating or reducing the size of temporary arrays). The programmers we have in mind are willing to spend some time tuning their code and their compiler parameters. Given that, and the difficulty in statically selecting parameters such as tile sizes, it makes sense to provide automatic parameter searching alongside the compiler. Our strategy is to optimize aggressively but to expose the compiler's decisions to external control. We double or triple the performance of Gauss--Seidel relaxation and multigrid (versus an optimizing compiler without tiling and array contraction), and we argue that ours is the best compiler for that kind of program.", acknowledgement = ack-nhfb, } @InProceedings{Vetter:2002:APE, author = "Jeffrey S. Vetter and Patrick H. Worley", title = "Asserting Performance Expectations", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap275.pdf", abstract = "Traditional techniques for performance analysis provide a means for extracting and analyzing raw performance information from applications. Users then compare this raw data to their performance expectations for application constructs. This comparison can be tedious for the scale of today's architectures and software systems. To address this situation, we present a methodology and prototype that allows users to assert performance expectations explicitly in their source code using performance assertions. As the application executes, each performance assertion in the application collects data implicitly to verify the assertion. 
By allowing the user to specify a performance expectation with individual code segments, the runtime system can jettison raw data for measurements that pass their expectation, while reacting to failures with a variety of responses. We present several compelling uses of performance assertions with our operational prototype, including raising a performance exception, validating a performance model, and adapting an algorithm empirically at runtime.", acknowledgement = ack-nhfb, } @InProceedings{Makino:2002:TSP, author = "Junichiro Makino and Eiichiro Kokubo and Toshiyuki Fukushige and Hiroshi Daisaka", title = "A {29.5 Tflops} simulation of planetesimals in {Uranus-Neptune} region on {GRAPE-6}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap146.pdf", abstract = "As an entry for the 2002 Gordon Bell performance prize, we report the performance achieved on the GRAPE-6 system for a simulation of the early evolution of the protoplanet-planetesimal system of the Uranus-Neptune region. GRAPE-6 is a special-purpose computer for astrophysical N-body calculations. The present configuration has 2048 custom pipeline chips, each containing six pipeline processors for the calculation of gravitational interactions between particles. Its theoretical peak performance is 63.4 Tflops. 
The actual performance obtained was 29.5 Tflops, for a simulation of the early evolution of outer Solar system with 1.8 million planetesimals and two massive protoplanets.", acknowledgement = ack-nhfb, } @InProceedings{Bhardwaj:2002:SSS, author = "Manoj Bhardwaj and Kendall Pierson and Garth Reese and Tim Walsh and David Day and Ken Alvin and James Peery and Charbel Farhat and Michel Lesoinne", title = "{Salinas}: {A} Scalable Software for High-Performance Structural and Solid Mechanics Simulations", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap216.pdf", abstract = "We present Salinas, a scalable implicit software application for the finite element static and dynamic analysis of complex structural real-world systems. This relatively complete engineering software with more than 100,000 lines of code and a long list of users sustains 292.5 Gflop/s on 2,940 ASCI Red processors, and 1.16 Tflop/s on 3,375 ASCI White processors.", acknowledgement = ack-nhfb, } @InProceedings{Phillips:2002:NBS, author = "James C. Phillips and Gengbin Zheng and Sameer Kumar and Laxmikant V. Kal{\'e}", title = "{NAMD}: Biomolecular Simulation on Thousands of Processors", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap277.pdf", abstract = "NAMD is a fully featured, production molecular dynamics program for high performance simulation of large biomolecular systems. We have previously, at SC2000, presented scaling results for simulations with cutoff electrostatics on up to 2048 processors of the ASCI Red machine, achieved with an object-based hybrid force and spatial decomposition scheme and an aggressive measurement-based predictive load balancing framework. 
We extend this work by demonstrating similar scaling on the much faster processors of the PSC Lemieux Alpha cluster, and for simulations employing efficient (order N log N) particle mesh Ewald full electrostatics. This unprecedented scalability in a biomolecular simulation code has been attained through latency tolerance, adaptation to multiprocessor nodes, and the direct use of the Quadrics Elan library in place of MPI by the Charm++/Converse parallel runtime system.", acknowledgement = ack-nhfb, } @InProceedings{Lee:2002:IOG, author = "William Lee and Anthony Mayer and Steven Newhouse", title = "{ICENI}: An {Open Grid Service Architecture} Implemented with {Jini}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap253.pdf", abstract = "The move towards Service Grids, where services are composed to meet the requirements of a user community within constraints specified by the resource provider, present many challenges to service provision and description. To support our research activities in the autonomous composition of services to form a Semantic Service Grid we describe the adoption within ICENI of web services to enable interoperability with the recently proposed Open Grid Services Architecture.", acknowledgement = ack-nhfb, keywords = "Computational Grids; Web Services; Semantic Grid", } @InProceedings{Hoschek:2002:WSD, author = "Wolfgang Hoschek", title = "The {Web Service Discovery Architecture}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap161.pdf", abstract = "In this paper, we propose the Web Service Discovery Architecture (WSDA). At runtime, Grid applications can use this architecture to discover and adapt to remote services. 
WSDA promotes an interoperable web service discovery layer by defining appropriate services, interfaces, operations and protocol bindings, based on industry standards. It is unified because it subsumes an array of disparate concepts, interfaces and protocols under a single semi-transparent umbrella. It is modular because it defines a small set of orthogonal multipurpose communication primitives (building blocks) for discovery. These primitives cover service identification, service description retrieval, data publication as well as minimal and powerful query support. The architecture is open and flexible because each primitive can be used, implemented, customized and extended in many ways. It is powerful because the individual primitives can be combined and plugged together by specific clients and services to yield a wide range of behaviors and emerging synergies.", acknowledgement = ack-nhfb, keywords = "WSDA (Web Service Discovery Architecture)", } @InProceedings{Pierce:2002:IWS, author = "Marlon Pierce and Geoffrey Fox and Choonhan Youn and Steve Mock and Kurt Mueller and Ozgur Balsoy", title = "Interoperable {Web} Services for Computational Portals", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap284.pdf", abstract = "Computational web portals are designed to simplify access to diverse sets of high performance computing resources, typically through an interface to computational Grid tools. An important shortcoming of these portals is their lack of interoperable and reusable services. This paper presents an overview of research efforts undertaken by our group to build interoperating portal services around a Web Services model. 
We present a comprehensive view of an interoperable portal architecture, beginning with core portal services that can be used to build Application Web Services, which in turn may be aggregated and managed through portlet containers.", acknowledgement = ack-nhfb, } @InProceedings{Stamatakis:2002:APM, author = "Alexandros P. Stamatakis and Thomas Ludwig and Harald Meier and Marty J. Wolf", title = "Accelerating Parallel Maximum Likelihood-based Phylogenetic Tree Calculations using Subtree Equality Vectors", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap113.pdf", abstract = "Heuristics for calculating phylogenetic trees for large sets of aligned rRNA sequences based on the maximum likelihood method are computationally expensive. The core of most parallel algorithms, which accounts for the greatest part of computation time, is the tree evaluation function that calculates the likelihood value for each tree topology. This paper describes and uses Subtree Equality Vectors (SEVs) to reduce the number of required floating point operations during topology evaluation. We integrated our optimizations into various sequential programs and into parallel fastDNAml, one of the most common and efficient parallel programs for calculating large phylogenetic trees. Experimental results for our parallel program, which renders exactly the same output as parallel fastDNAml, show global run time improvements of 26\% to 65\%. 
The optimization scales best on clusters of PCs, which also implies a substantial cost saving factor for the determination of large trees.", acknowledgement = ack-nhfb, } @InProceedings{Akcelik:2002:PMG, author = "Volkan Akcelik and George Biros and Omar Ghattas", title = "Parallel Multiscale {Gauss--Newton--Krylov} Methods for Inverse Wave Propagation", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap330.pdf", abstract = "One of the outstanding challenges of computational science and engineering is large-scale nonlinear parameter estimation of systems governed by partial differential equations. These are known as inverse problems, in contradistinction to the forward problems that usually characterize large-scale simulation. Inverse problems are significantly more difficult to solve than forward problems, due to ill-posedness, large dense ill-conditioned operators, multiple minima, space-time coupling, and the need to solve the forward problem repeatedly. We present a parallel algorithm for inverse problems governed by time-dependent PDEs, and scalability results for an inverse wave propagation problem of determining the material field of an acoustic medium. The difficulties mentioned above are addressed through a combination of total variation regularization, preconditioned matrix-free Gauss--Newton--Krylov iteration, algorithmic checkpointing, and multiscale continuation. 
We are able to solve a synthetic inverse wave propagation problem through a pelvic bone geometry involving 2.1 million inversion parameters in 3 hours on 256 processors of the Terascale Computing System at the Pittsburgh Supercomputing Center.", acknowledgement = ack-nhfb, } @InProceedings{Hariharan:2002:SPF, author = "Bhanu Hariharan and Srinivas Aluru and Balasubramaniam Shanker", title = "A Scalable Parallel Fast Multipole Method for Analysis of Scattering from Perfect Electrically Conducting Surfaces", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap295.pdf", abstract = "In this paper, we develop a parallel Fast Multipole Method (FMM) based solution for computing the scattered electromagnetic fields from a Perfect Electrically Conducting (PEC) surface. The main contributions of this work are the development of parallel algorithms with the following characteristics: (1) provably efficient worst-case run-time irrespective of the shape of the scatterer, (2) communication efficiency, and (3) guaranteed load balancing within a small constant factor. We have developed a scalable, parallel code and validated it against surfaces for which solution can be computed analytically, and against serial software. The efficiency and scalability of the code is demonstrated with experimental results on an IBM xSeries cluster. Though developed in the context of this particular application, our algorithms can be used in other applications involving parallel FMM.", acknowledgement = ack-nhfb, } @InProceedings{Karniadakis:2002:DLP, author = "Suchuan Dong and George Em. 
Karniadakis", title = "Dual-Level Parallelism for Deterministic and Stochastic {CFD} Problems", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap137.pdf", abstract = "A hybrid two-level parallelism using MPI/OpenMP is implemented in the general-purpose spectral/hp element CFD code NekTar to take advantage of the hierarchical structures arising in deterministic and stochastic CFD problems. We take a coarse grain approach to shared-memory parallelism with OpenMP and employ a workload-splitting scheme that can reduce the OpenMP synchronizations to the minimum. The hybrid implementation shows good scalability with respect to both the problem size and the number of processors in case of a fixed problem size. With the same number of processors, the hybrid model with 2 (or 4) OpenMP threads per MPI process is observed to perform better than pure MPI and pure OpenMP on the NCSA SGI Origin 2000, while the pure MPI model performs the best on the IBM SP3 at SDSC and on the Compaq Alpha cluster at PSC. A key new result is that the use of threads facilitates effectively $p$-refinement, which is crucial to adaptive discretization using high-order methods.", acknowledgement = ack-nhfb, } @InProceedings{Tapus:2002:AHT, author = "Cristian T{\u{a}}pu{\c{s}} and I-Hsin Chung and Jeffrey K. Hollingsworth", title = "{Active Harmony}: Towards Automated Performance Tuning", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap138.pdf", abstract = "In this paper, we present the Active Harmony automated runtime tuning system. We describe the interface used by programs to make applications tunable. We present the Library Specification Layer which helps program library developers expose multiple variations of the same API using different algorithms. 
The Library Specification Language helps to select the most appropriate program library to tune the overall performance. We also present the optimization algorithm used to adjust parameters in the application and the libraries. Finally, we present results that show how the system is able to tune several real applications. The automated tuning system is able to tune the application parameters to within a few percent of the best value after evaluating only 11 out of over 1,700 possible configurations.", acknowledgement = ack-nhfb, } @InProceedings{Rauber:2002:LSH, author = "Thomas Rauber and Gudula R{\"u}nger", title = "Library Support for Hierarchical Multi-Processor Tasks", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap176.pdf", abstract = "The paper considers the modular programming with hierarchically structured multi-processor tasks on top of SPMD tasks for distributed memory machines. The parallel execution requires a corresponding decomposition of the set of processors into a hierarchical group structure onto which the tasks are mapped. This results in a multi-level group SPMD computation model with varying processor group structures. The advantage of this kind of mixed task and data parallelism is a potential to reduce the communication overhead and to increase scalability. We present a runtime library to support the coordination of hierarchically structured multi-processor tasks. The library exploits an extended parallel group SPMD programming model and manages the entire task execution including the dynamic hierarchy of processor groups. The library is built on top of MPI, has an easy-to-use interface, and leads to only a marginal overhead while allowing static planning and dynamic restructuring. 
", acknowledgement = ack-nhfb, keywords = "mixed task and data parallelism; multiprocessor tasks; multilevel group SPMD; hierarchical decomposition of processor sets; library support; distributed memory", } @InProceedings{Frachtenberg:2002:SLF, author = "Eitan Frachtenberg and Fabrizio Petrini and Juan Fernandez and Salvador Coll and Scott Pakin", title = "{STORM}: Lightning-Fast Resource Management", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap297.pdf", abstract = "Although workstation clusters are a common platform for high-performance computing (HPC), they remain more difficult to manage than sequential systems or even symmetric multiprocessors. Furthermore, as cluster sizes increase, the quality of the resource-management subsystem --- essentially, all of the code that runs on a cluster other than the applications --- increasingly impacts application efficiency. In this paper, we present STORM, a resource-management framework designed for scalability and performance. The key innovation behind STORM is a software architecture that enables resource management to exploit low-level network features. As a result of this HPC-application-like design, STORM is orders of magnitude faster than the best reported results in the literature on two sample resource-management functions: job launching and process scheduling.", acknowledgement = ack-nhfb, } @InProceedings{Colarelli:2002:MAI, author = "Dennis Colarelli and Dirk Grunwald", title = "Massive Arrays of Idle Disks For Storage Archives", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap312.pdf", abstract = "The declining costs of commodity disk drives is rapidly changing the economics of deploying large amounts of online or near-line storage. 
Conventional mass storage systems use either high performance RAID clusters, automated tape libraries or a combination of tape and disk. In this paper, we analyze an alternative design using massive arrays of idle disks, or MAID. We argue that this storage organization provides storage densities matching or exceeding those of tape libraries with performance similar to disk arrays. Moreover, we show that with effective power management of individual drives, this performance can be achieved using a very small power budget. In particular, we show that our power management strategy can result in the performance comparable to an always-on RAID system while using $1/15$th the power of such a RAID system.", acknowledgement = ack-nhfb, } @InProceedings{Sterling:2002:GMP, author = "Thomas L. Sterling and Hans P. Zima", title = "{Gilgamesh}: {A} Multithreaded Processor-In-Memory Architecture for Petaflops Computing", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap105.pdf", abstract = "Processor-in-Memory (PIM) architectures avoid the von Neumann bottleneck in conventional machines by integrating high-density DRAM and CMOS logic on the same chip. Parallel systems based on this new technology are expected to provide higher scalability, adaptability, robustness, fault tolerance and lower power consumption than current MPPs or commodity clusters. In this paper we describe the design of Gilgamesh, a PIM-based massively parallel architecture, and elements of its execution model. Gilgamesh extends existing PIM capabilities by incorporating advanced mechanisms for virtualizing tasks and data and providing adaptive resource management for load balancing and latency tolerance. The Gilgamesh execution model is based on macroservers, a middleware layer which supports object-based runtime management of data and threads allowing explicit and dynamic control of locality and load balancing. 
The paper concludes with a discussion of related research activities and an outlook to future work.", acknowledgement = ack-nhfb, } @InProceedings{Acacio:2002:OPA, author = "Manuel E. Acacio and Jose Gonzalez and Jose M. Garcia and Jose Duato", title = "Owner Prediction for Accelerating Cache-to-Cache Transfer Misses in a cc-{NUMA} Architecture", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap102.pdf", abstract = "Cache misses for which data must be obtained from a remote cache (cache-to-cache transfer misses) account for an important fraction of the total miss rate. Unfortunately, cc-NUMA designs put the access to the directory information into the critical path of 3-hop misses, which significantly penalizes them compared to SMP designs. This work studies the use of owner prediction as a means of providing cc-NUMA multiprocessors with a more efficient support for cache-to-cache transfer misses. Our proposal comprises an effective prediction scheme as well as a coherence protocol designed to support the use of prediction. Results indicate that owner prediction can significantly reduce the latency of cache-to-cache transfer misses, which translates into speed-ups on application performance up to 12\%. 
In order to also accelerate most of those 3-hop misses that are either not predicted or mispredicted, the inclusion of a small and fast directory cache in every node is evaluated, leading to improvements up to 16\% on the final performance.", acknowledgement = ack-nhfb, } @InProceedings{Ishihara:2002:TDN, author = "Mitsuo Yokokawa and Ken'ichi Itakura and Atsuya Uno and Takashi Ishihara and Yukio Kaneda", title = "{16.4 Tflops} Direct Numerical Simulation of Turbulence by {Fourier} Spectral Method on the {Earth Simulator}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap273.pdf", abstract = "The high-resolution direct numerical simulations (DNSs) of incompressible turbulence with numbers of grid points up to 40963 have been executed on the Earth Simulator (ES). The DNSs are based on the Fourier spectral method, so that the equation for mass conservation is accurately solved. In DNS based on the spectral method, most of the computation time is consumed in calculating the three-dimensional (3D) Fast Fourier Transform (FFT), which requires huge-scale global data transfer and has been the major stumbling block that has prevented truly high-performance computing. By implementing new methods to efficiently perform the 3D-FFT on the ES, we have achieved DNS at 16.4 Tflops on 20483 grid points. 
The DNS yields an energy spectrum exhibiting a wide inertial subrange, in contrast to previous DNSs with lower resolutions, and therefore provides valuable data for the study of the universal features of turbulence at large Reynolds number.", acknowledgement = ack-nhfb, } @InProceedings{Sakagami:2002:TTD, author = "Hitoshi Sakagami and Hitoshi Murai and Yoshiki Seo and Mitsuo Yokokawa", title = "{14.9 TFLOPS} Three-dimensional Fluid Simulation for Fusion Science with {HPF} on the {Earth Simulator}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap147.pdf", abstract = "We succeeded in getting 14.9 TFLOPS performance when running a plasma simulation code IMPACT-3D parallelized with High Performance Fortran on 512 nodes of the Earth Simulator. The theoretical peak performance of the 512 nodes is 32 TFLOPS, which means 45\% of the peak performance was obtained with HPF. IMPACT-3D is an implosion analysis code using TVD scheme, which performs three-dimensional compressible and inviscid Eulerian fluid computation with the explicit 5-point stencil scheme for spatial differentiation and the fractional time step for time integration. The mesh size is 2048x2048x4096, and the third dimension was distributed for the parallelization. The HPF system used in the evaluation is HPF/ES, developed for the Earth Simulator by enhancing NEC HPF/SX V2 mainly in communication scalability. 
Shift communications were manually tuned to get best performance by using HPF/JA extensions, which were designed to give the users more control over sophisticated parallelization and communication optimizations.
This remarkable performance proves the effectiveness of the ES as a viable means for practical applications.", acknowledgement = ack-nhfb, } @InProceedings{Noordergraaf:2002:SSI, author = "Lisa Noordergraaf and Robert Zak", title = "{SMP} System Interconnect Instrumentation for Performance Analysis", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap158.pdf", abstract = "The system interconnect is often the performance bottleneck in SMP computers. Although modern SMPs include event counters on processors and interconnects, these provide limited information about the interaction of processors vying for shared resources. Additionally, transaction sources and addresses are not readily available, making analysis of access patterns and data locality difficult. Enhanced system interconnect instrumentation is required to extract this information.\par This paper describes instrumentation implemented for monitoring the system interconnect on Sun Fire\TM{} servers. The instrumentation supports sophisticated programmable filtering of event counters, allowing us to construct histograms of system interconnect activity, and a FIFO to capture trace sequences. Our implementation results in a very small hardware footprint, making it appropriate for inclusion in commodity hardware.\par We also describe a sampling of software tools and results based on this infrastructure. 
Applications have included performance profiling, architectural studies, and hardware bringup and debugging.", acknowledgement = ack-nhfb, } @InProceedings{Spencer:2002:EMP, author = "Matthew Spencer and Renato Ferreira and Michael Beynon and Tahsin Kurc and Umit Catalyurek and Alan Sussman and Joel Saltz", title = "Executing Multiple Pipelined Data Analysis Operations in the Grid", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap258.pdf", abstract = "Processing of data in many data analysis applications can be represented as an acyclic, coarse grain data flow, from data sources to the client. This paper is concerned with scheduling of multiple data analysis operations, each of which is represented as a pipelined chain of processing on data. We define the scheduling problem for effectively placing components onto Grid resources, and propose two scheduling algorithms. Experimental results are presented using a visualization application.", acknowledgement = ack-nhfb, } @InProceedings{Dail:2002:DSA, author = "Holly Dail and Henri Casanova and Fran Berman", title = "A Decoupled Scheduling Approach for the {GrADS} Program Development Environment", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap229.pdf", abstract = "Program development environments are instrumental in providing users with easy and efficient access to parallel computing platforms. While a number of such environments have been widely accepted and used for traditional HPC systems, there are currently no widely used environments for Grid programming. The goal of the Grid Application Development Software (GrADS) project is to develop a coordinated set of tools, libraries and run-time execution facilities for Grid program development. 
In this paper, we describe a Grid scheduler component that is integrated as part of the GrADS software system. Traditionally, application-level schedulers (e.g. AppLeS) have been tightly integrated with the application itself and were not easily applied to other applications. Our design is generic: we decouple the scheduler core (the search procedure) from the application-specific (e.g. application performance models) and platform-specific (e.g. collection of resource information) components used by the search procedure. We provide experimental validation of our approach for two representative regular, iterative parallel programs in a variety of real-world Grid testbeds. Our scheduler consistently outperforms static and user-driven scheduling methods.", acknowledgement = ack-nhfb, } @InProceedings{Annis:2002:ACV, author = "James Annis and Yong Zhao and Jens Voeckler and Michael Wilde and Steve Kent and Ian Foster", title = "Applying {Chimera} Virtual Data Concepts to Cluster Finding in the {Sloan Sky Survey}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap299.pdf", abstract = "In many scientific disciplines --- especially long running, data-intensive collaborations --- it is important to track all aspects of data capture, production, transformation, and analysis. In principle, one can then audit, validate, reproduce, and/or re-run with corrections various data transformations. We have recently proposed and prototyped the Chimera virtual data system, a new database-driven approach to this problem. We present here a major application study in which we apply Chimera to a challenging data analysis problem: the identification of galaxy clusters within the Sloan Digital Sky Survey. We describe the problem, its computational procedures, and the use of Chimera to plan and orchestrate the workflow of thousands of tasks on a data grid comprising hundreds of computers. 
This experience suggests that a general set of tools can indeed enhance the accuracy and productivity of scientific data reduction and that further development and application of this paradigm will offer great value.", acknowledgement = ack-nhfb, } @InProceedings{Andrade:2002:APG, author = "Henrique Andrade and Tahsin Kurc and Alan Sussman and Joel Saltz", title = "{Active Proxy-G}: Optimizing the Query Execution Process in the Grid", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap219.pdf", abstract = "The Grid environment facilitates collaborative work and allows many users to query and process data over geographically dispersed data repositories. Over the past several years, there has been a growing interest in developing applications that interactively analyze datasets, potentially in a collaborative setting. We describe the Active Proxy-G service that is able to cache query results, use those results for answering new incoming queries, generate subqueries for the parts of a query that cannot be produced from the cache, and submit the subqueries for final processing at application servers that store the raw datasets. We present an experimental evaluation to illustrate the effects of various design tradeoffs. 
We also show the benefits that two real applications gain from using the middleware.", acknowledgement = ack-nhfb, } @InProceedings{Chervenak:2002:GFC, author = "Ann Chervenak and Ewa Deelman and Ian Foster and Leanne Guy and Wolfgang Hoschek and Adriana Iamnitchi and Carl Kesselman and Peter Kunszt and Matei Ripeanu and Bob Schwartzkopf and Heinz Stockinger and Kurt Stockinger and Brian Tierney", title = "{Giggle}: {A} Framework for Constructing Scalable Replica Location Services", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap239.pdf", abstract = "In wide area computing systems, it is often desirable to create remote read-only copies (replicas) of files. Replication can be used to reduce access latency, improve data locality, and/or increase robustness, scalability and performance for distributed applications. We define a replica location service (RLS) as a system that maintains and provides access to information about the physical locations of copies. An RLS typically functions as one component of a data grid architecture. This paper makes the following contributions. First, we characterize RLS requirements. Next, we describe a parameterized architectural framework, which we name Giggle (for GIGa-scale Global Location Engine), within which a wide range of RLSs can be defined. We define several concrete instantiations of this framework with different performance characteristics. Finally, we present initial performance results for an RLS prototype, demonstrating that RLS systems can be constructed that meet performance goals.", acknowledgement = ack-nhfb, } @InProceedings{Bland:2002:EEI, author = "P. H. Worley and T. H. {Dunigan, Jr.} and M. R. Fahey and J. B. {White III} and A. S. 
Bland", title = "Early Evaluation of the {IBM p690}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap196.pdf", abstract = "Oak Ridge National Laboratory recently received 27 32-way IBM pSeries 690 SMP nodes. In this paper, we describe our initial evaluation of the p690 architecture, focusing on the performance of benchmarks and applications that are representative of the expected production workload.", acknowledgement = ack-nhfb, } @InProceedings{Adiga:2002:OBS, author = "N. R. Adiga and G. Almasi and G. S. Almasi and Y. Aridor and R. Barik and D. Beece and R. Bellofatto and G. Bhanot and R. Bickford and M. Blumrich and A. A. Bright and J. Brunheroto and C. Cacaval and J. Casta{\~n}os and W. Chan and L. Ceze and P. Coteus and S. Chatterjee and D. Chen and G. Chiu and T. M. Cipolla and P. Crumley and K. M. Desai and A. Deutsch and T. Domany and M. B. Dombrowa and W. Donath and M. Eleftheriou and C. Erway and J. Esch and B. Fitch and J. Gagliano and A. Gara and R. Garg and R. Germain and M. E. Giampapa and B. Gopalsamy and J. Gunnels and M. Gupta and F. Gustavson and S. Hall and R. A. Haring and D. Heidel and P. Heidelberger and L. M. Herger and D. Hoenicke and R. D. Jackson and T. Jamal-Eddine and G. V. Kopcsay and E. Krevat and M. P. Kurhekar and A. P. Lanzetta and D. Lieber and L. K. Liu and M. Lu and M. Mendell and A. Misra and Y. Moatti and L. Mok and J. E. Moreira and B. J. Nathanson and M. Newton and M. Ohmacht and A. Oliner and V. Pandit and R. B. Pudota and R. Rand and R. Regan and B. Rubin and A. Ruehli and S. Rus and R. K. Sahoo and A. Sanomiya and E. Schenfeld and M. Sharma and E. Shmueli and S. Singh and P. Song and V. Srinivasan and B. D. Steinmacher-Burow and K. Strauss and C. Surovic and R. Swetz and T. Takken and R. B. Tremaine and M. Tsao and A. R. Umamaheshwaran and P. Verma and P. Vranas and T. J. C. Ward and M. Wazlowski and W. Barrett and C. 
Engel and B. Drehmel and B. Hilgart and D. Hill and F. Kasemkhani and D. Krolak and C. T. Li and T. Liebsch and J. Marcella and A. Muff and A. Okomo and M. Rouse and A. Schram and M. Tubbs and G. Ulsh and C. Wait and J. Wittrup and M. Bae and K. Dockser and L. Kissel and M. K. Seager and J. S. Vetter and K. Yates", title = "An Overview of the {BlueGene/L} Supercomputer", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap207.pdf", abstract = "This paper gives an overview of the BlueGene/L Supercomputer. This is a jointly funded research partnership between IBM and the Lawrence Livermore National Laboratory as part of the United States Department of Energy ASCI Advanced Architecture Research Program. Application performance and scaling studies have recently been initiated with partners at a number of academic and government institutions, including the San Diego Supercomputer Center and the California Institute of Technology. This massively parallel system of 65,536 nodes is based on a new architecture that exploits system-on-a-chip technology to deliver target peak processing power of 360 teraFLOPS (trillion floating-point operations per second). The machine is scheduled to be operational in the 2004-2005 time frame, at price/performance and power consumption/performance targets unobtainable with conventional architectures.", acknowledgement = ack-nhfb, } @InProceedings{Warren:2002:HDC, author = "Michael S. Warren and Eric H. Weigle and Wu-Chun Feng", title = "High-Density Computing: {A} 240-Processor {Beowulf} in One Cubic Meter", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap210.pdf", abstract = "We present results from computations on Green Destiny, a 240-processor Beowulf cluster which is contained entirely within a single 19-inch wide 42U rack. 
The cluster consists of 240 Transmeta TM5600 667-MHz CPUs mounted on RLX Technologies motherboard blades. The blades are mounted side-by-side in an RLX 3U rack-mount chassis, which holds 24 blades. The overall cluster contains 10 chassis and associated Fast and Gigabit Ethernet switches. The system has a footprint of 0.5 meter2 (6 square feet), a volume of 0.85 meter3 (30 cubic feet) and a measured power dissipation under load of 5200 watts (including network switches). We have measured the performance of the cluster using a gravitational treecode N-body simulation of galaxy formation using 200 million particles, which sustained an average of 38.9 Gflops on 212 nodes of the system. We also present results from a three-dimensional hydrodynamic simulation of a core-collapse supernova", acknowledgement = ack-nhfb, keywords = "Beowulf; cluster; blade server; RLX; Transmeta; code morphing; VLIW; performance-per-square-foot; MIPS-per-watt", } @InProceedings{Kim:2002:UDC, author = "Seung Jo Kim and Joon-Seok Hwang and Chang Sung Lee and Sangsan Lee", title = "Utilization of Departmental Computing {GRID} System for Development of an Artificial Intelligent Tapping Inspection Method, Tapping Sound Analysis", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap167.pdf", abstract = "Tapping Sound Analysis is a new NDE method, which determines the existence of subsurface defects by comparing the tapping sound of test structure and original healthy structure. The tapping sound of original healthy structure is named sound print of the structure and is obtained through high precision computation. Because many tapping points are required to obtain the exact sound print data, many times of tapping sound simulation are required. The simulation of tapping sound requires complicated numerical procedures. Departmental Computing GRID system was utilized to run numerical simulations. 
Three cluster systems and one PC-farm system comprise DCG system. Tapping sound simulations were launched and monitored through Globus and CONDOR. A total of 160 Tera floating-point (double-precision) operations was performed and the elapsed time was 41,880 sec. From the numerical experiments, Grid computing technology reduced the necessary time to make sound print database and made TSA a feasible and practical methodology.", acknowledgement = ack-nhfb, } @InProceedings{Kikuchi:2002:CSG, author = "Hideaki Kikuchi and Rajiv K. Kalia and Aiichiro Nakano and Priya Vashishta and Hiroshi Iyetomi and Shuji Ogata and Takahisa Kouno and Fuyuki Shimojo and Kenji Tsuruta and Subhash Saini", title = "Collaborative Simulation Grid: Multiscale Quantum-Mechanical\slash Classical Atomistic Simulations on Distributed {PC} Clusters in the {US} and {Japan}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap111.pdf", abstract = "A multidisciplinary, collaborative simulation has been performed on a Grid of geographically distributed PC clusters. The multiscale simulation approach seamlessly combines (i) atomistic simulation based on the molecular dynamics (MD) method and (ii) quantum mechanical (QM) calculation based on the density functional theory (DFT), so that accurate but less scalable computations are performed only where they are needed. The multiscale MD/QM simulation code has been Grid-enabled using (i) a modular, additive hybridization scheme, (ii) multiple QM clustering, and (iii) computation/communication overlapping. The Gridified MD/QM simulation code has been used to study environmental effects of water molecules on fracture in silicon. 
A preliminary run of the code has achieved a parallel efficiency of 94\% on 25 PCs distributed over 3 PC clusters in the US and Japan, and a larger test involving 154 processors on 5 distributed PC clusters is in progress.", acknowledgement = ack-nhfb, keywords = "Grid application; multiscale simulation; molecular dynamics;quantum mechanics; density functional theory", } @InProceedings{Baldridge:2002:QGI, author = "Kim K. Baldridge and Jerry P. Greenberg and Stephen T. Elbert and Stephen Mock and Philip Papadopoulos", title = "{QMView} and {GAMESS}: Integration into the {World Wide Computational Grid}", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap141.pdf", abstract = "High performance computing, storage, visualization, and database infrastructures are increasing geometrically in complexity as scientists move towards grid-based computing. While this is natural, it has the effect of pushing computational capabilities beyond the reach of scientists because of the time needed to harness the infrastructure. Hiding the complexity of networked resources becomes essential if scientists are to utilize them effectively. In this work, we describe our efforts to integrate various computational chemistry components into a scientific computing environment. We briefly describe improvements we have made to individual components of the chemistry environment as well as future directions, followed by a more in-depth discussion of our strategy for integration into a grid workflow environment based on web services, which enables access to remote resources while shielding users from the complexities of the grid infrastructures. A preliminary schema for storing data obtained from computational chemistry calculations is also described.", acknowledgement = ack-nhfb, } @InProceedings{Bustamante:2002:SDS, author = "Fabian E. 
Bustamante and Patrick Widener and Karsten Schwan", title = "Scalable Directory Services Using Proactivity", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap202.pdf", abstract = "Common to computational grids and pervasive computing is the need for an expressive, efficient, and scalable directory service that provides information about objects in the environment. We argue that a directory interface that `pushes' information to clients about changes to objects can significantly improve scalability. This paper describes the design, implementation, and evaluation of the Proactive Directory Service (PDS). PDS' interface supports a customizable `proactive' mode through which clients can subscribe to be notified about changes to their objects of interest. Clients can dynamically tune the detail and granularity of these notifications through filter functions instantiated at the server or at the object's owner, and by remotely tuning the functionality of those filters. We compare PDS' performance against off-the-shelf implementations of DNS and the Lightweight Directory Access Protocol. Our evaluation results confirm the expected performance advantages of this approach and demonstrate that customized notification through filter functions can reduce bandwidth utilization while improving the performance of both clients and directory servers.", acknowledgement = ack-nhfb, } @InProceedings{Lee:2002:MDA, author = "Jason Lee and Dan Gunter and Martin Stoufer and Brian Tierney", title = "Monitoring Data Archives for Grid Environments", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap234.pdf", abstract = "Developers and users of high-performance distributed systems often observe performance problems such as unexpectedly low throughput or high latency. 
To determine the source of these performance problems, detailed end-to-end monitoring data from applications, networks, operating systems, and hardware must be correlated across time and space. Researchers need to be able to view and compare this very detailed monitoring data from a variety of angles. To address this problem, we propose a relational monitoring data archive that is designed to efficiently handle high-volume streams of monitoring data. In this paper we present an instrumentation and monitoring event archive service that can be used to collect and aggregate detailed end-to-end monitoring information from distributed applications. This archive service is designed to be scalable and fault tolerant. We also show how the archive is based on the ``Grid Monitoring Architecture''' defined by the Global Grid Forum.", acknowledgement = ack-nhfb, } @InProceedings{Mazzucco:2002:MMD, author = "Marco Mazzucco and Asvin Ananthanarayan and Robert L. Grossman and Jorge Levera and Gokulnath B. Rao", title = "Merging Multiple Data Streams on Common Keys Over High Performance Networks", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", URL = "http://www.sc-2002.org/paperpdfs/pap.pap213.pdf", abstract = "The model for data mining on streaming data assumes that there is a buffer of fixed length and a data stream of infinite length and the challenge is to extract patterns, changes, anomalies, and statistically significant structures by examining the data one time and storing records and derived attributes of length less than . As data grids, data webs, and semantic webs become more common, mining distributed streaming data will become more and more important. The first step when presented with two or more distributed streams is to merge them using a common key. In this paper, we present two algorithms for merging streaming data using a common key. 
We also present experimental studies showing these algorithms scale in practice to OC-12 networks.", acknowledgement = ack-nhfb, } %%% ==================================================================== %%% Cross-referenced entries must come last: @Proceedings{IEEE:2002:STI, editor = "{IEEE}", booktitle = "{SC2002}: From Terabytes to Insight. Proceedings of the {IEEE ACM SC 2002 Conference, November 16--22, 2002, Baltimore, MD, USA}", title = "{SC2002}: From Terabytes to Insight. Proceedings of the {IEEE ACM SC 2002 Conference, November 16--22, 2002, Baltimore, MD, USA}", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "????", year = "2002", ISBN = "0-7695-1524-X", ISBN-13 = "978-0-7695-1524-3", LCCN = "????", bibdate = "Thu Feb 21 18:29:36 2002", acknowledgement = ack-nhfb, }