BibTeX bibliography pvm.bib

%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "3.255",
%%%     date            = "31 August 2024",
%%%     time            = "15:18:35 MDT",
%%%     filename        = "pvm.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "39555 79325 350096 3709342",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "bibliography; BibTeX; CUDA (Compute Unified
%%%                        Device Architecture); MPI; Message Passing
%%%                        Interface; NVIDIA; OpenCL (Open Computing
%%%                        Language); OpenMP; PVM; Parallel Virtual
%%%                        Machine",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a bibliography of publications about
%%%                        PVM (Parallel Virtual Machine) software, and
%%%                        its close relative, MPI (Message Passing
%%%                        Interface).
%%%
%%%                        Publications about OpenMP are also included
%%%                        from version 2.00, because OpenMP directives
%%%                        for parallelization in a shared-memory
%%%                        environment are often combined with use of
%%%                        PVM or MPI across distributed-memory systems.
%%%
%%%                        Publications about NVIDIA's CUDA programming
%%%                        environment, and about OpenCL, are included
%%%                        from version 3.00.
%%%
%%%                        MPI and OpenMP publications may later be
%%%                        split off into a separate bibliography if
%%%                        they prove numerous enough.
%%%
%%%                        At version 3.255, the year coverage looked
%%%                        like this:
%%%
%%%                             1989 (   3)    2001 ( 145)    2013 (  68)
%%%                             1990 (   4)    2002 (  92)    2014 (  78)
%%%                             1991 (  14)    2003 (  50)    2015 (  80)
%%%                             1992 (  30)    2004 (  31)    2016 (  87)
%%%                             1993 (  99)    2005 (  65)    2017 (  99)
%%%                             1994 ( 196)    2006 (  32)    2018 (  73)
%%%                             1995 ( 251)    2007 (  44)    2019 ( 107)
%%%                             1996 ( 195)    2008 (  50)    2020 (  83)
%%%                             1997 ( 124)    2009 (  44)    2021 (  62)
%%%                             1998 (  87)    2010 (  66)    2022 (  20)
%%%                             1999 ( 116)    2011 (  62)
%%%                             2000 ( 123)    2012 ( 145)
%%%                             19xx (   2)
%%%
%%%                             Article:       1902
%%%                             Book:            37
%%%                             InCollection:     3
%%%                             InProceedings:  579
%%%                             Manual:           1
%%%                             MastersThesis:   16
%%%                             Misc:             9
%%%                             PhdThesis:        2
%%%                             Proceedings:    228
%%%                             TechReport:      50
%%%
%%%                             Total entries: 2827
%%%
%%%                        More information about PVM and MPI can be
%%%                        found on the World-Wide Web at these
%%%                        locations:
%%%
%%%                        http://www.math.utah.edu/pub/tex/bib/pvm.bib (this file)
%%%                        ftp://math.usfca.edu/pub/MPI/mpi.guide.ps
%%%                        http://lovelace.nas.nasa.gov/Parallel/SP2/MPIPerf/report.html
%%%                        http://www.arc.unm.edu/workshop/mpi/mpi.html
%%%                        http://www.epm.ornl.gov/~walker/mpi/SLIDES/mpi-tutorial.html
%%%                        http://www.mcs.anl.gov/mpi/index.html (MPI home page)
%%%                        http://www.netlib.org/utk/papers/intro-mpi/intro-mpi.html
%%%                        http://www.osc.edu/Lam/mpi/mpi_tut.html
%%%                        http://www.usi.utah.edu/user_guides/spug/
%%%                        news:comp.parallel.mpi
%%%                        http://www-unix.mcs.anl.gov/mpi/index.html
%%%
%%%                        The last of these contains pointers to
%%%                        online versions of the official MPI
%%%                        standards documents.
%%%
%%%                        This bibliography was collected from
%%%                        multiple sources:
%%%
%%%                        * the author's own files;
%%%                        * the TeX User Group bibliography
%%%                          collection on ftp.math.utah.edu in
%%%                          /pub/tex/bib;
%%%                        * the very large Computer Science
%%%                          bibliography collection on ftp.ira.uka.de
%%%                          in /pub/bibliography, to which many people
%%%                          have contributed;
%%%                        * Internet library catalogs, including
%%%                          University of California MELVYL, Stanford
%%%                          University RLIN, Library of Congress,
%%%                          OCLC;
%%%                        * Zentralblatt f{\"u}r Mathematik Mathematics
%%%                          Abstracts (http://www.emis.de/cgi-bin/MATH/);
%%%                        * the OCLC WorldCat, Contents1st, Article1st,
%%%                          Papers1st, and Proceedings databases;
%%%                        * the IEEE INSPEC databases
%%%                          (1989--1997);
%%%                        * the UnCover database.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted
%%%                        first by ascending year, and within each
%%%                        year, alphabetically by author or editor,
%%%                        and then, if necessary, by the 3-letter
%%%                        abbreviation at the end of the BibTeX
%%%                        citation tag, using the ``bibsort -byyear''
%%%                        utility.
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================

%%% LaTeX definitions written at the top of the generated bibliography.
%%% Every macro is wrapped in an \ifx \undefined guard, so it is defined
%%% only when the citing document has not already supplied its own:
%%%
%%%   \booktitle{...}   emphasized title text
%%%   \circled{...}     argument in parentheses
%%%   \emdash           em-dash (needed by the j-SPE journal abbreviation)
%%%   \reg              registered mark, built from \circled
%%%   \TM               superscript small-caps trademark mark
%%%
%%% The \hyphenation list supplies break points for names that TeX would
%%% otherwise hyphenate poorly.
@Preamble{
    "\hyphenation{
        Cor-vi
        Dough-er-ty
        Jo-seph
        Nov-em-ber
    }" #
    "\ifx \undefined \booktitle \def \booktitle #1{{{\em #1}}} \fi" #
    "\ifx \undefined \circled   \def \circled #1{(#1)}\fi" #
    "\ifx \undefined \emdash    \def \emdash {---}\fi" #
    "\ifx \undefined \reg       \def \reg {\circled{R}}\fi" #
    "\ifx \undefined \TM        \def \TM {${}^{\sc TM}$} \fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Institution abbreviations:

@String{inst-ANL-mcs            = "Mathematics and Computer Science
                                  Division, Argonne National Laboratory"}
@String{inst-ANL:adr            = "9700 South Cass Avenue, Argonne, IL
                                  60439-4801, USA"}

@String{inst-CERFACS            = "CERFACS"}
@String{inst-CERFACS:adr        = "Toulouse, France"}

@String{inst-EMORY              = "Emory University"}
@String{inst-EMORY:adr          = "Atlanta, GA, USA"}

@String{inst-IBM-WATSON         = "IBM T. J. Watson Research Center"}
@String{inst-IBM-WATSON:adr     = "Yorktown Heights, NY, USA"}

@String{inst-MSU                = "Mississippi State University"}
@String{inst-MSU:adr            = "Starkville, MS, USA"}

@String{inst-NLRC               = "NASA Langley Research Center"}
@String{inst-NLRC:adr           = "Hampton, VA, USA"}

%%% Oak Ridge National Laboratory: name and city.  The laboratory is in
%%% Oak Ridge, TN; Knoxville is the address of the UTK macros below.
@String{inst-ORNL               = "Oak Ridge National Laboratory"}
@String{inst-ORNL:adr           = "Oak Ridge, TN, USA"}

@String{inst-SCS-CMU            = "School of Computer Science,
                                  Carnegie Mellon University"}
@String{inst-SCS-CMU:adr        = "Pittsburgh, PA, USA"}

@String{inst-UAL-EE             = "Department of Electrical
                                  Engineering, University of Alabama"}
@String{inst-UAL-EE:adr         = "Tuscaloosa, AL, USA"}

@String{inst-UGA                = "University of Georgia"}
@String{inst-UGA:adr            = "Athens, GA, USA"}

@String{inst-UTK                = "University of Tennessee, Knoxville"}
@String{inst-UTK:adr            = "Knoxville, TN 37996, USA"}

@String{inst-UTK-CS             = "Department of Computer Science, University
                                  of Tennessee, Knoxville"}
@String{inst-UTK-CS:adr         = "Knoxville, TN 37996, USA"}

%%% ====================================================================
%%% Journal abbreviations:

@String{j-ACM-COMM-COMP-ALGEBRA = "ACM Communications in Computer Algebra"}

@String{j-ACM-J-EXP-ALGORITHMICS = "ACM Journal of Experimental Algorithmics"}

@String{j-ADV-COMPUT-MATH       = "Advances in computational mathematics"}

@String{j-ADV-WATER-RESOURCES   = "Advances in water resources"}

@String{j-AIAA-ASME-ASCE-AHS-STRUCT-STRUCT-DYN-MAT-CONF = "AIAA/ASME/ASCE/AHS
                                  Structures, Structural Dynamics \& Materials
                                  Conference --- Collection of Technical
                                  Papers"}

@String{j-ALGORITHMICA          = "Algorithmica"}

@String{j-ALGORITHMS-BASEL      = "Algorithms ({Basel})"}

@String{j-APPL-MATH-COMP        = "Applied Mathematics and Computation"}

@String{j-APPL-NUM-MATH         = "Applied Numerical Mathematics: Transactions
                                  of IMACS"}

@String{j-AUSTRALIAN-COMP-SCI-COMM = "Australian Computer Science
                                  Communications"}

@String{j-BIOMETRICS            = "Biometrics"}

@String{j-CACM                  = "Communications of the ACM"}

@String{j-CCPE                  = "Concurrency and Computation: Prac\-tice and
                                  Experience"}

@String{j-CGF                   = "Com{\-}pu{\-}ter Graphics Forum"}

@String{j-CHIN-J-COMPUTERS      = "Chinese Journal of Computers"}

@String{j-COMP-ARCH-NEWS        = "ACM SIGARCH Computer Architecture News"}

@String{j-COMP-ART-INTELL       = "Computers and Artificial Intelligence =
                                  Vychislitel'nye mashiny i
                                  iskusstvennyi intellekt"}

@String{j-COMP-CHEM-ENG         = "Computers \& Chemical Engineering"}

@String{j-COMP-ECONOMICS        = "Computational Economics"}

@String{j-COMP-J                = "The Computer Journal"}

@String{j-COMP-LANGS-SYS-STRUCT = "Computer Languages, Systems and Structures"}

@String{j-COMP-MECH             = "Computational mechanics"}

@String{j-COMP-NET-AMSTERDAM    = "Computer Networks (Amsterdam, Netherlands: 1999)"}

@String{j-COMP-PHYS-COMM        = "Computer Physics Communications"}

@String{j-COMP-STAT             = "Computational Statistics"}

@String{j-COMP-SURV             = "ACM Computing Surveys"}

@String{j-COMP-SYS              = "Computing systems: the journal of the
                                  USENIX Association"}

@String{j-COMPUT-MATH-APPL      = "Computers and Mathematics with Applications"}

@String{j-COMPUT-METH-APPL-MECH-ENG = "Computer Methods in Applied Mechanics
                                  and Engineering"}

@String{j-COMPUT-PHYS          = "Computers in Physics"}

@String{j-COMPUT-SCI-ENG        = "Computing in Science and Engineering"}

@String{j-COMPUT-SYST-ENG       = "Computing systems in engineering: an
                                  international journal"}

@String{j-COMPUTER              = "Computer"}

@String{j-COMPUTERS-AND-GRAPHICS = "Computers and Graphics"}

@String{j-COMPUTING             = "Computing"}

@String{j-CPE                   = "Concurrency: practice and experience"}

@String{j-CRAY-CHANNELS         = "CRAY Channels"}

@String{j-DEC-TECH-J            = "Digital Technical Journal of Digital
                                  Equipment Corporation"}

@String{j-DISCRETE-APPL-MATH    = "Discrete Applied Mathematics"}

@String{j-ELECT-LETTERS         = "Electronics Letters"}

@String{j-ENG-SCI-REP-KYUSHU    = "Engineering Sciences Reports, Kyushu
                                  University"}

@String{j-FORTRAN-FORUM         = "ACM Fortran Forum"}

@String{j-FRONTIERS-MASS-PAR-COMP-CONF-PROC = "Frontiers of Massively Parallel
                                  Computation --- Conference Proceedings"}

@String{j-FUT-GEN-COMP-SYS      = "Future Generation Computer Systems"}

@String{j-FUTURE-INTERNET       = "Future Internet"}

@String{j-HIGH-TECH-LETT        = "High Technology Letters"}

@String{j-HUMAN-HEREDITY        = "Human heredity"}

@String{j-IBM-JRD               = "IBM Journal of Research and Development"}

@String{j-IBM-SYS-J             = "IBM Systems Journal"}

@String{j-IEEE-COMPUT-ARCHIT-LETT = "IEEE Computer Architecture Letters"}

@String{j-IEEE-COMPUT-SCI-ENG   = "IEEE Computational Science \& Engineering"}

@String{j-IEEE-CONCURR          = "IEEE Concurrency"}

@String{j-IEEE-DISTRIB-SYST-ONLINE = "IEEE Distributed Systems Online"}

@String{j-IEEE-INT-CONF-ALG-ARCH-PAR-PROC = "IEEE International Conference on
                                  Algorithms and Architectures for Parallel
                                  Processing"}

@String{j-IEEE-J-SEL-AREAS-COMMUN = "IEEE Journal on Selected Areas in
                                  Communications"}

@String{j-IEEE-MICRO            = "IEEE Micro"}

@String{j-IEEE-MICROW-GUIDED-WAVE-LETT = "IEEE Microwave and Guided Wave
                                  Letters"}

@String{j-IEEE-PAR-DIST-TECH    = "IEEE parallel and distributed technology:
                                  systems and applications"}

@String{j-IEEE-TRANS-COMPUT     = "IEEE Transactions on Computers"}

@String{j-IEEE-TRANS-PAR-DIST-SYS = "IEEE Transactions on Parallel and
                                  Distributed Systems"}

@String{j-IEEE-TRANS-SOFTW-ENG  = "IEEE Transactions on Software Engineering"}

@String{j-IEEE-TRANS-VIS-COMPUT-GRAPH = "IEEE Transactions on Visualization
                                   and Computer Graphics"}

@String{j-IFIP-TRANS-A          = "IFIP Transactions. A. Computer Science and
                                  Technology"}

@String{j-IJHPCA                = "The International Journal of High
                                  Performance Computing Applications"}

@String{j-IJQC                  = "International Journal of Quantum Chemistry"}

@String{j-IJSA                  = "The International Journal of Supercomputer
                                  Applications"}

@String{j-IJSAHPC               = "International Journal of Supercomputer
                                  Applications and High Performance Computing"}

@String{j-INFO-SOFTWARE-TECH    = "Information and Software Technology"}

@String{j-INFORMATICA           = "Informatica (Ljubljana, Slovenia)"}

@String{j-INT-J-COMPUT-APPL     = "International Journal of Computer
                                   Applications"}

@String{j-INT-J-COMPUT-SYST-SCI-ENG = "International Journal of Computer
                                  Systems Science and Engineering"}

@String{j-INT-J-HIGH-SPEED-COMPUTING = "International Journal of High Speed
                                  Computing"}

@String{j-INT-J-IMAGE-GRAPHICS = "International Journal of Image and Graphics
                                  (IJIG)"}

@String{j-INT-J-NUMER-METHODS-FLUIDS = "International Journal for Numerical
                                  Methods in Fluids"}

@String{j-INT-J-PAR-EMER-DIST-SYS = "International Journal of Parallel, Emergent
                                  and Distributed Systems: IJPEDS"}

@String{j-INT-J-PARALLEL-PROG   = "International Journal of Parallel
                                  Programming"}

@String{j-INTEL-TECH-J          = "Intel Technology Journal"}

@String{j-IT-IT                 = "Informationstechnik und technische
                                  Informatik: IT + TI"}

@String{j-J-APPL-ECONOMETRICS   = "Journal of Applied Econometrics"}

@String{j-J-APPL-PHYS           = "Journal of Applied Physics"}

@String{j-J-COMP-SCI-TECH       = "Journal of computer science and technology"}

@String{j-J-COMP-SYS-SCI        = "Journal of Computer and System Sciences"}

@String{j-J-COMPUT-APPL-MATH    = "Journal of Computational and Applied
                                  Mathematics"}

@String{j-J-COMPUT-BIOL         = "Journal of Computational Biology"}

@String{j-J-COMPUT-CHEM         = "Journal of Computational Chemistry"}

@String{j-J-COMPUT-PHYS         = "Journal of Computational Physics"}

@String{j-J-GRID-COMP           = "Journal of Grid Computing"}

@String{j-J-MOL-STRUCT-THEOCHEM = "Journal of molecular structure. Theochem"}

@String{j-J-OPEN-RES-SOFT       = "Journal of Open Research Software"}

@String{j-J-PAR-DIST-COMP       = "Journal of Parallel and Distributed
                                  Computing"}

@String{j-J-PHYS-IV-COLLOQUE    = "Journal de physique. IV, Colloque"}

@String{j-J-PROGRAM-LANG        = "Journal of Programming Languages"}

@String{j-J-SCI-COMPUT          = "Journal of Scientific Computing"}

@String{j-J-STAT-SOFT           = "Journal of Statistical Software"}

@String{j-J-SUPERCOMPUTING      = "The Journal of Supercomputing"}

@String{j-J-SYST-SOFTW          = "The Journal of Systems and Software"}

@String{j-J-UCS                 = "J.UCS: Journal of Universal Computer
                                  Science"}

@String{j-JETC                  = "ACM Journal on Emerging Technologies
                                  in Computing Systems (JETC)"}

@String{j-JOHO-SHORI            = "Joho-Shori (J. Information Processing Soc.
                                  Japan)"}

@String{j-LECT-NOTES-COMP-SCI   = "Lecture Notes in Computer Science"}

@String{j-LINUX-J               = "Linux Journal"}

@String{j-MICROCOMP-CIVIL-ENG   = "Microcomputers in Civil Engineering"}

@String{j-MICROPROC-MICROPROG   = "Microprocessing and Microprogramming"}

@String{j-MINI-MICRO-SYSTEMS    = "Mini-Micro Systems"}

@String{j-NETWORK-SECURITY      = "Network Security"}

@String{j-NEURAL-PAR-SCI-COMPUT = "Neural, Parallel and Scientific
                                  Computations"}

@String{j-NUCL-SCI-ENG          = "Nuclear Science and Engineering"}

@String{j-NUCLEAR-SAFETY        = "Nuclear safety"}

@String{j-NUMER-ALGORITHMS      = "Numerical Algorithms"}

@String{j-OPER-SYS-REV          = "Operating Systems Review"}

@String{j-PACMPL                = "Proceedings of the ACM on Programming
                                   Languages (PACMPL)"}

@String{j-PARALLEL-ALGORITHMS-APPL = "Parallel Algorithms and Applications"}

@String{j-PARALLEL-COMPUTING    = "Parallel Computing"}

@String{j-PARALLEL-DIST-COMP-PRACT = "Parallel and Distributed Computing
                                  Practices"}

@String{j-PARALLEL-PROCESS-LETT = "Parallel Processing Letters"}

@String{j-PARALLELOGRAM         = "Parallelogram"}

@String{j-POMACS                = "Proceedings of the ACM on Measurement and
                                   Analysis of Computing Systems (POMACS)"}

@String{j-PROC-INT-CONF-PAR-PROC = "Proceedings of the International
                                  Conference on Parallel Processing"}

@String{j-PROC-SPIE             = "Proceedings of the SPIE --- The
                                  International Society for Optical
                                  Engineering"}

@String{j-PROC-SUPERCOMPUT      = "Proceedings of the Supercomputing
                                  Conference"}

@String{j-PROC-VLDB-ENDOWMENT   = "Proceedings of the VLDB Endowment"}

@String{j-PROGRAMMIROVANIE      = "Programmirovanie"}

@String{j-QUEUE                 = "ACM Queue: Tomorrow's Computing Today"}

@String{j-R-JOURNAL             = "The R Journal"}

@String{j-R-NEWS                = "R News: the Newsletter of the R Project"}

@String{j-REAL-TIME-IMAGING     = "Real-Time Imaging"}

@String{j-SCI-COMPUT-PROGRAM    = "Science of Computer Programming"}

@String{j-SCI-PROG              = "Scientific Programming"}

@String{j-SCPE                  = "Scalable Computing: Practice and
                                  Experience"}

@String{j-SIAM-J-OPT            = "SIAM Journal on Optimization"}

@String{j-SIAM-J-SCI-COMP       = "SIAM Journal on Scientific Computing"}

@String{j-SIAM-NEWS             = "SIAM News"}

@String{j-SIGADA-LETTERS        = "ACM SIGADA Ada Letters"}

@String{j-SIGCSE                = "SIGCSE Bulletin (ACM Special Interest Group
                                  on Computer Science Education)"}

@String{j-SIGMETRICS            = "ACM SIGMETRICS Performance Evaluation
                                  Review"}

@String{j-SIGMOD                = "SIGMOD Record (ACM Special Interest
                                  Group on Management of Data)"}

@String{j-SIGNAL-PROCESS-IMAGE-COMMUN = "Signal Processing: Image
                                  Communication"}

@String{j-SIGPLAN               = "ACM SIG{\-}PLAN Notices"}

@String{j-SIGSAM                = "SIGSAM Bulletin (ACM Special Interest Group
                                  on Symbolic and Algebraic Manipulation)"}

@String{j-SIGSOFT               = "ACM SIGSOFT Software Engineering Notes"}

@String{j-SIM-MODEL-PRACT-THEORY = "Simulation Modelling Practice and Theory"}

@String{j-SOFTWAREX             = "SoftwareX"}

@String{j-SPE                   = "Soft{\-}ware\emdash Prac{\-}tice and
                                  Experience"}

@String{j-STAT-COMPUT           = "Statistics and Computing"}

@String{j-SUPERCOMPUTER         = "Supercomputer"}

@String{j-SUPERFRI              = "Supercomputing Frontiers and Innovations"}

@String{j-TACO                  = "ACM Transactions on Architecture and
                                  Code Optimization"}

@String{j-TCBB                  = "IEEE/ACM Transactions on Computational
                                  Biology and Bioinformatics"}

@String{j-TECS                  = "ACM Transactions on Embedded Computing
                                  Systems"}

@String{j-TKDD                  = "ACM Transactions on Knowledge
                                  Discovery from Data (TKDD)"}

@String{j-TOCE                  = "ACM Transactions on Computing Education"}

@String{j-TOCL                  = "ACM Transactions on Computational Logic"}

@String{j-TOCS                  = "ACM Transactions on Computer Systems"}

@String{j-TODAES                = "ACM Transactions on Design Automation of
                                  Electronic Systems"}

@String{j-TOG                   = "ACM Transactions on Graphics"}

@String{j-TOMACS                = "ACM Transactions on Modeling and
                                  Computer Simulation"}

@String{j-TOMCCAP               = "ACM Transactions on Multimedia Computing,
                                  Communications, and Applications"}

@String{j-TOMPECS               = "ACM Transactions on Modeling and Performance
                                  Evaluation of Computing Systems (TOMPECS)"}

@String{j-TOMS                  = "ACM Transactions on Mathematical Software"}

@String{j-TOPC                  = "ACM Transactions on Parallel Computing
                                  (TOPC)"}

@String{j-TOPLAS                = "ACM Transactions on Programming Languages
                                  and Systems"}

@String{j-TOSEM                 = "ACM Transactions on Software Engineering
                                  and Methodology"}

@String{j-TRANS-AM-NUCL-SOC     = "Transactions of the American Nuclear
                                  Society"}

@String{j-TRANS-INFO-PROCESSING-SOC-JAPAN = "Transactions of the Information
                                  Processing Society of Japan"}

@String{j-TRETS                 = "ACM Transactions on Reconfigurable Technology
                                  and Systems (TRETS)"}

@String{j-TSAS                  = "ACM Transactions on Spatial Algorithms and
                                  Systems (TSAS)"}

@String{j-VLDB-J                = "VLDB Journal: Very Large Data Bases"}

%%% ====================================================================
%%% Publisher abbreviations:

@String{pub-ACM                 = "ACM Press"}
@String{pub-ACM:adr             = "New York, NY 10036, USA"}

@String{pub-AIP                 = "American Institute of Physics"}
@String{pub-AIP:adr             = "Woodbury, NY, USA"}

@String{pub-ASME                = "American Society Mech. Engineers"}
@String{pub-ASME:adr            = "United Engineering Center, 345 E. 47th St.,
                                  New York, NY 10017, USA"}

@String{pub-AW                  = "Ad{\-d}i{\-s}on-Wes{\-l}ey"}
@String{pub-AW:adr              = "Reading, MA, USA"}

@String{pub-BIRKHAUSER          = "Birkh{\"a}user"}
@String{pub-BIRKHAUSER:adr      = "Cambridge, MA, USA; Berlin, Germany; Basel,
                                  Switzerland"}

@String{pub-CAMBRIDGE           = "Cambridge University Press"}
@String{pub-CAMBRIDGE:adr       = "Cambridge, UK"}

@String{pub-CHAPMAN-HALL        = "Chapman and Hall, Ltd."}
@String{pub-CHAPMAN-HALL:adr    = "London, UK"}

@String{pub-CHAPMAN-HALL-CRC    = "Chapman and Hall/CRC"}
@String{pub-CHAPMAN-HALL-CRC:adr = "Boca Raton, FL, USA"}

@String{pub-CRC                 = "CRC Press"}
@String{pub-CRC:adr             = "2000 N.W. Corporate Blvd., Boca Raton,
                                  FL 33431-9868, USA"}

@String{pub-ELS                 = "Elsevier"}
@String{pub-ELS:adr             = "Amsterdam, The Netherlands"}

@String{pub-ELSAS               = "Elsevier Applied Science"}
@String{pub-ELSAS:adr           = "London, UK"}

@String{pub-IEEE                = "IEEE Computer Society Press"}
@String{pub-IEEE:adr            = "1109 Spring Street, Suite 300, Silver
                                  Spring,
                                  MD 20910, USA"}

%%% IOS Press: publisher name and US postal address.  A ZIP+4 code has
%%% five digits before the hyphen.
@String{pub-IOS                 = "IOS Press"}
@String{pub-IOS:adr             = "Postal Drawer 10558, Burke, VA
                                  22009-0558, USA"}

@String{pub-KLUWER              = "Kluwer Academic Publishers Group"}
@String{pub-KLUWER:adr          = "Norwell, MA, USA, and Dordrecht,
                                  The Netherlands"}

@String{pub-MCGRAW-HILL         = "Mc{\-}Graw-Hill"}
@String{pub-MCGRAW-HILL:adr     = "New York, NY, USA"}

@String{pub-MIT                 = "MIT Press"}
@String{pub-MIT:adr             = "Cambridge, MA, USA"}

@String{pub-MORGAN-KAUFMANN     = "Morgan Kaufmann Publishers"}
@String{pub-MORGAN-KAUFMANN:adr = "Los Altos, CA 94022, USA"}
@String{pub-MORGAN-KAUFMANN:adrnew = "2929 Campus Drive, Suite 260, San
                                  Mateo, CA 94403, USA"}

@String{pub-NASA                = "National Aeronautics and Space
                                  Administration"}
@String{pub-NASA:adr            = "Washington, DC, USA"}

@String{pub-NH                  = "North-Hol{\-}land"}
@String{pub-NH:adr              = "Amsterdam, The Netherlands"}

@String{pub-NTIS                = "National Technical Information Service"}
@String{pub-NTIS:adr            = "Washington, DC, USA"}

@String{pub-ORA                 = "O'Reilly \& {Associates, Inc.}"}
@String{pub-ORA:adr             = "981 Chestnut Street, Newton, MA 02164, USA"}

@String{pub-OXFORD              = "Oxford University Press"}
@String{pub-OXFORD:adr          = "Walton Street, Oxford OX2 6DP, UK"}

@String{pub-PHI                 = "Pren{\-}tice-Hall International"}
@String{pub-PHI:adr             = "Englewood Cliffs, NJ 07632, USA"}

@String{pub-PLENUM              = "Plenum Press"}
@String{pub-PLENUM:adr          = "New York, NY, USA"}

%%% SCRI at Florida State University: publisher name and city.
@String{pub-SCRI                = "Supercomputer Computations
                                  Research Institute, Florida State
                                  University"}
@String{pub-SCRI:adr            = "Tallahassee, FL, USA"}

@String{pub-SIAM                = "Society for Industrial and Applied
                                  Mathematics"}
@String{pub-SIAM:adr            = "Philadelphia, PA, USA"}

@String{pub-SPE                 = "Society of Petroleum Engineers"}
@String{pub-SPE:adr             = "Richardson, TX, USA"}

@String{pub-SPIE                = "Society of Photo-optical
                                  Instrumentation Engineers (SPIE)"}
@String{pub-SPIE:adr            = "Bellingham, WA, USA"}

@String{pub-SUN-MICROSYSTEMS-PRESS = "Sun Microsystems Press"}
@String{pub-SUN-MICROSYSTEMS-PRESS:adr = "Palo Alto, CA, USA"}

@String{pub-SV                  = "Spring{\-}er-Ver{\-}lag"}
@String{pub-SV:adr              = "Berlin, Germany~/ Heidelberg,
                                  Germany~/ London, UK~/ etc."}

@String{pub-USENIX              = "USENIX"}
@String{pub-USENIX:adr          = "Berkeley, CA, USA"}

@String{pub-WILEY               = "Wiley"}
@String{pub-WILEY:adr           = "New York, NY, USA"}

@String{pub-WORLD-SCI           = "World Scientific Publishing
                                  Co. Pte. Ltd."}
@String{pub-WORLD-SCI:adr       = "P. O. Box 128, Farrer Road,
                                  Singapore 9128"}

%%% ====================================================================
%%% Series abbreviations:

%%% Book-series titles, referenced unquoted from the series field.
@String{ser-LNAI  = {Lecture Notes in Artificial Intelligence}}
@String{ser-LNCS  = {Lecture Notes in Computer Science}}
@String{ser-LNCSE = {Lecture Notes in Computational Science and Engineering}}

%%% ====================================================================
%%% Bibliography entries, sorted by year and then by citation label,
%%% with `bibsort -byyear':

@Article{Abrossimov:1989:GVM,
  author          = {V. Abrossimov and M. Rozier and M. Shapiro},
  title           = {Generic virtual memory management for operating system
                     kernels},
  journal         = j-OPER-SYS-REV,
  volume          = {23},
  number          = {5},
  pages           = {123--136},
  year            = {1989},
  CODEN           = {OSRED8},
  ISSN            = {0163-5980 (print), 1943-586X (electronic)},
  ISSN-L          = {0163-5980},
  bibdate         = {Sun Dec 22 10:16:35 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Chorus Systemes, Saint-Quentin-en-Yvelines, France},
  classification  = {C6120 (File organisation); C6150J (Operating
                     systems)},
  fjournal        = {Operating Systems Review},
  keywords        = {Chorus Nucleus; Consistent cache; Data caching;
                     Deferred copying; Explicit I/O; Generic Memory
                     management Interface; History object technique; Mapped
                     objects; Operating system kernel; Paged architectures;
                     Paged Virtual Memory manager; PVM; Real memory; Unix},
  thesaurus       = {Buffer storage; Operating systems [computers]; Virtual
                     storage},
}

@InProceedings{Poplawski:1989:MPP,
  author          = {D. A. Poplawski and S. Pahwa and J. M. Francioni},
  title           = {Models of parallel program behavior},
  crossref        = {Anonymous:1989:PFC},
  pages           = {857--860 (vol. 2)},
  year            = {1989},
  bibdate         = {Sun Dec 22 10:16:53 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Comput. Sci., Michigan Technol. Univ.,
                     Houghton, MI, USA},
  classification  = {C4240 (Programming and algorithm theory); C6110
                     (Systems analysis and programming); C6120 (File
                     organisation)},
  keywords        = {Distributed memory MIMD; Hypercube programs; Parallel
                     program behavior; Parallel virtual memory; PVM},
  thesaurus       = {Hypercube networks; Parallel programming; Virtual
                     storage},
}

@InProceedings{Feeley:1990:PVM,
  author          = {Marc Feeley and James S. Miller},
  booktitle       = {{Proceedings of the 1990 ACM Conference on LISP and
                     Functional Programming, Nice}},
  title           = {A parallel virtual machine for efficient {Scheme}
                     compilation},
  crossref        = {ACM:1990:PAC},
  publisher       = pub-ACM,
  address         = pub-ACM:adr,
  bookpages       = {????},
  pages           = {119--130},
  month           = jun,
  year            = {1990},
  bibdate         = {Wed Jan 24 04:51:56 MST 2001},
  bibsource       = {http://dblp.uni-trier.de/db/conf/lfp/lfp1990.html#FeeleyM90;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.acm.org/pubs/citations/proceedings/lfp/91556/p119-feeley/},
  abstract        = {Programs compiled by Gambit, our Scheme compiler,
                     achieve performance as much as twice that of the
                     fastest available Scheme compilers. Gambit is easily
                     ported, while retaining its high performance, through
                     the use of a simple virtual machine (PVM). PVM allows a
                     wide variety of machine-independent optimizations and
                     it supports parallel computation based on the future
                     construct. PVM conveys high-level information
                     bidirectionally between the machine-independent front
                     end of the compiler and the machine-dependent back end,
                     making it easy to implement a number of common back end
                     optimizations that are difficult to achieve for other
                     virtual machines. PVM is similar to many real computer
                     architectures and has an option to efficiently gather
                     dynamic measurements of virtual machine usage. These
                     measurements can be used in performance prediction for
                     ports to other architectures as well as design
                     decisions related to proposed optimizations and object
                     representations.},
  acknowledgement = ack-nhfb,
  affiliation     = {Brandeis Univ., Waltham, MA, USA},
  classification  = {C6150C (Compilers, interpreters and other
                     processors)},
  conflocation    = {Nice, France; 27--29 June 1990},
  corpsource      = {Brandeis Univ., Waltham, MA, USA},
  keywords        = {Gambit; Lisp; machine-independent; Machine-independent
                     front end; machine-independent front end;
                     Machine-independent optimizations; object; Object
                     representations; optimizations; parallel processing;
                     Parallel virtual machine; parallel virtual machine;
                     portability; program compilers; PVM portability;
                     representations; Scheme compiler; simple virtual
                     machine; Simple virtual machine; software; virtual
                     machines},
  oldlabel        = {FeeleyM90},
  sponsororg      = {ACM},
  thesaurus       = {Parallel processing; Program compilers; Software
                     portability; Virtual machines},
  treatment       = {P Practical},
  XMLdata         = {ftp://ftp.informatik.uni-trier.de/pub/users/Ley/bib/records.tar.gz#conf/lfp/FeeleyM90},
}

@TechReport{Sunderam:1990:PFPa,
  author          = {V. S. Sunderam},
  title           = {{PVM}: a Framework for Parallel Distributed
                     Computing},
  number          = {ORNL/TM-11375},
  institution     = {Dept. of Math and Computer Science, } # inst-EMORY,
  address         = inst-EMORY:adr,
  month           = feb,
  year            = {1990},
  bibsource       = {Distributed/dist.sys.1.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note            = {See also \cite{Sunderam:1990:PFPb}.},
  comment         = {Good overview of PVM, though now a little out of date.
                     Supports dynamic, location-transparent, process
                     initiation, typed message passing and shared memory,
                     broadcast and distributed synchronization, and
                     heterogeneity in the form of language- and
                     machine-independence, type conversion, and multiple
                     executables for each component. Seems to be heavily
                     dependent on broadcast. Shared memory is somewhat
                     limited. See also beguelin:concsuper. [David.Kotz at
                     Dartmouth.edu]},
  keyword         = {heterogeneous computing, distributed computing,
                     network parallel computing},
}

@Article{Sunderam:1990:PFPb,
  author          = {V. S. Sunderam},
  title           = {{PVM}: a Framework for Parallel Distributed
                     Computing},
  journal         = j-CPE,
  volume          = {2},
  number          = {4},
  pages           = {315--339},
  month           = dec,
  year            = {1990},
  CODEN           = {CPEXEI},
  ISSN            = {1040-3108},
  ISSN-L          = {1040-3108},
  bibdate         = {Tue Sep 7 05:40:19 MDT 1999},
  bibsource       = {Distributed/clusters.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     Misc/IMMD_IV.bib},
  note            = {See also the earlier technical report
                     \cite{Sunderam:1990:PFPa}.},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
                     GA, USA},
  classification  = {C6115 (Programming support)},
  corpsource      = {Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
                     GA, USA},
  fjournal        = {Concurrency, practice and experience},
  keywords        = {algorithms; Algorithms; concurrent; Concurrent;
                     conditional execution; Conditional execution;
                     distributed processing; environment; environments;
                     error detection; Error detection; interface; Interface;
                     parallel distributed computing; Parallel distributed
                     computing; parallel programming; programming;
                     Programming environment; PVM system; sequential;
                     Sequential; virtual computing; Virtual computing
                     environment},
  pubcountry      = {UK},
  thesaurus       = {Distributed processing; Parallel programming;
                     Programming environments},
  treatment       = {P Practical},
}

@Article{Balou:1991:DIV,
  author =       "A. T. Balou and A. N. Refenes",
  title =        "The design and implementation of {VOOM}: a parallel
                 virtual object oriented machine",
  journal =      j-MICROPROC-MICROPROG,
  volume =       "32",
  number =       "1--5",
  pages =        "289--296",
  month =        aug,
  year =         "1991",
  CODEN =        "MMICDT",
  ISSN =         "0165-6074 (print), 1878-7061 (electronic)",
  ISSN-L =       "0165-6074",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220 (Computer architecture); C6110 (Systems
                 analysis and programming); C7430 (Computer
                 engineering)",
  conflocation = "Vienna, Austria; 2--5 Sept. 1991",
  conftitle =    "17th EUROMICRO Symposium on Microprocessing and
                 Microprogramming. Hardware and Software Design
                 Automation",
  corpsource =   "Dept. of Comput. Sci., Univ. Coll. London, UK",
  fjournal =     "Microprocessing and Microprogramming",
  keywords =     "design; execution unit; implementation; machine;
                 machines; memory management unit; memory recycling;
                 object management; object-oriented model;
                 object-oriented programming; packet-switching network;
                 parallel architecture; parallel architectures; parallel
                 virtual object oriented; pre-fetch unit; virtual",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
}

@InProceedings{Beguelin:1991:GDT,
  author =       "Adam Beguelin and Jack J. Dongarra and A. Geist and
                 Robert Manchek and V. S. Sunderam",
  title =        "Graphical Development Tools for Network-Based
                 Concurrent Supercomputing",
  crossref =     "IEEE:1991:PSA",
  pages =        "435--444",
  year =         "1991",
  bibdate =      "Sun Dec 22 10:17:16 MST 1996",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Distributed/clusters.bib;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Oak Ridge Nat. Lab., TN, USA",
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6180G (Graphical user interfaces)",
  comment =      "Clusters of workstations solving supercomputing
                 problems. This is a graphical front-end to PVM, that
                 allows the user to specify a set of subroutines, their
                 parameters and output values, and the dependencies
                 between them. It can compile the parts on multiple
                 machines. At run time it chooses where to execute each
                 module, and when, based on the dependencies and on a
                 user-supplied cost matrix showing the cost of running
                 each module in each place. See also beguelin:hence.
                 [David.Kotz at Dartmouth.edu]",
  keyword =      "network supercomputing, distributed computing",
  keywords =     "Application program; Graphical development tools;
                 HeNCE; Heterogeneous network computing environment;
                 Integrated graphical tools; Network-based concurrent
                 supercomputing; Parallel programs; Parallel Virtual
                 Machine; Process management and communication; PVM;
                 Software package; X-window-based software environment",
  thesaurus =    "Graphical user interfaces; Parallel programming;
                 Programming environments",
}

@TechReport{Beguelin:1991:UGP,
  author          = {A. Beguelin and J. Dongarra and A. Geist and R.
                     Manchek and V. Sunderam},
  title           = {A User's guide to {PVM}: Parallel virtual machine},
  type            = {Technical Report},
  number          = {ORNL/TM-11826},
  institution     = {Mathematical Sciences Section, Oak Ridge National
                     Laboratory},
  address         = inst-ORNL:adr,
  month           = sep,
  year            = {1991},
  bibsource       = {Distributed/clusters.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     Parallel/par.lin.alg.bib; Theory/Matrix.bib},
  keywords        = {prll, operating system},
}

@InProceedings{Benzoni:1991:MFR,
  author =       "A. Benzoni and V. S. Sunderam and R. van de Geijn",
  title =        "Matrix factorization on a {RISC} workstation network",
  crossref =     "Durand:1991:HPC",
  pages =        "207--218",
  year =         "1991",
  bibdate =      "Sun Dec 22 10:17:16 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "IBM ECSEC, Roma, Italy",
  classification = "C4140 (Linear algebra); C5220 (Computer
                 architecture); C5470 (Performance evaluation and
                 testing); C5620L (Local area networks)",
  keywords =     "20 To 60 MFLOPS; Concurrent process management; Dense
                 matrix; Distributed memory architecture; Distributed
                 programming environment; Ethernet; Heterogeneous
                 distributed computing environment; High-speed network;
                 Independent processing units; LU factorization;
                 Numerically intensive applications; Optical fiber link;
                 PVM; RISC System/6000 workstations; RISC workstation
                 network; Synchronization; Token Ring local area
                 network",
  numericalindex = "Computer speed 2.0E+07 to 6.0E+07 FLOPS",
  thesaurus =    "Distributed processing; Local area networks; Matrix
                 algebra; Optical links; Performance evaluation",
}

@Manual{Dongarra:1991:UGP,
  author =       "Jack Dongarra and others",
  title =        "A Users' Guide to {PVM} Parallel Virtual Machine",
  organization = inst-ORNL,
  address =      inst-ORNL:adr,
  month =        jul,
  year =         "1991",
  bibsource =    "Distributed/Dist.Sys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

@TechReport{Geist:1991:ENB,
  author          = {G. A. Geist and V. S. Sunderam},
  title           = {Experiences with network based concurrent computing on
                     the {PVM} system},
  number          = {ORNL/TM-11760},
  institution     = inst-ORNL,
  address         = inst-ORNL:adr,
  month           = jan,
  year            = {1991},
  bibsource       = {Distributed/clusters.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Geist:1991:PSS,
  author          = {G. A. Geist and V. S. Sunderam},
  title           = {The {PVM} System: {Supercomputer} Level Concurrent
                     Computation on a Heterogeneous Network of
                     Workstations},
  crossref        = {Stout:1991:SDM},
  pages           = {258--261},
  year            = {1991},
  bibsource       = {Distributed/dist.sys.1.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  comment         = {A more up-to-date, but shorter, overview of PVM and
                     its performance than sunderam:pvm. Good performance on
                     networks of IBM RS/6000s. [David.Kotz at
                     Dartmouth.edu]},
  keyword         = {distributed heterogeneous computing},
}

@Article{Meleshchuk:1991:IPP,
  author          = {S. B. Meleshchuk and A. N. Nedumov},
  title           = {Implementation of a protocol for parallel database
                     access with virtual machine communications facilities},
  journal         = j-PROGRAMMIROVANIE,
  volume          = {17},
  number          = {1},
  pages           = {35--42},
  month           = jan # {\slash } # feb,
  year            = {1991},
  CODEN           = {PCSODA},
  ISSN            = {0132-3474, 0361-7688},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note            = {English translation in Programming and Computer
                     Software, vol. 17, no. 1, pp. 27--32, November 1991.},
  acknowledgement = ack-nhfb,
  classification  = {C6150J (Operating systems)},
  corpsource      = {Leningrad Techn. State Univ., USSR},
  fjournal        = {Programmirovanie},
  keywords        = {COMMIT protocol; concurrency control; deadlock;
                     electronic mail; interrupts; IUCV mail facility;
                     machines; parallel; parallel database access;
                     processing; protocol; protocols; virtual; virtual
                     machines},
  pubcountry      = {USSR},
  treatment       = {P Practical},
}

@InProceedings{Nagaraj:1991:MHL,
  author          = {U. Nagaraj and U. S. Shukla},
  title           = {{MK}: a high level interface for message passing},
  crossref        = {Bhavsar:1991:SSJ},
  pages           = {493--502},
  year            = {1991},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C6150J (Operating systems)},
  corpsource      = {Centre for Dev. of Adv. Comput., Bangalore, India},
  keywords        = {communication interface; high level interface;
                     interconnection network technology; message passing
                     multicomputer; MK; network operating systems;
                     programming environment; software interfaces;
                     transputer network},
  treatment       = {P Practical},
}

@Article{Saltz:1991:MRT,
  author          = {J. Saltz and H. Berryman and J. Wu},
  title           = {Multiprocessors and Run-time Compilation},
  journal         = j-CPE,
  volume          = {3},
  number          = {6},
  pages           = {573--592},
  month           = dec,
  year            = {1991},
  CODEN           = {CPEXEI},
  ISSN            = {1040-3108},
  ISSN-L          = {1040-3108},
  bibdate         = {Tue Sep 7 05:40:19 MDT 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency, practice and experience},
}

@MastersThesis{Al-Salman:1992:DIP,
  author          = {Abdulmalik Salman Al-Salman},
  title           = {Design and implementation of a profiler for the
                     Parallel Virtual Machine ({PVM}) system},
  type            = {M.S. thesis},
  school          = inst-UGA,
  address         = inst-UGA:adr,
  pages           = {vi + 51},
  year            = {1992},
  bibdate         = {Mon Jan 15 16:37:21 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note            = {Directed by Steven C. Cater.},
  acknowledgement = ack-nhfb,
}

@InProceedings{Alfano:1992:DNA,
  author          = {M. Alfano and G. {Lo Re}},
  title           = {Distributing numerical algorithms: some experiences
                     with network computing system ({NCS}) and parallel
                     virtual machine ({PVM})},
  crossref        = {SCRI:1992:PWC},
  year            = {1992},
  bibsource       = {Distributed/clusters.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Beguelin:1992:HGD,
  author          = {A. Beguelin and J. Dongarra and A. Geist and R.
                     Manchek and K. Moore and R. Wade and V. Sunderam},
  title           = {{HeNCE}: graphical development tools for network-based
                     concurrent computing},
  crossref        = {IEEE:1992:PSH},
  pages           = {129--136},
  year            = {1992},
  bibdate         = {Sun Dec 22 10:17:40 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Oak Ridge Nat. Lab., TN, USA},
  classification  = {C6110P (Parallel programming); C6115 (Programming
                     support); C6130B (Graphics techniques); C6150C
                     (Compilers, interpreters and other processors); C6180G
                     (Graphical user interfaces)},
  keywords        = {Distributed virtual computer; Graphical development
                     tools; Graphical interface; Graphical parallel
                     programming environment; HeNCE; Heterogeneous machines;
                     Heterogeneous network computing environment;
                     Network-based concurrent computing; Program compiler;
                     Program debugging; PVM; Unix workstation; X Window},
  thesaurus       = {Graphical user interfaces; Parallel programming;
                     Program compilers; Program debugging; Programming
                     environments; Software tools},
}

@Article{Beguelin:1992:PHT,
  author          = {A. Beguelin and J. Dongarra and A. Geist and R.
                     Manchek and V. Sunderam},
  title           = {{PVM} and {HeNCE}: traversing the parallel
                     environment},
  journal         = j-CRAY-CHANNELS,
  volume          = {14},
  number          = {4},
  pages           = {22--25},
  month           = {Fall},
  year            = {1992},
  CODEN           = {CRCHE8},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Carnegie-Mellon Univ., Pittsburgh, PA, USA},
  classification  = {C5440 (Multiprocessor systems and techniques);
                     C6110P (Parallel programming); C6150N (Distributed
                     systems); C7430 (Computer engineering)},
  corpsource      = {Carnegie-Mellon Univ., Pittsburgh, PA, USA},
  fjournal        = {CRAY Channels},
  keywords        = {cost-effective use; Cost-effective use; Cray Research
                     MPP systems; diverse architectures; Diverse
                     architectures; diverse computer systems; Diverse
                     computer systems; HeNCE; Heterogeneous Network
                     Computing Environment; heterogeneous networks;
                     Heterogeneous networks; Machine; network operating
                     systems; networked resources; Networked resources;
                     packages; parallel; parallel machines; Parallel
                     Virtual; Parallel Virtual Machine; portability;
                     Portability; programming; PVM; software; software
                     packages; Software packages; virtual machines},
  thesaurus       = {Network operating systems; Parallel machines; Parallel
                     programming; Software packages; Virtual machines},
  treatment       = {P Practical; R Product Review},
}

@InProceedings{Beguelin:1992:SCG,
  author          = {A. Beguelin and J. Dongarra and A. Geist and R.
                     Manchek and V. Sunderam},
  title           = {Solving computational grand challenges using a network
                     of heterogeneous supercomputers},
  crossref        = {Dongarra:1992:PFS},
  pages           = {596--601},
  year            = {1992},
  bibdate         = {Sun Dec 22 10:17:40 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Oak Ridge Nat. Lab., Tennessee Univ., Knoxville, TN,
                     USA},
  classification  = {C5440 (Multiprocessor systems and techniques);
                     C5620W (Other networks); C6110P (Parallel programming);
                     C6115 (Programming support); C7430 (Computer
                     engineering)},
  keywords        = {Computational grand challenges; Cray XMP; Flexibility;
                     High speed network; Intel iPSC/860; Network of
                     heterogeneous supercomputers; Parallel virtual machine;
                     Thinking Machines CM2; Virtual computer},
  thesaurus       = {Parallel processing; Parallel programming; Programming
                     environments; Virtual machines; Wide area networks},
}

@TechReport{Beguelin:1992:XTM,
  author          = {Adam Louis Beguelin},
  title           = {Xab: a tool for monitoring {PVM} programs},
  institution     = inst-SCS-CMU,
  address         = inst-SCS-CMU:adr,
  day             = {5},
  month           = jun,
  year            = {1992},
  bibsource       = {Distributed/clusters.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Benzoni:1992:CLF,
  author          = {A. Benzoni and G. Richelli and V. S. Sunderam},
  title           = {Concurrent {LU} factorization on workstation
                     networks},
  crossref        = {Evans:1992:PCP},
  pages           = {159--166},
  year            = {1992},
  bibdate         = {Sun Dec 22 10:17:16 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {IBM ECSEC, Roma, Italy},
  classification  = {B0290H (Linear algebra); B6210L (Computer
                     communications); B6260 (Optical links and equipment);
                     C4140 (Linear algebra); C4240P (Parallel programming
                     and algorithm theory); C5620L (Local area networks)},
  keywords        = {6 MByte/s; Concurrent LU factorization; Dense matrix;
                     Ethernet network; Fiber optic links; IBM RISC
                     System/6000 workstations; Optical fiber links; PVM
                     software system; Workstation networks},
  numericalindex  = {Byte rate 6.0E+06 Byte/s},
  thesaurus       = {Local area networks; Matrix algebra; Optical links;
                     Parallel algorithms; Workstations},
}

@TechReport{Dongarra:1992:PUL,
  author =       "Jack J. Dongarra and Rolf Hempel and Anthony J. G. Hey
                 and David W. Walker",
  title =        "A Proposal for a User-Level Message-Passing Interface
                 in a Distributed Memory Environment",
  type =         "Technical Report",
  number =       "ORNL/TM-12231",
  institution =  inst-ORNL,
  address =      inst-ORNL:adr,
  month =        oct,
  year =         "1992",
  bibdate =      "Tue Feb 26 10:10:44 2002",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Parallel/Par.Arch.Indep.bib;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

@InProceedings{Duval:1992:TPP,
  author          = {D. Duval},
  title           = {Trends in parallel programming models for high
                     performance computers},
  crossref        = {Ferenczi:1992:AHW},
  pages           = {33},
  year            = {1992},
  bibdate         = {Sun Dec 22 10:17:40 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Parallel Comput. Div., TELMAT Inf., Soultz, France},
  classification  = {C4240P (Parallel programming and algorithm theory);
                     C6110P (Parallel programming)},
  keywords        = {CS-Tools; F90; Heterogeneous scalable networks; High
                     Performance Fortran; Massively parallel machines;
                     Neural coprocessor; Parallel programming models;
                     PARMACS; PVM; Scientific applications; SHAPES ASI;
                     SPMD; Superscalar; Transputers; Vector facilities},
  thesaurus       = {Parallel programming; Programming theory; Software
                     engineering},
}

@InProceedings{Eppstein:1992:PGC,
  author          = {Margaret J. Eppstein and Joseph F. Guarnaccia and
                     David Emery Dougherty and Robert S. Kerr},
  title           = {Parallel groundwater computations using {PVM}},
  crossref        = {Russell:1992:CMW},
  pages           = {713--720},
  year            = {1992},
  bibdate         = {Mon Jan 15 15:32:53 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  annote          = {Caption title. Published in Computational methods in
                     water resources IX, vol. 1, Numerical methods in water
                     resources. EPA/600/A-92/157 PB92-206572. Microfiche.
                     Springfield, VA: National Technical Information
                     Service, [1992]. 1 microfiche: negative.},
  keywords        = {Groundwater flow --- Computer programs},
}

@Book{Freeman:1992:PNA,
  author          = {T. L. (Len) Freeman and C. (Christopher) Phillips},
  title           = {Parallel numerical algorithms},
  publisher       = pub-PHI,
  address         = pub-PHI:adr,
  pages           = {xii + 315},
  year            = {1992},
  ISBN            = {0-13-651597-5},
  ISBN-13         = {978-0-13-651597-5},
  LCCN            = {QA76.9.A43 F74 1992},
  bibdate         = {Mon Oct 07 09:13:23 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note            = {Chapter 5 discusses HPF and PVM.},
  price           = {US\$40.00},
  series          = {Prentice Hall International Series in Computer
                     Science},
  acknowledgement = ack-nhfb,
}

@Article{Geist:1992:NBC,
  author          = {G. A. Geist and V. S. Sunderam},
  title           = {Network-based Concurrent Computing on the {PVM}
                     System},
  journal         = j-CPE,
  volume          = {4},
  number          = {4},
  pages           = {293--312 (or 293--311??)},
  month           = jun,
  year            = {1992},
  CODEN           = {CPEXEI},
  ISSN            = {1040-3108},
  ISSN-L          = {1040-3108},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {Distributed/clusters.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA},
  classification  = {C5440 (Multiprocessor systems and techniques);
                     C6180G (Graphical user interfaces); C7430 (Computer
                     engineering)},
  corpsource      = {Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA},
  fjournal        = {Concurrency, practice and experience},
  keywords        = {Computational resource; computational resource;
                     Concurrent computing environment; concurrent computing
                     environment; coupled networks; graphical interface;
                     graphical user interfaces; interactive; Interactive
                     graphical interface; loosely; Loosely coupled networks;
                     machines; Multiprocessing; multiprocessing; parallel
                     processing; Parallel Virtual Machine; Performance;
                     performance; Porting; porting; PVM system; Software
                     package; software package; virtual},
  pubcountry      = {UK},
  thesaurus       = {Graphical user interfaces; Parallel processing;
                     Virtual machines},
  treatment       = {P Practical},
}

@TechReport{Gropp:1992:TIM,
  author          = {Bill Gropp and Ewing Lusk},
  title           = {A test implementation of the {MPI} draft
                     message-passing standard},
  institution     = inst-ANL-mcs,
  address         = inst-ANL:adr,
  year            = {1992},
  bibsource       = {Distributed/clusters.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Leon:1992:FP,
  author          = {Juan Leon and Allan L. Fisher and Peter Steenkiste},
  title           = {Fail-safe {PVM}},
  crossref        = {SCRI:1992:PWC},
  year            = {1992},
  bibsource       = {Distributed/clusters.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@Article{Majumdar:1992:PPC,
  author          = {A. Majumdar and W. R. Martin},
  title           = {Parallel preconditioned conjugate gradient algorithm
                     applied to neutron diffusion problem},
  journal         = j-TRANS-AM-NUCL-SOC,
  volume          = {65},
  pages           = {209--210},
  year            = {1992},
  CODEN           = {TANSAO},
  ISSN            = {0003-018X},
  bibdate         = {Sun Dec 22 10:17:16 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Michigan Univ., Ann Arbor, MI, USA},
  classification  = {A0260 (Numerical approximation and analysis); A2820H
                     (Neutron diffusion); A2841C (Computer codes); C4130
                     (Interpolation and function approximation); C4240P
                     (Parallel programming and algorithm theory); C7470
                     (Nuclear engineering)},
  fjournal        = {Transactions of the American Nuclear Society},
  keywords        = {BBN TC2000; Distributed workstation; IBM RS6000;
                     Iterative method; Linear system; Neutron diffusion;
                     Parallel PCG algorithm; Parallel virtual machine;
                     Parallelization software; Preconditioned conjugate
                     gradient; Shared memory machine},
  thesaurus       = {Iterative methods; Neutron diffusion; Nuclear
                     engineering computing; Parallel algorithms},
}

%%% Conference paper on VM communications (pages 439--453); the
%%% proceedings are identified by the crossref to Anonymous:1992:PSE.
%%% NOTE(review): the PVM keyword here sits among IBM VM products
%%% (RSCS, VM/ESA, LANRES), so it may not denote Parallel Virtual
%%% Machine; confirm that this entry belongs in this bibliography.
@InProceedings{McRae:1992:VC,
  author =       "S. J. McRae",
  title =        "{VM} communications",
  crossref =     "Anonymous:1992:PSE",
  pages =        "439--453",
  year =         "1992",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Soft-Switch Ltd., Reading, UK",
  classification = "C6150J (Operating systems); C6155 (Computer
                 communications software)",
  keywords =     "3270 Protocols; APPC; Business needs; Client/server
                 communications; Communication offerings; Communications
                 infra-structure; IBM host system; LANRES; MVS; OSI
                 connectivity; PVM; RSCS; SAA communications strategy;
                 SNA connectivity; TCP/IP; TCP/IP connectivity; VM;
                 VM/ESA; X.25 communications",
  thesaurus =    "Computer communications software; Operating systems
                 [computers]",
}

%%% Conference paper on the MetaMP approach to parallel programming
%%% (pages 562--565); the proceedings are identified by the crossref
%%% to Siegel:1992:FFS.
%%% NOTE(review): the entry Shen:1992:VTD in this file crossrefs the
%%% similar key Siegel:1992:FSF; confirm that both parent keys exist
%%% and that this one is not a transposition.
@InProceedings{Otto:1992:MAP,
  author =       "S. W. Otto and M. Wolfe",
  title =        "The {MetaMP} approach to parallel programming",
  crossref =     "Siegel:1992:FFS",
  pages =        "562--565",
  year =         "1992",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Oregon Graduate Inst., Beaverton, OR, USA",
  classification = "C6110P (Parallel programming); C6140D (High level
                 languages)",
  keywords =     "MetaMP; Parallel programming",
  thesaurus =    "High level languages; Parallel programming",
}

%%% Conference paper on the virtual-time data-parallel machine
%%% (pages 46--53); the proceedings are identified by the crossref to
%%% Siegel:1992:FSF.  The keywords are indexing terms, several of which
%%% are fragments of longer phrases.
%%% NOTE(review): the keyword "FIFO priority cache" was corrected from
%%% "FIFO priory cache"; confirm against the paper's abstract.
@InProceedings{Shen:1992:VTD,
  author =       "S. Shen and L. Kleinrock",
  title =        "The virtual-time data-parallel machine",
  crossref =     "Siegel:1992:FSF",
  pages =        "46--53",
  year =         "1992",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessor systems and techniques);
                 C6110P (Parallel programming)",
  conflocation = "McLean, VA, USA; 19-21 Oct. 1992",
  corpsource =   "Dept. of Comput. Sci., California Univ., Los Angeles,
                 CA, USA",
  keywords =     "asynchronous execution; computation-intensive
                 data-parallel; FIFO priority cache; parallel machines;
                 parallel programming; processing element; programs;
                 SIMD; single instruction multiple data; virtual-time
                 data-parallel machine",
  sponsororg =   "IEEE; NASA",
  treatment =    "P Practical",
}

%%% Conference paper on concurrent computing with PVM.  Only author,
%%% title, and year are recorded here; the proceedings are identified
%%% by the crossref to SCRI:1992:PWC, and no page range is given.
@InProceedings{Sunderam:1992:CCP,
  author =       "Vaidy Sunderam",
  title =        "Concurrent Computing with {PVM}",
  crossref =     "SCRI:1992:PWC",
  year =         "1992",
  bibsource =    "Distributed/clusters.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Conference paper on software for parallel processing applications
%%% (pages 111--116); the proceedings are identified by the crossref
%%% to Verkerk:1992:PIC.  The keywords place it in a high-energy
%%% physics setting (lattice QCD, offline event reconstruction).
@InProceedings{Wolbers:1992:SPP,
  author =       "S. Wolbers",
  title =        "Software for parallel processing applications",
  crossref =     "Verkerk:1992:PIC",
  pages =        "111--116",
  year =         "1992",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Fermilab, Batavia, IL, USA",
  classification = "A2980 (Nuclear information processing); C6110P
                 (Parallel programming); C7320 (Physics and Chemistry)",
  keywords =     "ACPMAPS; CANOPY; Cooperative processes software;
                 High-energy physics; Lattice QCD; Monte Carlo
                 generation; Offline event reconstruction; Parallel
                 processing; Tightly-coupled machines; Workstation
                 clusters",
  thesaurus =    "Monte Carlo methods; Parallel programming; Physics
                 computing",
}

%%% Journal article (Concurrency, practice and experience, volume 5,
%%% number 2, April 1993, pages 105--131) on parallel distributed
%%% seismic migration.  The keywords list both PVM and Linda.  The
%%% journal value is the string abbreviation j-CPE.
@Article{Almasi:1993:PDS,
  author =       "G. S. Almasi and T. McLuckie and J. Bell and A.
                 Gordon",
  title =        "Parallel distributed seismic migration",
  journal =      j-CPE,
  volume =       "5",
  number =       "2",
  pages =        "105--131",
  month =        apr,
  year =         "1993",
  CODEN =        "CPEXEI",
  ISSN =         "1040-3108",
  ISSN-L =       "1040-3108",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "IBM Thomas J. Watson Res. Center, Yorktown Heights,
                 NY, USA",
  classification = "A9130 (Seismology); C5440 (Multiprocessor systems
                 and techniques); C7340 (Geophysics)",
  fjournal =     "Concurrency, practice and experience",
  keywords =     "15 MFLOPS; Ethernet; IBM RISC/6000 workstations;
                 Linda; Parallel distributed seismic migration;
                 Performance; Programming models; PVM; Remote procedure
                 calls; Token ring",
  numericalindex = "Computer speed 1.5E+07 FLOPS",
  pubcountry =   "UK",
  thesaurus =    "Geophysics computing; Parallel processing;
                 Seismology",
}

%%% Journal article (Parallel Computing, volume 19, number 9, September
%%% 1993, pages 1041--1052) on parallelizing the two-dimensional Ising
%%% model on a workstation cluster with PVM.  The classification field
%%% is a semicolon-separated list of codes, each followed by its
%%% parenthesized description; the A0550 description itself contains a
%%% semicolon, so it must not be split or re-sorted on that character.
@Article{Altevogt:1993:PTD,
  author =       "P. Altevogt and A. Linke",
  title =        "Parallelization of the two-dimensional {Ising} model
                 on a cluster of {IBM RISC System\slash 6000}
                 workstations",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "19",
  number =       "9",
  pages =        "1041--1052",
  month =        sep,
  year =         "1993",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sci. Center, IBM, Heidelberg, Germany",
  classification = "A0550 (Lattice theory and statistics; Ising
                 problems); C5220P (Parallel architecture); C7320
                 (Physics and Chemistry)",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  keywords =     "IBM RISC System/6000 workstations; Metropolis
                 algorithm; Multispin coding; NSC DX Router; PVM
                 programming environment; Token ring; Two-dimensional
                 Ising model",
  pubcountry =   "Netherlands",
  thesaurus =    "Ising model; Physics computing; Reduced instruction
                 set computing",
}

%%% Overview paper on MPI from the Supercomputing '93 proceedings
%%% (pages 878--883), recorded as an article in a proceedings series
%%% with both ISBN and ISSN.  Month and CODEN are unknown and carry
%%% the "????" placeholder.  Keywords merge terms from more than one
%%% indexing source, so variants differing only in capitalization are
%%% kept; exact repeats are not.
@Article{Anonymous:1993:MMP,
  author =       "Anonymous",
  title =        "{MPI}: a message passing interface",
  journal =      j-PROC-SUPERCOMPUT,
  pages =        "878--883",
  month =        "????",
  year =         "1993",
  CODEN =        "????",
  ISBN =         "0-8186-4340-4",
  ISBN-13 =      "978-0-8186-4340-8",
  ISSN =         "1063-9535",
  LCCN =         "QA76.5 .S894 1993",
  bibdate =      "Fri May 24 09:57:40 MDT 1996",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "This paper presents an overview of MPI, a proposed
                 standard message passing interface for MIMD distributed
                 memory concurrent computers. The design of MPI has been
                 a collective effort involving researchers in the United
                 States and Europe from many organizations and
                 institutions. MPI includes point-to-point and
                 collective communication routines, as well as support
                 for process groups, communication contexts, and
                 application topologies. While making use of new ideas
                 where appropriate, the MPI standard is based largely on
                 current practice.",
  acknowledgement = ack-nhfb,
  classification = "723; 902.2; C6150N (Distributed systems software)",
  conference =   "Proceedings of the Supercomputing '93 Conference",
  conferenceyear = "1993",
  fjournal =     "Proceedings of the Supercomputing Conference",
  journalabr =   "Proc Supercomputing Conf",
  keywords =     "Application topologies; application topologies;
                 collective communication routines; Collective
                 communication routines; Communication contexts;
                 communication contexts; Distributed computer systems;
                 message passing; Message passing interface; MIMD
                 distributed memory concurrent computers; MPI; MPI
                 standard; MPI standard overview; Point-to-point
                 communication; point-to-point communication; process
                 groups; Process groups; software standards; standard
                 message passing interface; Standard message passing
                 interface; Standards",
  meetingaddress = "Portland, OR, USA",
  meetingdate =  "Nov 15--19 1993",
  meetingdate2 = "11/15--19/93",
  publisherinfo = "Computer Society Press",
  sponsor =      "IEEE Computer Society; ACM SIGARCH",
  sponsororg =   "IEEE; ACM SIGARCH",
  treatment =    "P Practical",
}

%%% One-page unsigned item titled "Message-Passing Interface" in The
%%% International Journal of Supercomputer Applications, volume 7,
%%% number 2, June 1993 (page 179, written as the range 179--179).
%%% The DOI field stores the full https://doi.org/ resolver URL rather
%%% than the bare DOI.
@Article{Anonymous:1993:MPI,
  author =       "Anonymous",
  title =        "Message-Passing Interface",
  journal =      j-IJSA,
  volume =       "7",
  number =       "2",
  pages =        "179--179",
  month =        jun,
  year =         "1993",
  CODEN =        "IJSAE9",
  DOI =          "https://doi.org/10.1177/109434209300700208",
  ISSN =         "0890-2720",
  bibdate =      "Tue Nov 6 11:28:49 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://journals.sagepub.com/doi/pdf/10.1177/109434209300700208",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of Supercomputer
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

%%% NASA contractor report CR-191425 comparing APPL and PVM for a
%%% parallel unstructured grid generation problem.  The page count is
%%% unknown ("??"); the annote field records that the report was
%%% distributed on microfiche.
@TechReport{Arthur:1993:CUA,
  author =       "Trey Arthur and Michael J. Bockelie",
  title =        "A comparison of using {APPL} and {PVM} for a parallel
                 implementation of an unstructured grid generation
                 problem",
  number =       "NASA CR-191425",
  institution =  "National Aeronautics and Space Administration, Langley
                 Research Center; National Technical Information
                 Service, distributor",
  address =      "Hampton, VA, USA",
  pages =        "??",
  year =         "1993",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "NASA contractor report",
  acknowledgement = ack-nhfb,
  annote =       "Distributed to depository libraries in microfiche.
                 Shipping list no.: 93-1026-M. Microfiche. [Washington,
                 DC: National Aeronautics and Space Administration,
                 1993] 1 microfiche.",
  govtdocnumber = "NAS 1.26:191425 0830-H-14 (MF)",
  keywords =     "Numerical grid generation (Numerical analysis)",
}

%%% Conference paper on a parallel implementation of the VGRIDSG
%%% unstructured grid generation program using PVM and APPL (pages
%%% 899--902); the proceedings are identified by the crossref to
%%% Sincovec:1993:SCP.
@InProceedings{Arthur:1993:PIU,
  author =       "T. Arthur and M. Bockelie",
  title =        "A Parallel Implementation of the Unstructured Grid
                 Generation Program {VGRIDSG} Using {PVM} and {APPL}",
  crossref =     "Sincovec:1993:SCP",
  pages =        "899--902",
  year =         "1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% Paper from the Proceedings of the International Conference on
%%% Parallel Processing (1993), recorded as an article in a proceedings
%%% series.  Only the starting page (I-340) is known; the month is
%%% unknown.  The system name in the title is written with a
%%% superscript 3 in braces so that styles do not change its case.
%%% NOTE(review): the title previously read "P03M", which looks like a
%%% flattened superscript; confirm the name against the proceedings.
@Article{Baiardi:1993:PVM,
  author =       "F. Baiardi and M. Jazayeri",
  title =        "{P$^3$M}: a Virtual Machine Approach to Massively
                 Parallel Computing",
  journal =      j-PROC-INT-CONF-PAR-PROC,
  pages =        "I-340--??",
  month =        "????",
  year =         "1993",
  CODEN =        "PCPADL",
  ISSN =         "0190-3918",
  LCCN =         "QA76.6.I548a",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the International Conference on
                 Parallel Processing",
}

%%% Conference paper comparing PVM and Linda for programming a
%%% workstation cluster (pages 101--114); the proceedings are
%%% identified by the crossref to Anonymous:1993:ISA.
@InProceedings{Baraglia:1993:PWC,
  author =       "R. Baraglia and D. Laforenza and R. Perego",
  title =        "Programming a workstation cluster with {PVM} and
                 {Linda}: a qualitative and quantitative comparison",
  crossref =     "Anonymous:1993:ISA",
  pages =        "101--114",
  year =         "1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% Conference paper on coupling atmosphere and ocean numerical models
%%% with the PVM package (pages 71--75); the proceedings are
%%% identified by the crossref to Sincovec:1993:SCP.
@InProceedings{Barth:1993:CNM,
  author =       "N. H. Barth and S. L. Smith",
  title =        "Coupling Numerical Models of the Atmosphere and Ocean
                 Using the Parallel Virtual Machine ({PVM}) Package",
  crossref =     "Sincovec:1993:SCP",
  pages =        "71--75",
  year =         "1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% Paper on magnetostatic finite-element analysis on MIMD/DMMP
%%% parallel computers (pages 6772--6777).  The entry is typed as a
%%% conference paper with a crossref to Yelon:1993:PTS, yet it also
%%% carries journal, volume, and number fields (Journal of Applied
%%% Physics, volume 73, number 10).
%%% NOTE(review): presumably the proceedings were published as a
%%% journal issue; confirm.  No month is recorded.
@InProceedings{Bedrosian:1993:MFA,
  author =       "G. Bedrosian and R. W. Benway",
  title =        "Magnetostatic finite-element analysis on {MIMD\slash
                 DMMP} parallel computers",
  crossref =     "Yelon:1993:PTS",
  journal =      j-J-APPL-PHYS,
  volume =       "73",
  number =       "10",
  pages =        "6772--6777",
  year =         "1993",
  CODEN =        "JAPIAU",
  ISSN =         "0021-8979 (print), 1089-7550 (electronic), 1520-8850",
  ISSN-L =       "0021-8979",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "GE Corporate Research and Development, Schenectady,
                 NY, USA",
  classification = "A0260 (Numerical approximation and analysis); A4110D
                 (Electrostatics, magnetostatics); B0290T (Finite
                 element analysis); B5120 (Magnetostatics)",
  fjournal =     "Journal of Applied Physics",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=4915369",
  keywords =     "Distributed-memory; H3D; In-house magnetostatic
                 finite-element analysis code; Intel iPSC/860 Hypercube;
                 Local area network; Message-passing; MIMD/DMMP parallel
                 computers; Multiple closely coupled CPUs;
                 Multiple-data; Multiple-instruction; Networks of
                 heterogeneous workstations; Parallel virtual machine;
                 Porting; Supercomputers",
  thesaurus =    "Finite element analysis; Magnetic fields",
}

%%% Conference paper on PVM experiences, current status, and future
%%% direction (pages 765--766); the proceedings are identified by the
%%% crossref to IEEE:1993:PSP.  Keywords merge terms from more than one
%%% indexing source: variants differing only in capitalization are
%%% kept, exact repeats are not.  Some remaining terms (for example
%%% "computing", "package", "scientific") look like fragments of longer
%%% phrases split at line breaks in the source data.
@InProceedings{Beguelin:1993:PEC,
  author =       "A. Beguelin and J. Dongarra and A. Geist and R.
                 Manchek and S. Otto and J. Walpole",
  title =        "{PVM}: {Experiences}, current status and future
                 direction",
  crossref =     "IEEE:1993:PSP",
  pages =        "765--766",
  year =         "1993",
  bibdate =      "Thu Apr 16 08:51:18 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  classification = "C6110P (Parallel programming); C6150N (Distributed
                 systems software); C7320 (Physics and chemistry
                 computing); C7410D (Electronic engineering computing)",
  corpsource =   "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  keywords =     "circuit analysis; Circuit analysis; computational
                 problems; Computational requirements; computational
                 requirements; computing; computing requirements;
                 Computing requirements; concurrent; Concurrent
                 computing; concurrent computing; concurrent processing;
                 Concurrent processing; electronic engineering
                 computing; engineering design; Engineering design;
                 Hardware multiprocessors; hardware multiprocessors;
                 High-performance applications; high-performance
                 applications; Integration aspects; integration aspects;
                 material sciences; Material sciences; multiprocessing
                 programs; package; parallel processors; Parallel
                 processors; parallel programming; parallel virtual
                 machine; Parallel virtual machine; performance
                 applications; Physical sciences; physical sciences;
                 physics computing; PVM; scientific; Scientific
                 computational problems; scientific computational
                 problems; Simulation; simulation; software; Software
                 package; software package; software packages",
  sponsororg =   "IEEE; ACM SIGARCH",
  treatment =    "P Practical",
}

%%% Chapter on PVM and HeNCE in a collection edited by Kowalik and
%%% Grandinetti; the book is identified by the crossref to
%%% Kowalik:1993:SPC.  The page range is unknown ("??--??").
@InCollection{Beguelin:1993:PHT,
  author =       "A. Beguelin and J. Dongarra and A. Geist and R.
                 Manchek and K. Moore and V. Sunderam",
  editor =       "J. S. Kowalik and L. Grandinetti",
  title =        "{PVM} and {HeNCE}: Tools for Heterogeneous Network
                 Computing",
  crossref =     "Kowalik:1993:SPC",
  pages =        "??--??",
  year =         "1993",
  bibdate =      "Tue Feb 26 10:10:44 2002",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Parallel/Par.Arch.Indep.bib;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 Parallel/Par.Arch.Indep.bib",
  acknowledgement = ack-nhfb,
}

%%% Journal article (Computer, volume 26, number 6, June 1993, pages
%%% 88--95) on visualization and debugging in a heterogeneous
%%% environment; per the abstract it describes a monitoring tool and a
%%% graphical interface that run on top of PVM.
@Article{Beguelin:1993:VDH,
  author =       "Adam Beguelin and Jack Dongarra and Al Geist and V.
                 Sunderam",
  title =        "Visualization and Debugging in a Heterogeneous
                 Environment",
  journal =      j-COMPUTER,
  volume =       "26",
  number =       "6",
  pages =        "88--95",
  month =        jun,
  year =         "1993",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib; UnCover
                 library database",
  abstract =     "A monitoring tool and a graphical interface working on
                 top of the PVM software can help programmers make
                 better use of heterogeneous networks of computers.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  classification = "C6115 (Programming support); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C6150N
                 (Distributed systems)",
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
  keywords =     "Debugging; Graphical monitoring package; Graphical
                 programming environment; Hence; Heterogeneous
                 distributed programs; Heterogeneous environment;
                 Parallel virtual machine; Program visualisation; Xab",
  thesaurus =    "Multiprocessing programs; Open systems; Parallel
                 programming; Program debugging; Software tools; System
                 monitoring; Virtual machines; Visual programming",
}

%%% Conference paper on Xab, a tool for monitoring PVM programs (pages
%%% 92--97); the proceedings are identified by the crossref to
%%% IEEE:1993:WHP.  Two other entries in this file, Beguelin:1993:XTMa
%%% and Beguelin:1993:XTMb, carry the same title.
@InProceedings{Beguelin:1993:XAT,
  author =       "Adam Beguelin",
  title =        "Xab: a Tool for Monitoring {PVM} Programs",
  crossref =     "IEEE:1993:WHP",
  pages =        "92--97",
  year =         "1993",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 Parallel/debug_3.1.bib",
}

%%% Carnegie Mellon research paper CMU-CS-93-164 (8 pages) describing
%%% Xab, a run-time monitoring tool for PVM programs.  The institution
%%% and address values are string abbreviations.  The annote field
%%% records that the paper also appears in the proceedings of the
%%% April 1993 Workshop on Heterogeneous Processing.
@TechReport{Beguelin:1993:XTMa,
  author =       "Adam L. Beguelin",
  title =        "Xab: a tool for monitoring {PVM} programs",
  type =         "Research paper",
  number =       "CMU-CS-93-164",
  institution =  inst-SCS-CMU,
  address =      inst-SCS-CMU:adr,
  pages =        "8",
  year =         "1993",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 Techreports/tr.misc.bib",
  abstract =     "Xab (X-window Analysis and deBugging) is a tool for
                 run time monitoring of PVM (Parallel Virtual Machine)
                 programs. PVM supports the programming of a network of
                 heterogeneous computers as a single parallel computer.
                 Using Xab, PVM programs can easily be instrumented and
                 monitored. Xab uses PVM to monitor PVM programs. This
                 makes Xab very portable but it leads to interesting
                 issues of how to make Xab peacefully coincide with the
                 programs it monitors. Xab consists of three main
                 components, a user library, a monitoring program, and
                 an X windows front end. The user library provides
                 instrumented versions of the PVM calls. The monitoring
                 program runs as a PVM process and gathers monitor
                 events in the form of PVM messages. The Xab front end
                 displays information graphically about PVM processes
                 and messages. This paper discusses the design,
                 implementation, and use of the Xab tool. Related work
                 is briefly presented and contrasted with the approach
                 taken with Xab. How Xab works and how it is used are
                 discussed in detail. Finally, the current status of Xab
                 is presented along with future directions of where the
                 research may go from here.",
  acknowledgement = ack-nhfb,
  annote =       "This paper also appears in the proceedings of the
                 April 1993 Workshop on Heterogeneous Processing, IEEE
                 Computer Society Press. June 2, 1993.",
  keywords =     "Debugging in computer science; Parallel programming
                 (Computer science)",
}

%%% Conference paper on Xab in volume 2 of the proceedings identified
%%% by the crossref to Mudge:1993:PTS.  The pages field carries
%%% free-text alternatives ("102--103 (vol. 2) (or 4--??)"), so the
%%% pagination is uncertain and the value is not a plain range.
@InProceedings{Beguelin:1993:XTMb,
  author =       "A. L. Beguelin",
  title =        "Xab: a tool for monitoring {PVM} programs",
  crossref =     "Mudge:1993:PTS",
  volume =       "2",
  pages =        "102--103 (vol. 2) (or 4--??)",
  year =         "1993",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  classification = "C6115 (Programming support); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C7430
                 (Computer engineering)",
  corpsource =   "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  keywords =     "Feedback; feedback; Heterogeneity; heterogeneity;
                 heterogeneous; Heterogeneous multiprogramming
                 environment; Monitoring PVM programs; monitoring PVM
                 programs; multiprogramming; multiprogramming
                 environment; Parallel virtual machine; parallel virtual
                 machine; performance evaluation; program testing; Run
                 time monitoring tool; run time monitoring tool;
                 software tools; virtual machines; Xab",
  sponsororg =   "ACM; IEEE",
  thesaurus =    "Multiprogramming; Performance evaluation; Program
                 testing; Software tools; Virtual machines",
  treatment =    "P Practical",
}

%%% Conference paper on a model of computation with parallel solvers
%%% (pages 189--198); the proceedings are identified by the crossref
%%% to Anonymous:1993:SEC.  The keywords mention message passing and
%%% distributed memory computers but not PVM or MPI by name.
@InProceedings{Castro-Leon:1993:MCP,
  author =       "E. Castro-Leon",
  title =        "A model of computation with parallel solvers",
  crossref =     "Anonymous:1993:SEC",
  pages =        "189--198",
  year =         "1993",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Intel Supercomputer Syst. Div., Beaverton, OR, USA",
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support)",
  keywords =     "Data parallel programming; Distributed memory
                 computers; Message passing; Parallel libraries;
                 Parallel solvers; Performance; Programming
                 environments; Rehosting",
  thesaurus =    "Distributed memory systems; Parallel programming;
                 Programming environments",
}

%%% Master's thesis (University of Texas at San Antonio, 1993,
%%% vi + 228 pages) on an asynchronous parallel virtual machine.
%%% Keywords are a semicolon-separated list with no terminating
%%% punctuation, as in the other entries of this file.
@MastersThesis{Cavender:1993:APV,
  author =       "Mark Edward Cavender",
  title =        "Asynchronous parallel virtual machine",
  type =         "M.S. thesis",
  school =       "University of Texas at San Antonio. Division of
                 Mathematics and Computer Science and Statistics",
  address =      "San Antonio, TX, USA",
  pages =        "vi + 228",
  year =         "1993",
  bibdate =      "Mon Jan 15 18:16:25 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Parallel processing (Electronic computers); Virtual
                 computer systems",
}

%%% Conference paper on ray tracing and binary tree computations using
%%% PVM, in volume 2 of the proceedings identified by the crossref to
%%% Mudge:1993:PTS.  The pages field includes a "(vol. 2)" annotation
%%% after the range 104--105.
@InProceedings{Chandrasekharan:1993:RTB,
  author =       "N. Chandrasekharan and V. Goel",
  title =        "Ray tracing and binary tree computations using {PVM}",
  crossref =     "Mudge:1993:PTS",
  volume =       "2",
  pages =        "104--105 (vol. 2)",
  year =         "1993",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Univ. of Central Florida,
                 Orlando, FL, USA",
  classification = "C6130B (Graphics techniques); C6150J (Operating
                 systems); C7430 (Computer engineering)",
  corpsource =   "Dept. of Comput. Sci., Univ. of Central Florida,
                 Orlando, FL, USA",
  keywords =     "Binary tree computations; binary tree computations;
                 Computational problems; computational problems;
                 Parallel virtual machine; parallel virtual machine;
                 problem; PVM; ray; Ray tracing; ray tracing; rendering
                 (computer graphics); rendering computer; Rendering
                 computer synthesized images; scheduling; Scheduling
                 technique; scheduling technique; synthesized images;
                 tracing; tree contraction; Tree contraction problem;
                 virtual machines",
  sponsororg =   "ACM; IEEE",
  thesaurus =    "Ray tracing; Rendering [computer graphics];
                 Scheduling; Virtual machines",
  treatment =    "A Application; P Practical",
}

%%% Journal article (ACM SIGPLAN Notices, volume 28, number 7, July
%%% 1993, pages 149--158) on generating local addresses and
%%% communication sets for data-parallel programs.  The fjournal value
%%% contains a discretionary hyphen (\-) inside braces.
@Article{Chatterjee:1993:GLA,
  author =       "S. Chatterjee and J. R. Gilbert and F. J. E. Long and
                 R. Schreiber and S.-H. Teng",
  title =        "Generating local addresses and communication sets for
                 data-parallel programs",
  journal =      j-SIGPLAN,
  volume =       "28",
  number =       "7",
  pages =        "149--158",
  month =        jul,
  year =         "1993",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "NASA Ames Res. Center, Moffett Field, CA, USA",
  classification = "C4220 (Automata theory); C6110P (Parallel
                 programming); C6140D (High level languages)",
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Communication sets; Data-parallel languages;
                 Distributed-memory implementations; Fast algorithms;
                 Local memory access sequence; Multidimensional arrays;
                 State machines",
  thesaurus =    "Distributed memory systems; Finite automata; FORTRAN;
                 Parallel programming",
}

%%% Conference paper on star modeling on IBM RS6000 networks using PVM
%%% (pages 121--128); the proceedings are identified by the crossref
%%% to IEEE:1993:PIS.  Keywords merge terms from more than one
%%% indexing source, so case variants and phrase fragments both occur.
@InProceedings{Colombet:1993:SMI,
  author =       "L. Colombet and L. Desbat and F. Menard",
  title =        "Star Modeling on {IBM RS6000} Networks Using {PVM}",
  crossref =     "IEEE:1993:PIS",
  pages =        "121--128",
  year =         "1993",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "LMC-IMAG, Grenoble, France",
  classification = "C5220P (Parallel architecture); C5470 (Performance
                 evaluation and testing)",
  corpsource =   "LMC-IMAG, Grenoble, France",
  keywords =     "architectures; evaluation; Heterogeneous networks;
                 heterogeneous networks; heterogeneous parallel;
                 Heterogeneous parallel architectures; IBM RS6000; IBM
                 RS6000 networks; Monte Carlo methods; Monte Carlo
                 radiative transfer code; networks; parallel; parallel
                 architectures; Parallel performances; parallel virtual
                 machine; Parallel virtual machine; performance;
                 performances; PVM; star modelling; Star modelling",
  sponsororg =   "IEEE; Washington State Univ.; NPAC at Syracuse Univ.;
                 ACM; Washington Technol. Center",
  thesaurus =    "Monte Carlo methods; Parallel architectures;
                 Performance evaluation",
  treatment =    "P Practical",
}

%%% Conference paper on parallelizing a mesh optimization code on an
%%% IBM RS/6000 cluster with PVM (pages 185--212); the proceedings are
%%% identified by the crossref to Anonymous:1993:PSE.  In the title the
%%% machine name is one brace group, RS\slash 6000, so that it is
%%% typeset without a gap after the slash and is protected as a unit
%%% from style recasing.
@InProceedings{Coussement:1993:PMO,
  author =       "G. Coussement",
  title =        "Parallelization of a mesh optimization code on a
                 {RS\slash 6000} cluster",
  crossref =     "Anonymous:1993:PSE",
  pages =        "185--212",
  year =         "1993",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Aerodynamics Dept., Office Nat. d'Etudes et de
                 Recherches Aerospatiales, Chatillon, France",
  classification = "C1180 (Optimisation techniques); C6110P (Parallel
                 programming)",
  keywords =     "Code structure; Communication protocol; IBM RS/6000;
                 Multi-domain structured mesh optimization code;
                 OPTIM3D; Parallelization effort; PVM; Three-dimensional
                 mesh optimization method",
  thesaurus =    "IBM computers; Optimisation; Parallel programming",
}

%%% Journal article (ACM SIGPLAN Notices, volume 28, number 7, July
%%% 1993, pages 1--12) presenting the LogP model of parallel
%%% computation.  The confdate, conflocation, and confsponsor fields
%%% record an associated conference (San Diego, 19-22 May 1993).
@Article{Culler:1993:LTR,
  author =       "David E. Culler and Richard M. Karp and David A.
                 Patterson and Abhijit Sahay and Klaus E. Schauser and
                 Eunice Santos and Ramesh Subramonian and Thorsten von
                 Eicken",
  title =        "{LogP}: towards a realistic model of parallel
                 computation",
  journal =      j-SIGPLAN,
  volume =       "28",
  number =       "7",
  pages =        "1--12",
  month =        jul,
  year =         "1993",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 14 18:49:37 MST 1995",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Div. of Comput. Sci., California Univ., Berkeley, CA,
                 USA",
  classification = "C5440 (Multiprocessor systems and techniques);
                 C6110P (Parallel programming); C7430 (Computer
                 engineering)",
  confdate =     "19-22 May 1993",
  conflocation = "San Diego, CA, USA",
  confsponsor =  "ACM",
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "CM-5; Communication bandwidth; Communication delay;
                 Computing bandwidth; LogP; Machine configuration;
                 Machine designers; Parallel computers; Parallel machine
                 model; Portable parallel algorithms",
  thesaurus =    "Parallel algorithms; Parallel machines; Parallel
                 programming; Virtual machines",
}

@InProceedings{daCunha:1993:PLA,
  author =       "R. D. da Cunha and T. Hopkins",
  title =        "Porting linear algebra subroutines from transputers to
                 clusters of workstations",
  crossref =     "Grebe:1993:TAS",
  pages =        "660--667",
  year =         "1993",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Comput. Lab., Kent Univ., Canterbury, UK",
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessor systems and techniques); C6110B
                 (Software engineering techniques); C6150N (Distributed
                 systems); C7310 (Mathematics)",
  keywords =     "Fortran77; Linear algebra subroutines; Message-passing
                 system; Occam2; Parallel Virtual Machine; PVM;
                 Subroutine porting; Transputers; Workstation clusters",
  thesaurus =    "FORTRAN; Linear algebra; Mathematics computing;
                 Message passing; Occam; Software portability;
                 Subroutines; Transputer systems",
}

@Article{Damodaran-Kamal:1993:NTD,
  author =       "S. K. Damodaran-Kamal and J. M. Francioni",
  title =        "Nondeterminacy: testing and debugging in message
                 passing parallel programs",
  journal =      j-SIGPLAN,
  volume =       "28",
  number =       "12",
  pages =        "118--128",
  month =        dec,
  year =         "1993",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Southwestern Louisiana Univ.,
                 Lafayette, LA, USA",
  classification = "C6110P (Parallel programming); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C6150N
                 (Distributed systems)",
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Debugging tool; Mdb; Message passing parallel
                 programs; Nondeterminacy; Parallel program; Program
                 errors; PVM programs; Testing tool",
  thesaurus =    "Message passing; Parallel programming; Program
                 debugging; Program testing",
}

@InProceedings{Despons:1993:CCP,
  author =       "R. Despons and T. Muntean",
  title =        "Constructing correct protocols for a diffusion virtual
                 machine in message passing parallel architectures",
  crossref =     "Grebe:1993:TAS",
  pages =        "465--480",
  year =         "1993",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessor systems and techniques); C5640
                 (Protocols); C6150N (Distributed systems); C7430
                 (Computer engineering)",
  corpsource =   "IMAG-LGI Lab., Grenoble Univ., France",
  keywords =     "architectures; communication protocols; diffusion
                 protocols; diffusion virtual machine; machines;
                 massively parallel architectures; message passing;
                 parallel; parallel algorithms; parallel applications;
                 parallel architectures; parallel machines; programming
                 environments; protocols; virtual machines",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
}

@InProceedings{Dongarra:1993:DSM,
  author =       "J. J. Dongarra and R. Hempel and A. J. G. Hey and D.
                 W. Walker",
  title =        "A draft standard for message passing in a distributed
                 memory environment",
  crossref =     "Hoffmann:1993:PFE",
  pages =        "465--481",
  year =         "1993",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA",
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessor systems and techniques); C6150N
                 (Distributed systems)",
  keywords =     "C language; Data distribution transformations;
                 Distributed memory environment; Draft standard; Fortran
                 77; Library interface standard; Message passing;
                 Message Passing Interface 1; Message selectivity;
                 Message type; MPI1; Source process",
  thesaurus =    "Distributed memory systems; Message passing",
}

@Article{Dongarra:1993:IPF,
  author =       "Jack Dongarra and G. A. Geist and Robert Manchek and
                 V. S. Sunderam",
  title =        "Integrated {PVM} Framework Supports Heterogeneous
                 Network Computing",
  journal =      j-COMPUT-PHYS,
  volume =       "7",
  number =       "2",
  pages =        "166--175",
  month =        mar # "--" # apr,
  year =         "1993",
  CODEN =        "CPHYE2",
  ISSN =         "0894-1866 (print), 1558-4208 (electronic)",
  ISSN-L =       "0894-1866",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Tennessee Univ., Knoxville, TN, USA",
  classification = "C6110P (Parallel programming); C6150N (Distributed
                 systems); C7300 (Natural sciences); C7430 (Computer
                 engineering)",
  corpsource =   "Tennessee Univ., Knoxville, TN, USA",
  fjournal =     "Computers in Physics",
  keywords =     "computer networks; computing; Concurrent applications;
                 concurrent applications; distributed processing;
                 Heterogeneous network computing; heterogeneous network
                 computing; Integrated framework; integrated framework;
                 natural sciences; parallel programming; Parallel
                 virtual machine software; parallel virtual machine
                 software; Scientific computations; scientific
                 computations; virtual machines",
  thesaurus =    "Computer networks; Distributed processing; Natural
                 sciences computing; Parallel programming; Virtual
                 machines",
  treatment =    "G General Review; P Practical",
}

@TechReport{Dongarra:1993:PUM,
  author =       "J. Dongarra and R. Hempel and A. Hey and D. Walker",
  title =        "A Proposal for a User-Level Message Passing Interface
                 in a Distributed Memory Environment",
  type =         "Technical Report",
  number =       "ORNL/TM-12231",
  institution =  inst-ORNL,
  address =      inst-ORNL:adr,
  month =        feb,
  year =         "1993",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Parallel/par.lin.alg.bib;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 Parallel/par.lin.alg.bib",
}

@InProceedings{Dongarra:1993:UPR,
  author =       "J. J. Dongarra and A. Geist and R. Manchek and W.
                 Jiang",
  title =        "Using {PVM} 3.0 to Run Grand Challenge Applications on
                 a Heterogeneous Network of Parallel Computers",
  crossref =     "Sincovec:1993:SCP",
  pages =        "873--877",
  year =         "1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Ewing:1993:DCW,
  author =       "R. E. Ewing and D. Mitchum and P. O'Leary and R. C.
                 Sharpley and J. S. Sochacki",
  title =        "Distributed Computation of Wave Propagation Models
                 Using {PVM}",
  crossref =     "IEEE:1993:PSP",
  pages =        "22--31",
  year =         "1993",
  bibdate =      "Wed Apr 15 12:04:03 MDT 1998",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Texas A\&M Univ",
  affiliationaddress = "College Station, TX, USA",
  classification = "484.1; 723; 921; C5440 (Multiprocessing systems);
                 C6110P (Parallel programming); C7340 (Geophysics
                 computing); C7430 (Computer engineering)",
  corpsource =   "Inst. for Sci. Comput., Texas A and M Univ., College
                 Station, TX, USA",
  keywords =     "Computer simulation; Computer workstations;
                 distributed computation; distributed memory systems;
                 Earth; geophysics computing; handling large-scale
                 problems; IBM RS/6000s; Large-earth models; large-scale
                 computations; Large-scale problems; nodes; numerical
                 approximation; parallel processing; parallel processing
                 environment; Parallel processing environment; Parallel
                 processing systems; Parallel Virtual Machine; Parallel
                 virtual machine (PVM); PVM; Seismic wave propagation;
                 seismic waves; Seismic waves; supercomputers; timings;
                 virtual machines; visualization; wave propagation; Wave
                 propagation; wave propagation models; Wave propagation
                 models; workstations",
  sponsororg =   "IEEE; ACM SIGARCH",
  treatment =    "P Practical",
}

@InProceedings{Fritscher:1993:PDC,
  author =       "J. F. Fritscher and F. Sukup",
  title =        "{93SC038} Parallel Distributed Computing Using {PVM}",
  crossref =     "Anonymous:1993:ATA",
  pages =        "221--228",
  year =         "1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Geist:1993:EPC,
  author =       "G. A. Geist and V. S. Sunderam",
  title =        "The evolution of the {PVM} concurrent computing
                 system",
  crossref =     "IEEE:1993:DPC",
  pages =        "549--557",
  year =         "1993",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Oak Ridge Nat. Lab., TN, USA",
  classification = "C5440 (Multiprocessor systems and techniques);
                 C6110P (Parallel programming); C7430 (Computer
                 engineering)",
  corpsource =   "Oak Ridge Nat. Lab., TN, USA",
  keywords =     "future; Future trends; high-performance computations;
                 High-performance computations; historical evolution;
                 Historical evolution; networked environments; Networked
                 environments; ongoing research projects; Ongoing
                 research projects; parallel; parallel machines;
                 parallel programming; Parallel programming; Parallel
                 Virtual Machine; programming; programming model;
                 Programming model; PVM concurrent computing system;
                 scientific; Scientific supercomputing; software
                 infrastructure; Software infrastructure;
                 supercomputing; trends; virtual machines",
  thesaurus =    "Parallel machines; Parallel programming; Virtual
                 machines",
  treatment =    "P Practical",
}

@InProceedings{Geist:1993:ILP,
  author =       "G. A. Geist",
  title =        "Invited Lecture: {PVM} 3 Beyond Network Computing",
  crossref =     "Volkert:1993:PCS",
  pages =        "194--203",
  year =         "1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Geist:1993:PBN,
  author =       "G. A. Geist",
  title =        "{PVM} 3 beyond network computing",
  crossref =     "Volkert:1993:PCS",
  pages =        "194--203",
  year =         "1993",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Oak Ridge Nat. Lab., TN, USA",
  classification = "C6115 (Programming support); C6150N (Distributed
                 systems); C7430 (Computer engineering)",
  corpsource =   "Oak Ridge Nat. Lab., TN, USA",
  keywords =     "distributed computing; Distributed computing;
                 distributed memory computer; Distributed memory
                 computer; distributed memory systems; heterogeneous
                 network computing; Heterogeneous network computing;
                 machines; message-; Message-passing constructs;
                 parallel machines; Parallel Virtual Machine; passing
                 constructs; PVM 3; software package; Software package;
                 virtual",
  pubcountry =   "Germany",
  thesaurus =    "Distributed memory systems; Parallel machines; Virtual
                 machines",
  treatment =    "P Practical",
}

@InProceedings{Geist:1993:PTW,
  author =       "A. Geist and J. Dongarra and A. Beguelin and B.
                 Manchek and Weicheng Jiang",
  title =        "{PVM} takes over the world",
  crossref =     "IEEE:1993:PSP",
  pages =        "618--618",
  year =         "1993",
  DOI =          "https://doi.org/10.1109/SUPERC.1993.1263513",
  bibdate =      "Fri May 27 10:20:49 2005",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Glendinning:1993:MMP,
  author =       "I. Glendinning",
  title =        "{93SC041} The {MPI} Message Passing Interface",
  crossref =     "Anonymous:1993:ATA",
  pages =        "229--236",
  year =         "1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Hariri:1993:MPI,
  author =       "S. Hariri and J. B. Park and F.-K. Yu and M. Parashar
                 and G. C. Fox",
  title =        "A message passing interface for parallel and
                 distributed computing",
  crossref =     "IEEE:1993:PIS",
  pages =        "84--91",
  year =         "1993",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "B6150M (Protocols); C5640 (Protocols); C5670
                 (Network performance)",
  corpsource =   "Northeast Parallel Archit. Center, Syracuse Univ., NY,
                 USA",
  keywords =     "architectural support; communication protocol;
                 distributed computing; distributed processing; gigabit
                 networks; message passing; message passing interface;
                 parallel computing; parallel processing; performance
                 evaluation; protocols; supercomputing capabilities",
  sponsororg =   "IEEE; Washington State Univ.; NPAC at Syracuse Univ.;
                 ACM; Washington Technol. Center",
  treatment =    "P Practical",
}

@InProceedings{Hartley:1993:CPS,
  author =       "C. L. Hartley and V. S. Sunderam",
  title =        "Concurrent programming with shared objects in
                 networked environments",
  crossref =     "IEEE:1993:PSI",
  pages =        "471--478",
  year =         "1993",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
                 GA, USA",
  classification = "C6110J (Object-oriented programming); C6110P
                 (Parallel programming); C6115 (Programming support);
                 C6150N (Distributed systems)",
  keywords =     "Application development; Concurrent programming;
                 Distributed computing; Ease of use; Message passing;
                 Networked computing platforms; Networked environments;
                 Object-oriented techniques; Partitioning; Portable
                 software systems; PVM distributed computing system;
                 Scheduling; Shared objects; Shared-object concurrent
                 computation; Synchronization; Toolkit",
  thesaurus =    "Multiprocessing programs; Object-oriented programming;
                 Parallel programming; Software tools",
}

@InProceedings{Hebeker:1993:CPC,
  author =       "F.-K. Hebeker",
  title =        "On a coarse-grained parallel code to simulate reactive
                 flows on an {IBM RS\slash 6000} workstation-cluster",
  crossref =     "Brebbia:1993:ASE",
  pages =        "253--262",
  year =         "1993",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "IBM Heidelberg Sci. Center, Germany",
  classification = "A4710 (General fluid dynamics theory, simulation and
                 other computational methods); A4770F (Chemically
                 reactive flows); C6110P (Parallel programming); C7440
                 (Civil and mechanical engineering computing); C7450
                 (Chemical engineering computing)",
  keywords =     "Algorithmic development; Chemical source terms;
                 Coarse-grained parallel code; Compressible flow;
                 Compressible Navier--Stokes equations; Domain splitting
                 techniques; Engineering-mathematical modelling; Global
                 exothermic reaction chemistry; IBM RS/6000 workstation
                 cluster; Internal combustion engines; Knock damage;
                 Message passing; Numerical simulation; Optimally
                 adapted code; Performance measurements; PVM programming
                 environment; Reactive flow simulation; Semi-implicit
                 treatment; Shock-capturing finite-volume scheme",
  thesaurus =    "Chemical engineering computing; Chemically reactive
                 flow; Digital simulation; Distributed algorithms; Flow
                 simulation; IBM computers; Internal combustion engines;
                 Mechanical engineering computing; Message passing;
                 Parallel programming",
}

@Article{Jesshope:1993:LRV,
  author =       "C. Jesshope",
  title =        "Latency reduction in {VLSI} routers",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "3",
  number =       "4",
  pages =        "485--494",
  month =        dec,
  year =         "1993",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Electron. and Electr. Eng., Surrey Univ.,
                 Guildford, UK",
  classification = "C5220P (Parallel architecture); C5470 (Performance
                 evaluation and testing)",
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
  keywords =     "Latency reduction; MPI router chip; Parallel
                 computers; Scalable performance; VLSI routers",
  pubcountry =   "Singapore",
  thesaurus =    "Fault tolerant computing; Parallel architectures;
                 Performance evaluation; VLSI",
}

@InProceedings{Jesshope:1993:MCA,
  author =       "C. Jesshope",
  title =        "The {MPI} Chip and its Applications",
  crossref =     "Anonymous:1993:JFI",
  pages =        "47--54",
  year =         "1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@MastersThesis{Katamneni:1993:PPE,
  author =       "Sreevenu Katamneni",
  title =        "Parallel processing extensions to {Verilog HDL} using
                 the {PVM} environment",
  type =         "M.S.E.E. thesis",
  school =       inst-UAL-EE,
  address =      inst-UAL-EE:adr,
  pages =        "viii + 108",
  year =         "1993",
  bibdate =      "Mon Jan 15 18:16:30 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer hardware description languages.; Parallel
                 processing (Electronic computers); Verilog (Computer
                 hardware description language); Virtual computer
                 systems.",
}

@Article{Kikuchi:1993:PAS,
  author =       "S. Kikuchi",
  title =        "Parallelization assist system",
  journal =      j-JOHO-SHORI,
  volume =       "34",
  number =       "9",
  pages =        "1158--1169",
  month =        sep,
  year =         "1993",
  CODEN =        "JOSHA4",
  ISSN =         "0447-8053",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Central Res. Lab., Hitachi Ltd, Tokyo, Japan",
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support)",
  fjournal =     "Joho-Shori (J. Information Processing Soc. Japan)",
  keywords =     "ASPAR; Express; Flow-insensitive systems;
                 Flow-sensitive systems; FORGE90; KAP; Message passing
                 interface standards; MIMDizer; ParaGraph; Parallel
                 Fortran Converter; Parallelization assistance system;
                 ParaScope Editor; Parassist; Perfect club benchmarks;
                 PIE; Portable Instrumented Communication Library;
                 Profiling tools; PTOOL; SUPERB; SUPRENUM FORTRAN;
                 Transformations; VAST",
  language =     "Japanese",
  pubcountry =   "Japan",
  thesaurus =    "FORTRAN; Parallel programming; Reduced instruction set
                 computing; Software tools",
}

@Article{Kranz:1993:IMP,
  author =       "David Kranz and Kirk L. Johnson and Anant Agarwal and
                 John Kubiatowicz and Beng-Hong Lim",
  title =        "Integrating message-passing and shared-memory: early
                 experience",
  journal =      j-SIGPLAN,
  volume =       "28",
  number =       "7",
  pages =        "54--63",
  month =        jul,
  year =         "1993",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 14 18:49:37 MST 1995",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "A discussion is given on some of the issues involved
                 in implementing a shared-address space programming
                 model on large-scale, distributed-memory
                 multiprocessors. While such a programming model can be
                 implemented on both shared-memory and message-passing
                 architectures, the authors argue that the transparent,
                 coherent caching of global data provided by many
                 shared-memory architectures is of crucial importance.
                 Because message-passing mechanisms are much more
                 efficient than shared-memory loads and stores for
                 certain types of interprocessor communication and
                 synchronization operations, however, the authors argue
                 for building multiprocessors that efficiently support
                 both shared-memory and message-passing mechanisms. The
                 authors describe an architecture, Alewife, that
                 integrates support for shared-memory and
                 message-passing through a simple interface; they expect
                 the compiler and runtime system to cooperate in using
                 appropriate hardware mechanisms that are most efficient
                 for specific operations. They report on both integrated
                 and exclusively shared-memory implementations of the
                 runtime system and two applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  classification = "C5440 (Multiprocessor systems and techniques);
                 C6110P (Parallel programming)",
  confdate =     "19-22 May 1993",
  conflocation = "San Diego, CA, USA",
  confsponsor =  "ACM",
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Alewife; Coherent caching; Compiler;
                 Distributed-memory multiprocessors; Exclusively
                 shared-memory implementations; Global data; Hardware
                 mechanisms; Interprocessor communication;
                 Message-passing architectures; Message-passing
                 mechanisms; Runtime system; Shared-address space
                 programming model; Shared-memory architectures;
                 Shared-memory loads; Synchronization operations",
  thesaurus =    "Message passing; Parallel programming; Shared memory
                 systems",
}

@TechReport{Leon:1993:FPA,
  author =       "J. Leon and A. L. Fisher and P. Steenkiste",
  title =        "Fail-safe {PVM}: a portable package for distributed
                 programming with transparent recovery",
  number =       "CMU-CS-93-124",
  institution =  "Carnegie-Mellon University, Department of Computer
                 Science",
  year =         "1993",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 Techreports/tr.misc.bib",
}

@TechReport{Leon:1993:FPP,
  author =       "Juan Leon and Allan L. Fisher and Peter Alfons
                 Steenkiste",
  title =        "Fail-safe {PVM}: a portable package for distributed
                 programming with transparent recovery",
  number =       "CMU-CS-93-124",
  institution =  inst-SCS-CMU,
  address =      inst-SCS-CMU:adr,
  pages =        "22",
  month =        feb,
  year =         "1993",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "[Research paper] / Carnegie Mellon University. School
                 of Computer Science; CMU-CS-93-124 Research paper
                 (Carnegie Mellon University. School of Computer
                 Science); CMU-CS-93-124",
  abstract =     "``Many scientific problems benefit from computations
                 that are parallel at a coarse grain. Collections of
                 loosely-coupled, heterogeneous computers are
                 increasingly being applied to these problems. While
                 individual computers are designed to be relatively
                 reliable, a collection of several autonomous machines
                 necessarily has a greater rate of failure. As data
                 networks improve, and larger multicomputers are being
                 used, rates of failure will increase. PVM (Parallel
                 Virtual Machine) [Sun90, GS92] is a popular software
                 framework that facilitates message-passing network
                 programming. We present enhancements to PVM to mask
                 fail-stop, single-node failures from the application.
                 Fail-safe PVM uses checkpoint and rollback to recover
                 from such failures. Both checkpoints and rollbacks are
                 transparent to the application if the application does
                 not depend on real-time events. Recovery occurs without
                 wait for repair of the failed computer. The system does
                 not rely on shared stable storage and does not require
                 modifications to the operating system. We describe the
                 design and implementation of fail-safe PVM, present
                 meassurements [sic] of checkpoint costs, and briefly
                 discuss shortcomings and potential avenues for
                 improvement.'' Supported in part by the Defense
                 Advanced Research Projects Agency, issued by
                 DARPA/CMO.",
  acknowledgement = ack-nhfb,
  annote =       "February 1993.",
  keywords =     "Fault-tolerant computing",
}

@InProceedings{Levesque:1993:SAA,
  author =       "J. M. Levesque and R. Friedman",
  title =        "The state of the art in automatic parallelisation",
  crossref =     "Anonymous:1993:SEC",
  pages =        "95--107",
  year =         "1993",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Appl. Parallel Res. Inc., Placerville, CA, USA",
  classification = "C6110P (Parallel programming)",
  keywords =     "Automatic parallelisation; Data Distribution
                 Directives; Distributed memory; Fortran programs;
                 Parallelization; Shared memory; User assistance",
  thesaurus =    "FORTRAN; Parallel programming",
}

@InProceedings{Lewis:1993:PCP,
  author =       "M. J. Lewis and R. E. {Cline, Jr.}",
  title =        "{PVM} Communication Performance in a Switched {FDDI}
                 Heterogeneous Distributed Computing Environment",
  crossref =     "Bhargava:1993:PIW",
  pages =        "13--19",
  year =         "1993",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sandia Nat. Labs., Livermore, CA, USA",
  classification = "C5440 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C5620L (Local area networks);
                 C5640 (Protocols)",
  corpsource =   "Sandia Nat. Labs., Livermore, CA, USA",
  keywords =     "distributed processing; distributed systems;
                 Distributed systems; FDDI; heterogeneous distributed
                 computing; Heterogeneous distributed computing; local
                 area networks; machines; message; message passing
                 system; Message passing system; parallel; parallel
                 machines; Parallel programs; Parallel Virtual Machine;
                 passing; performance evaluation; programs; PVM;
                 switched FDDI; Switched FDDI; virtual",
  sponsororg =   "IEEE",
  thesaurus =    "Distributed processing; FDDI; Local area networks;
                 Message passing; Parallel machines; Performance
                 evaluation; Virtual machines",
  treatment =    "P Practical",
}

@InProceedings{Li:1993:MSU,
  author =       "Q. Li and T. G. Yip",
  title =        "Monitoring Systems Using {PVM}",
  crossref =     "Law:1993:EDM",
  pages =        "781--785",
  year =         "1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Li:1993:SLL,
  author =       "Q. Li and J.-C. Liu and T. G. Yip",
  title =        "Solving Large Linear Equations Using {PVM} System",
  crossref =     "Law:1993:EDM",
  pages =        "685--690",
  year =         "1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Loyot:1993:VVM,
  author =       "E. C. {Loyot, Jr.} and A. S. Grimshaw",
  title =        "{VMPP}: a virtual machine for parallel processing",
  crossref =     "IEEE:1993:PSI",
  pages =        "735--740",
  year =         "1993",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6150C (Compilers,
                 interpreters and other processors); C7430 (Computer
                 engineering)",
  corpsource =   "Dept. of Comput. Sci., Virginia Univ.,
                 Charlottesville, VA, USA",
  keywords =     "front-end translators; parallel languages; parallel
                 processing; parallel source languages; portability;
                 program interpreters; software; virtual machine;
                 virtual machines; VMPP",
  sponsororg =   "IEEE Comput. Soc.; ACM Sigarch",
  treatment =    "P Practical",
}

%%% Conference paper on distributed computing with parallel networking
%%% (keywords mention PVM and PPVM); parent proceedings come from the
%%% crossref to IEEE:1993:PFW.
@InProceedings{Maly:1993:DCP,
  author =       {K. Maly and M. Zubair and S. Kelbar},
  title =        {Distributed computing with parallel networking},
  crossref =     {IEEE:1993:PFW},
  pages =        {375--379},
  year =         {1993},
  bibdate =      {Sun Dec 22 10:19:23 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Comput. Sci. Dept., Old Dominion Univ., Norfolk, VA,
                 USA},
  classification = {B6150M (Protocols); B6210L (Computer
                 communications); C5620L (Local area networks); C5640
                 (Protocols); C5670 (Network performance)},
  keywords =     {Application performance; Communication network;
                 Communication network performance; Dedicated parallel
                 machine; Distributed computing; Ethernet; Parallel
                 networking; Parallel virtual machine environment; PPVM;
                 PVM; Round robin scheduling},
  thesaurus =    {Local area networks; Performance evaluation;
                 Protocols; Scheduling},
}

%%% Journal article (Parallel Computing 19(8), August 1993) comparing
%%% the LINDA and PVM parallel programming environments.  The journal
%%% name and acknowledgement are string macros defined elsewhere in the
%%% file, so they stay undelimited.
@Article{Matrone:1993:LPC,
  author =       {A. Matrone and P. Schiano and V. Puoti},
  title =        {{LINDA} and {PVM}: a comparison between two
                 environments for parallel programming},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {19},
  number =       {8},
  pages =        {949--957},
  month =        aug,
  year =         {1993},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {Short communication.},
  acknowledgement = ack-nhfb,
  affiliation =  {Centro Italiano Ricerche Aerospaziali, Capua, Italy},
  classification = {C6110P (Parallel programming); C6115 (Programming
                 support)},
  corpsource =   {Centro Italiano Ricerche Aerospaziali, Capua, Italy},
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
  keywords =     {Distributed memory machines; distributed memory
                 machines; environments; LINDA; Message passing; message
                 passing; Parallel programming; parallel programming;
                 programming; Programming environments; programming
                 environments; PVM; RISC/6000},
  pubcountry =   {Netherlands},
  thesaurus =    {Parallel programming; Programming environments},
  treatment =    {P Practical},
}

%%% Journal article (Transactions of the American Nuclear Society,
%%% volume 68) on running the MCNP Monte Carlo code on an IBM RS/6000
%%% workstation cluster.  The machine name is one braced group so that
%%% styles neither recase it nor separate "RS/" from "6000".
@Article{McKinney:1993:MMI,
  author =       "G. W. McKinney and J. T. West",
  title =        "Multiprocessing {MCNP} on an {IBM RS\slash 6000}
                 cluster",
  journal =      j-TRANS-AM-NUCL-SOC,
  volume =       "68",
  number =       "pt.A",
  pages =        "212--214",
  year =         "1993",
  CODEN =        "TANSAO",
  ISSN =         "0003-018X",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Los Alamos Nat. Lab., NM, USA",
  classification = "A0250 (Probability theory, stochastic processes, and
                 statistics); A0270 (Computational techniques); A0560
                 (Transport processes: theory); A2820H (Neutron
                 diffusion); A2841C (Computer codes); C7320 (Physics and
                 Chemistry); C7470 (Nuclear engineering)",
  fjournal =     "Transactions of the American Nuclear Society",
  keywords =     "Electron transport; IBM RS/6000 cluster; MCNP; Monte
                 Carlo; Multiuser environment; Neutron transport;
                 Parallel Virtual Machine; Photon transport; PVM
                 version; Reduced Instruction Set Computer; Workstation
                 cluster",
  thesaurus =    "Monte Carlo methods; Neutron transport theory; Nuclear
                 engineering computing; Photon transport theory; Physics
                 computing; Transport processes",
}

%%% Journal article (Supercomputer 10(6), 1993) on a parallel multigrid
%%% method implemented with PVM.  The month value is the literal string
%%% "????" exactly as recorded in the database.
@Article{Michielse:1993:PMU,
  author =       {P. Michielse},
  title =        {Parallel multigrid using {PVM}},
  journal =      j-SUPERCOMPUTER,
  volume =       {10},
  number =       {6},
  pages =        {10--23},
  month =        {????},
  year =         {1993},
  CODEN =        {SPCOEL},
  ISSN =         {0168-7875},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Convex Computer, Utrecht, Netherlands},
  classification = {C4170 (Differential equations); C4240P (Parallel
                 programming and algorithm theory); C5440
                 (Multiprocessor systems and techniques)},
  corpsource =   {Convex Computer, Utrecht, Netherlands},
  fjournal =     {Supercomputer},
  keywords =     {algorithms; Convex; Convex MetaSeries machines;
                 differential equations; distributed memory systems;
                 Distributed memory systems; MetaSeries machines;
                 Parallel; parallel; parallel machines; parallel
                 multigrid method; Parallel multigrid method; Parallel
                 Virtual Machine; PVM; shared memory systems; Shared
                 memory systems; Virtual Machine; virtual machines},
  pubcountry =   {Netherlands},
  thesaurus =    {Differential equations; Distributed memory systems;
                 Parallel algorithms; Parallel machines; Shared memory
                 systems; Virtual machines},
  treatment =    {P Practical},
}

%%% Journal article (Parallel Computing 19(9), September 1993) on the
%%% performance of IBM RISC System/6000 workstation clusters in a
%%% quantum chemistry application.  The product name in the title is
%%% one braced group so that bibliography styles do not recase it.
@Article{Nanayakkara:1993:PIR,
  author =       {A. Nanayakkara and D. Moncrieff and S. Wilson},
  title =        {Performance of {IBM RISC System\slash 6000}
                 workstation clusters in a quantum chemical
                 application},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {19},
  number =       {9},
  pages =        {1053--1062},
  month =        sep,
  year =         {1993},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Sun Dec 22 10:17:40 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Supercomputer Comput. Res. Inst., Florida State Univ.,
                 Tallahassee, FL, USA},
  classification = {C5430 (Microcomputers); C5470 (Performance
                 evaluation and testing); C7320 (Physics and
                 Chemistry)},
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
  keywords =     {Concurrent computation many-body perturbation theory;
                 CRAY Y-MP C-90; Electron correlation energy
                 calculations; IBM RISC System/6000 workstation
                 clusters; NEC SX-3/44 computers; Parallel virtual
                 machine system; Performance},
  pubcountry =   {Netherlands},
  thesaurus =    {Chemistry computing; IBM computers; Performance
                 evaluation; Quantum chemistry; Reduced instruction set
                 computing; Workstations},
}

%%% Magazine article (Parallelogram, volume 53, May--June 1993) about
%%% PVM as public domain software.  The month range is built by
%%% concatenating the standard month macros.  Most keywords are listed
%%% in both a lowercase and a capitalized form.
@Article{Nelson:1993:PPP,
  author =       "M. L. Nelson",
  title =        "{PVM} provides power in the public domain",
  journal =      j-PARALLELOGRAM,
  volume =       "53",
  pages =        "20--21",
  month =        may # "--" # jun,
  year =         "1993",
  CODEN =        "PRALEH",
  ISSN =         "0953-7252",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessor systems and techniques);
                 C6150N (Distributed systems); C7430 (Computer
                 engineering)",
  fjournal =     "Parallelogram",
  keywords =     "de facto standard; De facto standard; distributed
                 computing; Distributed computing; ease-of-use;
                 Ease-of-use; heterogeneous computer network;
                 Heterogeneous computer network; maintenance;
                 Maintenance; message passing; message-passing system;
                 Message-passing system; parallel machines; parallel
                 programming; Parallel virtual machine; performance;
                 Performance; public domain software; Public domain
                 software; PVM; robustness; Robustness; software
                 packages; virtual machines",
  pubcountry =   "UK",
  thesaurus =    "Message passing; Parallel machines; Parallel
                 programming; Public domain software; Software packages;
                 Virtual machines",
  treatment =    "P Practical; R Product Review",
}

%%% Technical report from Cray Research GmbH (15 November 1993)
%%% describing the CRAY T3D massively parallel system.  The bibsource
%%% field lists Parallel/Parallel.io.bib as a second source.  The
%%% comment field mentions PVM as the message-passing layer for C.
@TechReport{Oed:1993:CRM,
  author =       "Wilfried Oed",
  title =        "The {Cray Research} Massively Parallel Processor
                 System {CRAY T3D}",
  institution =  "Cray Research GmbH",
  address =      "M{\"u}nchen, Germany",
  month =        nov # " 15",
  year =         "1993",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 Parallel/Parallel.io.bib",
  comment =      "A MIMD, shared-memory machine, with 2-processor units
                 embedded in a 3-d torus. Each link is bidirectional and
                 runs 300 MB/s. Processors are 150 MHz ALPHA, plus
                 16--64 MB RAM, plus a memory interface unit. Global
                 physical address space with remote-reference and
                 block-transfer capability. Not clear about cache
                 coherency. Separate tree network for global
                 synchronization. Support for message send and optional
                 interrupt. I/O is all done through interface nodes that
                 hook to the YMP host and to its I/O clusters with 400
                 MB/s links. I/O is by default serialized, but they do
                 support a ``broadcast'' read operation (but see
                 pase:t3d-fortran). FORTRAN compiler supports the NUMA
                 shared memory; PVM is used for C and message passing.",
  keywords =     "parallel architecture; shared memory; supercomputer;
                 parallel I/O; pario bib",
}

%%% Journal article (Scientific Programming 2(4), Winter 1993) on
%%% parallel array classes and lightweight sharing mechanisms.  The
%%% issue date is recorded as the season string "Winter" rather than a
%%% month macro.
@Article{Otto:1993:PAC,
  author =       {S. W. Otto},
  title =        {Parallel array classes and lightweight sharing
                 mechanisms},
  journal =      j-SCI-PROG,
  volume =       {2},
  number =       {4},
  pages =        {203--216},
  month =        {Winter},
  year =         {1993},
  CODEN =        {SCIPEV},
  ISSN =         {1058-9244 (print), 1875-919X (electronic)},
  ISSN-L =       {1058-9244},
  bibdate =      {Sun Dec 22 10:19:23 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Comput. Sci. and Eng., Oregon Graduate Inst.
                 of Sci. and Technol., Beaverton, OR, USA},
  classification = {C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C6110J (Object-oriented
                 programming); C6110P (Parallel programming); C6120
                 (File organisation); C6150N (Distributed systems
                 software)},
  fjournal =     {Scientific Programming},
  journal-URL =  {http://iospress.metapress.com/content/1058-9244},
  keywords =     {C++; Collective object; Distributed memory;
                 Distributed memory architectures; Finite difference
                 stencils; Finite element method; Intel NX message
                 passing systems; Interpolation/contraction operations;
                 Lightweight sharing mechanisms; Low level message
                 passing; Meaningful array operations; MetaMP; Multigrid
                 algorithms; Parallel array classes; Particle in cell
                 algorithms; Partitioned array; PVM; Shared memory
                 architectures; Transparent guard strips; Weak memory
                 coherence},
  thesaurus =    {Abstract data types; Distributed memory systems;
                 Message passing; Object-oriented programming; Parallel
                 programming},
}

%%% Conference paper evaluating distributed communication systems (PVM
%%% is among the keywords).  The pages value carries a "vol.2" suffix
%%% after the page range; it is kept verbatim.  Parent proceedings come
%%% from the crossref to Gawman:1993:PCT.
@InProceedings{Parsons:1993:EDC,
  author =       {I. Parsons},
  title =        {Evaluation of distributed communication systems},
  crossref =     {Gawman:1993:PCT},
  pages =        {956--970 vol.2},
  year =         {1993},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Comput. Sci., Alberta Univ., Edmonton, Alta.,
                 Canada},
  classification = {C0310H (Equipment and software evaluation methods);
                 C6110P (Parallel programming); C6115 (Programming
                 support); C6150G (Diagnostic, testing, debugging and
                 evaluating systems); C6150N (Distributed systems
                 software)},
  keywords =     {Balancing act; Communication systems; Concert/C;
                 Distributed communication systems; Distributed parallel
                 programs; Enterprise project; Handcrafted code; ISIS;
                 Network of workstations; NMP; Programming environment;
                 PVM; Software engineers},
  thesaurus =    {Network operating systems; Parallel programming;
                 Program testing; Programming environments; Software
                 selection},
}

%%% Master's thesis (1993) on parametric positron emission tomography
%%% imaging with Parallel Virtual Machine.  School and address are the
%%% inst-UTK string macros defined elsewhere in the file.
@MastersThesis{Patterson:1993:PPE,
  author =       {Christopher S. Patterson},
  title =        {Parametric Positron Emission Tomographic imaging using
                 Parallel Virtual Machine: with an example using
                 Myocardial Blood Flow analysis},
  type =         {M.S. thesis},
  school =       inst-UTK,
  address =      inst-UTK:adr,
  pages =        {x + 132},
  year =         {1993},
  bibdate =      {Mon Jan 15 15:32:53 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  keywords =     {Blood flow --- Measurement.; Tomography, Emission.;
                 Virtual computer systems},
}

%%% Conference paper comparing DCE RPC, DFN-RPC, ONC and PVM, from the
%%% International DCE Workshop (Karlsruhe, 7--8 October 1993).  Each
%%% system name in the title is braced to protect its capitalization.
%%% Parent proceedings come from the crossref to Schill:1993:DOD.
@InProceedings{Rabenseifner:1993:CDR,
  author =       "R. Rabenseifner and A. Schuch",
  title =        "Comparison of {DCE RPC}, {DFN-RPC}, {ONC} and {PVM}",
  crossref =     "Schill:1993:DOD",
  pages =        "39--46",
  year =         "1993",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Rechenzentrum, Stuttgart Univ., Germany",
  classification = "C5670 (Network performance); C6115 (Programming
                 support); C6150N (Distributed systems software); C6150N
                 (Distributed systems)",
  conflocation = "Karlsruhe, Germany; 7--8 Oct. 1993",
  conftitle =    "International DCE Workshop. DCE --- The OSF
                 Distributed Computing Environment Client/Server Model
                 and Beyond",
  corpsource =   "Rechenzentrum, Stuttgart Univ., Germany",
  keywords =     "account; applications; C; C applications; calls;
                 Capability; capability; Computer server; computer
                 server; DCE RPC; DFN-RPC; Early Participation; Early
                 Participation Program; FORTRAN; Fortran applications
                 distribution; Functionality; functionality; German
                 Research Network; German Research Network Society; IBM
                 computers; IBM RS/6000 workstations; message passing;
                 Message passing library; message passing library;
                 network servers; ONC; open systems; OSF Distributed
                 Computing Environment; Parallelization;
                 parallelization; Performance; performance; performance
                 evaluation; Program; PVM; remote procedure; Remote
                 procedure calls; scientific-technical;
                 Scientific-technical applications; Society; software
                 tools; SUN RPC; System programming tool; system
                 programming tool; systems analysis; Unix; UNIX computer
                 network; user-; User-account; workstations",
  pubcountry =   "Germany",
  thesaurus =    "FORTRAN; IBM computers; Message passing; Network
                 servers; Open systems; Performance evaluation; Remote
                 procedure calls; Software tools; Systems analysis;
                 Unix; Workstations",
  treatment =    "P Practical",
}

%%% Paper from the Supercomputing '93 conference (Portland, OR, 15--19
%%% November 1993) on multicast in all-port wormhole-routed hypercubes.
%%% It is recorded as an Article whose journal is the conference
%%% proceedings series, so it carries both an ISBN and an ISSN.  The
%%% month and CODEN values are the literal string "????".
@Article{Robinson:1993:ECD,
  author =       "D. F. Robinson and D. Judd and P. K. McKinley and B.
                 H. C. Cheng",
  title =        "Efficient collective data distribution in all-port
                 wormhole-routed hypercubes",
  journal =      j-PROC-SUPERCOMPUT,
  pages =        "792--801",
  month =        "????",
  year =         "1993",
  CODEN =        "????",
  ISBN =         "0-8186-4340-4",
  ISBN-13 =      "978-0-8186-4340-8",
  ISSN =         "1063-9535",
  LCCN =         "QA76.5 .S894 1993",
  bibdate =      "Fri May 24 09:57:40 MDT 1996",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "This paper addresses the problem of collective data
                 distribution, specifically multicast, in
                 wormhole-routed hypercubes. The system model allows a
                 processor to send and receive data in all dimensions
                 simultaneously. New theoretical results that
                 characterize contention among messages in
                 wormhole-routed hypercubes are developed and used to
                 design new multicast routing algorithms. The algorithms
                 are compared in terms of the number of steps required
                 in each, their measured execution times when
                 implemented on a relatively small-scale nCUBE-2, and
                 their simulated execution times on larger hypercubes.",
  acknowledgement = ack-nhfb,
  affiliation =  "Michigan State Univ",
  affiliationaddress = "East Lansing, MI, USA",
  classification = "723",
  conference =   "Proceedings of the Supercomputing '93 Conference",
  conferenceyear = "1993",
  fjournal =     "Proceedings of the Supercomputing Conference",
  journalabr =   "Proc Supercomputing Conf",
  keywords =     "Algorithms; Message passing interface (MPI); Multicast
                 routing algorithms; Parallel processing systems;
                 Small-scale nCUBE-2; Wormhole-routed hypercubes",
  meetingaddress = "Portland, OR, USA",
  meetingdate =  "Nov 15--19 1993",
  meetingdate2 = "11/15--19/93",
  publisherinfo = "Computer Society Press",
  sponsor =      "IEEE Computer Society; ACM SIGARCH",
}

%%% Master's thesis (1993) on the design and performance of a queue
%%% manager (QM) for PVM; the series field records it as technical
%%% report CS-93-196.  The school value prefixes the department name to
%%% the inst-UTK string macro by concatenation.
@MastersThesis{Sept:1993:DIP,
  author =       "Doug Sept",
  title =        "The design, implementation and performance of a queue
                 manager for {PVM}",
  type =         "M.S. thesis",
  school =       "Computer Science Department, " # inst-UTK,
  address =      inst-UTK:adr,
  pages =        "viii + 45",
  year =         "1993",
  bibdate =      "Mon Jan 15 18:16:36 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Technical report CS-93-196: University of Tennessee,
                 Knoxville, Computer Science Department",
  abstract =     "The PVM Queue Manager (QM) application addresses some
                 of the load balancing problems associated with the
                 heterogeneous, multi-user, computing environments for
                 which PVM was designed. In such environments, PVM is
                 not only confronted with the difficulties of
                 distributing tasks among machines of variable loads, it
                 must also contend with machines of varying performance
                 levels in the same virtual machine. The QM addresses
                 both of these problems using two different load
                 balancing techniques, one static, the other dynamic. In
                 its simplest (static) mode, the QM will initiate PVM
                 processes for the user on demand, taking into account
                 information such as the peak megaflops/sec and actual
                 load of each machine. In addition to the initiation of
                 processes, the QM will also accept tasks to be
                 completed by a specified PVM process type. These tasks
                 are shipped to the QM where they are kept in a FIFO
                 queue. Worker processes in the virtual machine send
                 idle messages to the QM when they are ready for a task,
                 and the QM ships a task to the process if there is one
                 (of a type matching the process) in the queue. The QM
                 also maintains a list of idle processes and chooses the
                 best one for the task, should one arrive when several
                 processes are idle. Since faster machines typically
                 send more idle messages (and receive more tasks) than
                 slower ones, this provides a level of dynamic load
                 balancing for the system. Three applications have
                 already been implemented using the QM within PVM: a
                 Mandelbrot image generator, a conjugate-gradient
                 algorithm, and a map analysis program used in landscape
                 ecology applications. Benchmarks of elapsed wall-clock
                 time comparing standard PVM versions with the QM-based
                 versions demonstrate substantial performance gains for
                 both methods of load balancing. When processing a 1000
                 x 1000 image, for example, the QM-based Mandelbrot
                 application averaged 63.92 seconds, compared to 139.62
                 seconds for the standard PVM version in a heterogeneous
                 [sic] network of five workstations (comprised of Sun4's
                 and IBM RS/6000).",
  acknowledgement = ack-nhfb,
  keywords =     "Parallel computers.; Queuing theory; Virtual computer
                 systems",
}

%%% Conference paper on distributed molecular dynamics with the PVM
%%% system; parent proceedings come from the crossref to
%%% Sincovec:1993:SCP.
@InProceedings{Simonsen:1993:DMD,
  author =       {H. H. Simonsen and J. Amundsen},
  title =        {Distributed Molecular Dynamics Using the {PVM}
                 System},
  crossref =     {Sincovec:1993:SCP},
  pages =        {183--186},
  year =         {1993},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% Conference paper on scalable libraries in a heterogeneous
%%% environment (keywords mention the MPI standard and Zipcode); parent
%%% proceedings come from the crossref to IEEE:1993:PIS.
@InProceedings{Skjellum:1993:SLH,
  author =       {A. Skjellum},
  title =        {Scalable libraries in a heterogeneous environment},
  crossref =     {IEEE:1993:PIS},
  pages =        {13--20},
  year =         {1993},
  bibdate =      {Sun Dec 22 10:17:40 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Comput. Sci., Mississippi State Univ., MS,
                 USA},
  classification = {C5440 (Multiprocessor systems and techniques); C6155
                 (Computer communications software)},
  keywords =     {Communicating processes; Communication contexts;
                 Heterogeneous environment; Heterogeneous network
                 environment; Message-passing features; MPI standard;
                 Multicomputer libraries; Multicomputer toolbox
                 first-generation scalable libraries; Scalable
                 libraries; User program; Zipcode},
  thesaurus =    {Computer communications software; Message passing;
                 Multiprocessing systems},
}

%%% Single-page item in ACM SIGPLAN Notices 28(1), January 1993, on
%%% dynamic scheduling of irregular parallel computations in
%%% heterogeneous distributed systems (PVM environment per keywords).
%%% The fjournal value contains a discretionary hyphen in "SIGPLAN".
@Article{Smith:1993:DSI,
  author =       {S. L. Smith},
  title =        {Dynamic scheduling of irregularly structured parallel
                 computations in heterogeneous distributed systems},
  journal =      j-SIGPLAN,
  volume =       {28},
  number =       {1},
  pages =        {86},
  month =        jan,
  year =         {1993},
  CODEN =        {SINODQ},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Sun Dec 22 10:17:40 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {CERFACS, Toulouse, France},
  classification = {C6150N (Distributed systems)},
  fjournal =     {ACM SIG{\-}PLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  keywords =     {Dynamic centralized scheduling; Global optimization;
                 Heterogeneous computing environments; Heterogeneous
                 distributed systems; Irregularly structured parallel
                 computations; Parallel algorithm; Parallel virtual
                 machine; Performance evaluation; PVM environment;
                 Simulation},
  thesaurus =    {Distributed processing; Parallel programming;
                 Scheduling},
}

%%% Conference paper by K. A. Smith on a multi-processor PVM
%%% application; parent proceedings come from the crossref to
%%% Sincovec:1993:SCP.
%%% NOTE(review): the title reads as if a word may be missing after
%%% "Accident" -- confirm against the proceedings.
@InProceedings{Smith:1993:MBA,
  author =       {K. A. Smith},
  title =        {Multi-Processor Based Accident Using {PVM}},
  crossref =     {Sincovec:1993:SCP},
  pages =        {262--265},
  year =         {1993},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% Conference paper on distributed computation of wave propagation
%%% models with PVM; parent proceedings come from the crossref to
%%% IEEE:1993:PSP.
@InProceedings{Sochacki:1993:DCW,
  author =       {J. S. Sochacki and D. Mitchum and P. O'Leary and R. E.
                 Ewing},
  title =        {Distributed Computation of Wave Propagation Models
                 Using {PVM}},
  crossref =     {IEEE:1993:PSP},
  pages =        {22--33},
  year =         {1993},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% Conference paper by V. Sunderam on the PVM concurrent computing
%%% system; parent proceedings come from the crossref to
%%% Anonymous:1993:CDP.
@InProceedings{Sunderam:1993:PCC,
  author =       {V. Sunderam},
  title =        {The {PVM} Concurrent Computing System},
  crossref =     {Anonymous:1993:CDP},
  pages =        {20--84},
  year =         {1993},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% Journal article (Supercomputer 10(4--5), July--September 1993) on a
%%% PVM implementation of a Generalized Red Black algorithm.  The
%%% author's surname "van der Pas" is braced so it is handled as a
%%% single name token.  Issue and month ranges use the en-dash form.
@Article{vanderPas:1993:PIG,
  author =       "R. {van der Pas}",
  title =        "The {PVM} implementation of a {Generalized Red Black}
                 algorithm",
  journal =      j-SUPERCOMPUTER,
  volume =       "10",
  number =       "4--5",
  pages =        "72--85",
  month =        jul # "--" # sep,
  year =         "1993",
  CODEN =        "SPCOEL",
  ISSN =         "0168-7875",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Convex Computer, Utrecht, Netherlands",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5440 (Multiprocessor systems and techniques); C5470
                 (Performance evaluation and testing)",
  corpsource =   "Convex Computer, Utrecht, Netherlands",
  fjournal =     "Supercomputer",
  keywords =     "Convex Meta Series; EuroBen benchmark; evaluation;
                 Generalized Red Black algorithm; module MOD3H; Module
                 MOD3H; multiprocessing systems; parallel algorithms;
                 performance; performance measurements; Performance
                 measurements; performance numbers; Performance numbers;
                 Poisson; Poisson solver; PVM implementation; solver",
  pubcountry =   "Netherlands",
  thesaurus =    "Multiprocessing systems; Parallel algorithms;
                 Performance evaluation",
  treatment =    "P Practical",
}

%%% PhD thesis (City University, London, July 1993) on fault tolerance
%%% in Arius, a 64-bit single-address-space distributed operating
%%% system.  The bibsource field lists Misc/sasos.bib as a second
%%% source.
@PhdThesis{Wilkinson:1993:IFT,
  author =       "Timothy James Wilkinson",
  title =        "Implementing Fault Tolerance in a 64-bit Distributed
                 Operating System",
  school =       "Systems Architecture Research Centre, City
                 University",
  address =      "London, UK",
  month =        jul,
  year =         "1993",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 Misc/sasos.bib",
  abstract =     "This thesis explores the potential of 64-bit
                 processors for providing a different style of
                 distributed operating system. Rather than providing
                 another reworking of the UNIX model, the use of the
                 large address space for unifying volatile memory
                 (virtual memory), persistent memory (file systems) and
                 distributed network access is examined and a novel
                 operating system, Arius, is proposed. The concepts
                 behind the design of Arius are briefly reviewed, and
                 then the reliability of such a system is examined in
                 detail. The unified nature of the architecture makes it
                 possible to use a reliable single address space to
                 provide a completely reliable system without the
                 addition of other mechanisms. Protocols are proposed to
                 provide locally scalable distributed shared memory and
                 these are then augmented to handle machine failures
                 transparently through the use of distributed checkpoints
                 and rollback. The checkpointing system makes use of the
                 caching mechanism in DSM to provide data duplication
                 for failure recovery. By using distributed memory for
                 checkpoints, recovery from machine faults may be
                 handled seamlessly. To cope with more ``complete''
                 failures, persistent storage is also included in the
                 failure mechanism. These protocols are modelled to show
                 their operability and to determine the cost they incur
                 in various types of parallel and serial programs.
                 Results are presented to demonstrate these costs.",
}

%%% Conference paper evaluating network programming environments (the
%%% keywords list EXPRESS, ISIS, LINDA, PVM and TCGMSG).  The pages
%%% value carries a "(vol. 2)" suffix after the page range; it is kept
%%% verbatim.  Parent proceedings come from crossref Mudge:1993:PTS.
@InProceedings{Young:1993:PEN,
  author =       {Y.-H. Young and K. Sikorski},
  title =        {Performance evaluation of network programming
                 environments},
  crossref =     {Mudge:1993:PTS},
  pages =        {106--107 (vol. 2)},
  year =         {1993},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Comput. Sci., Utah Univ., Salt Lake City, UT,
                 USA},
  classification = {C5620 (Computer networks and techniques); C5670
                 (Network performance); C6115 (Programming support);
                 C6150G (Diagnostic, testing, debugging and evaluating
                 systems)},
  keywords =     {Benchmark tests; EXPRESS; ISIS; Jacobi iterative
                 algorithms; Library support; LINDA; Message passing;
                 Monte Carlo simulation; Network programming
                 environments; Performance evaluation; PVM; Scalability;
                 TCGMSG; TCP/IP network protocol; Token ring network;
                 UDP/IP network protocol},
  thesaurus =    {Computer networks; Message passing; Monte Carlo
                 methods; Performance evaluation; Programming
                 environments; Protocols},
}

@InProceedings{Zollweg:1993:OP,
  author =       "J. A. Zollweg",
  title =        "Overview of {PVM}",
  crossref =     "Anonymous:1993:PSE",
  pages =        "981--986",
  year =         "1993",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Cornell Nat. Supercomput. Facility, NY, USA",
  classification = "C5640 (Protocols); C6110P (Parallel programming);
                 C6150N (Distributed systems software); C6150N
                 (Distributed systems); C7320 (Physics and chemistry
                 computing); C7320 (Physics and Chemistry)",
  corpsource =   "Cornell Nat. Supercomput. Facility, NY, USA",
  keywords =     "frequent communication; Frequent communication; high
                 performance switch; High performance switch; message
                 passing; message-passing environment; Message-passing
                 environment; parallel programming; Parallel Virtual
                 Machine; physics computing; protocols; PVM package;
                 scalable POWERparallel system; Scalable POWERparallel
                 system; scientific application; Scientific
                 application; software packages; TCP/IP communication;
                 virtual machines; workstations; Workstations",
  pubcountry =   "Switzerland",
  thesaurus =    "Message passing; Parallel programming; Physics
                 computing; Protocols; Software packages; Virtual
                 machines",
  treatment =    "G General Review; P Practical",
}

@InProceedings{Altas:1994:NIE,
  author =       "I. Altas and M. Rezny and J. Louis and K. Burrage and
                 R. Moore and J. Belward",
  title =        "A new image enhancement algorithm on {MasPar} and
                 {Parallel Virtual Machine} ({PVM}) environments",
  crossref =     "Dekker:1994:MPP",
  pages =        "819--826",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Centre for Image Anal., Charles Sturt Univ., Wagga
                 Wagga, NSW, Australia",
  classification = "C4170 (Differential equations); C4240P (Parallel
                 programming and algorithm theory); C5260B (Computer
                 vision and image processing techniques); C6110P
                 (Parallel programming)",
  corpsource =   "Centre for Image Anal., Charles Sturt Univ., Wagga
                 Wagga, NSW, Australia",
  keywords =     "computation environment; equations; fine grain; Fine
                 grain computation environment; image enhancement; image
                 enhancement algorithm; Image enhancement algorithm;
                 Machine; MasPar; minimisation; optimal image
                 enhancement; Optimal image enhancement; parallel
                 algorithms; Parallel Virtual; Parallel Virtual Machine;
                 partial differential; partial differential equations;
                 Partial differential equations; processing time;
                 Processing time; variational; Variational
                 minimisation",
  pubcountry =   "Netherlands",
  sponsororg =   "AKZO NOBEL; BSO; Convex Comput.; HPCN projects; IBM;
                 NOWESP; et al",
  thesaurus =    "Image enhancement; Parallel algorithms; Partial
                 differential equations",
  treatment =    "T Theoretical or Mathematical",
}

@InProceedings{Alund:1994:CFD,
  author =       "A. {\AA}lund and P. L{\"o}tstedt and R. Ryd{\'e}n",
  title =        "Computational fluid dynamics on workstation clusters
                 in industrial environments",
  crossref =     "Dongarra:1994:PSC",
  pages =        "1--10",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Swedish Inst. of Appl. Math., G{\"o}teborg, Sweden",
  classification = "C4100 (Numerical analysis); C5620L (Local area
                 networks); C6110P (Parallel programming); C6150N
                 (Distributed systems software); C7460 (Aerospace
                 engineering computing)",
  keywords =     "3D Navier--Stokes code; ABB Corporate Research;
                 Combustion chambers; Compressible flow; Computational
                 fluid dynamics; CRAY Y-MP processor; Ethernet;
                 Industrial environments; Industrial production codes;
                 Multigrid method; Numerical simulations;
                 Parallelisation; PVM message passing system; SAAB
                 Military Aircraft; SGI R4000 workstations; Stationary
                 Euler equations; Stationary Navier--Stokes equations;
                 Swedish Institute of Applied Mathematics; Turbulent
                 flow; Volvo Flygmotor; Workstation clusters",
  thesaurus =    "Aerospace computing; Chemically reactive flow;
                 Combustion; Compressible flow; Engineering
                 workstations; Flow simulation; Local area networks;
                 Navier--Stokes equations; Numerical analysis; Parallel
                 programming; Turbulence",
}

@InProceedings{Amato:1994:PEP,
  author =       "M. Amato and A. Matrone and P. Schiano",
  title =        "A practical experience in parallelizing a large {CFD}
                 code: the {ENSOLV} flow solver",
  crossref =     "Gentzsch:1994:HPC",
  pages =        "508--513",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Centro Italiano Ricerche Aerospaziali, Capua, Italy",
  classification = "A4710 (General fluid dynamics theory, simulation and
                 other computational methods); C5440 (Multiprocessing
                 systems); C6110P (Parallel programming); C6140D (High
                 level languages); C6150N (Distributed systems
                 software); C7320 (Physics and chemistry computing)",
  keywords =     "30000-Fortran-statements code; 3D Thin Layer
                 Navier--Stokes code; Complex aerodynamic configuration;
                 Computational fluid dynamics; Data transmission; ENSOLV
                 flow solver; Large CFD code; Message passing; MIMD
                 machines; Multidisciplinary group; Parallel
                 architectures; Parallel implementation; Practical
                 experience; PVM; Subsonic/transonic flow",
  thesaurus =    "Aerodynamics; FORTRAN; Message passing; Navier--Stokes
                 equations; Parallel machines; Parallel programming;
                 Physics computing",
}

@InProceedings{Andersen:1994:PIA,
  author =       "B. S. Andersen and P. Kaae and C. Keable and W.
                 Owczarz and J. Wa{\'s}niewski and Z. Zlatev",
  title =        "{PVM} Implementations of Advection-Chemistry Modules
                 of Air Pollution Models",
  crossref =     "Dongarra:1994:PSC",
  pages =        "11--16",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Tech. Univ. Denmark, Lyngby, Denmark",
  classification = "C6110P (Parallel programming); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C6150N
                 (Distributed systems software); C7320 (Physics and
                 chemistry computing); C7340 (Geophysics computing)",
  corpsource =   "Tech. Univ. Denmark, Lyngby, Denmark",
  keywords =     "advection-chemistry modules; Advection-chemistry
                 modules; air pollution; air pollution models; Air
                 pollution models; Air pollution reduction; atmospheric
                 chemistry; chemical transformations; Chemical
                 transformations; chemistry computing; environmental
                 science computing; evaluation; geophysical fluid
                 dynamics; implementations; mathematical models;
                 Mathematical models; module testing; Module testing;
                 parallel programming; performance; Performance; program
                 testing; PVM; PVM implementations; PVM program;
                 reduction; software performance; transport; Transport;
                 virtual machines; wind; Wind",
  pubcountry =   "Germany",
  sponsororg =   "Danish Comput. Centre for Res. and Educ.; Inst. Math.
                 Modelling; Tech. Univ. Denmark",
  thesaurus =    "Air pollution; Atmospheric chemistry; Chemistry
                 computing; Environmental science computing; Geophysical
                 fluid dynamics; Parallel programming; Program testing;
                 Software performance evaluation; Virtual machines;
                 Wind",
  treatment =    "P Practical",
}

@InProceedings{Anonymous:1994:ALM,
  author =       "Anonymous",
  title =        "Adaptive Load Migration Systems for {PVM}",
  crossref =     "IEEE:1994:PSW",
  pages =        "390--399",
  year =         "1994",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Anonymous:1994:MMP,
  author =       "Anonymous",
  title =        "{MPI}: a message-passing interface standard",
  journal =      j-IJSAHPC,
  volume =       "8",
  number =       "3/4",
  pages =        "159--416",
  month =        "Fall-Winter",
  year =         "1994",
  CODEN =        "IJSAE9",
  ISSN =         "0890-2720",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Message passing is a paradigm used widely on certain
                 classes of parallel machines, especially those with
                 distributed memory. Although there are many variations,
                 the basic concept of processes communicating through
                 messages is well understood. Over the last ten years,
                 substantial progress has been made in casting
                 significant applications in this paradigm. Each vendor
                 has implemented its own variant. More recently, several
                 systems have demonstrated that a message-passing system
                 can be efficiently and portably implemented. A
                 definition of both the syntax and semantics of a core
                 of library routines is thus presented. It will be
                 useful to a wide range of users and efficiently
                 implementable on a wide range of computers.",
  acknowledgement = ack-nhfb,
  classification = "722.2; 722.3; 722.4; 723.1; 723.1.1; C5440
                 (Multiprocessing systems); C6110P (Parallel
                 programming); C6140D (High level languages); C6150N
                 (Distributed systems software)",
  fjournal =     "International Journal of Supercomputer Applications
                 and High Performance Computing",
  keywords =     "C; C (programming language); C language; Codes
                 (standards); Collective communication; collective
                 communication; Computational linguistics; Computer
                 software; Conventions; conventions; Data communication
                 systems; FORTRAN; FORTRAN (programming language);
                 Fortran 77; Fortran bindings; Interfaces (computer);
                 Language binding; language binding; message passing;
                 Message passing interface; Message Passing Interface;
                 Message-passing interface standard; message-passing
                 interface standard; MPI environmental management; MPI
                 function; MPI function index; MPI terms; Name-shifting
                 convention; name-shifting convention; Parallel
                 machines; Parallel processing systems; parallel
                 programming; Point to point communication;
                 Point-to-point communication; point-to-point
                 communication; Process group collective communication
                 operations; process group collective communication
                 operations; Process topologies; process topologies;
                 Profiling interface; profiling interface; Programmer;
                 programmer; Standardization; standards; Topological
                 structures; topological structures; Unique
                 communication contexts; unique communication contexts;
                 Utility functions; utility functions",
  thesaurus =    "C language; FORTRAN; Message passing; Parallel
                 programming; Standards",
  treatment =    "P Practical",
}

@InProceedings{Antonuccio-Delogu:1994:PTN,
  author =       "V. Antonuccio-Delogu and U. Becciani",
  title =        "A parallel tree {N-body} code for heterogeneous
                 clusters",
  crossref =     "Dongarra:1994:PSC",
  pages =        "17--32",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Catania Astrophys. Obs., Italy",
  classification = "C1160 (Combinatorial mathematics); C4240P (Parallel
                 programming and algorithm theory); C5620L (Local area
                 networks); C6110B (Software engineering techniques);
                 C6110P (Parallel programming); C6150N (Distributed
                 systems software); C7320 (Physics and chemistry
                 computing); C7350 (Astronomy and astrophysics
                 computing)",
  keywords =     "Catania Astrophysical Observatory; Ethernet; F77
                 version; Generic situations; Heterogeneous clusters;
                 Heterogeneous workstation collection; Hypercube
                 communication pattern; Incomplete hypercube;
                 Mini-supercomputer; Orthogonal recursive bisection
                 oct-tree scheme; Parallel Barnes--Hut 3D N-body tree
                 algorithm; Parallel tree N-body code; Parallelization
                 scheme; Processing units; PVM 3.2.5; Software
                 environment; SPMD paradigm",
  thesaurus =    "Astronomy computing; Hypercube networks; Local area
                 networks; Octrees; Parallel algorithms; Parallel
                 programming; Physics computing; Software performance
                 evaluation; Software portability; Virtual machines;
                 Workstations",
}

@Article{Averbuch:1994:PES,
  author =       "A. Averbuch and E. Gabber and S. Itzikowitz and B.
                 Shoham",
  title =        "On the parallel elliptic single\slash multigrid
                 solutions about aligned and nonaligned bodies using the
                 {Virtual Machine for Multiprocessors}",
  journal =      j-SCI-PROG,
  volume =       "3",
  number =       "1",
  pages =        "13--32",
  month =        "Spring",
  year =         "1994",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4170 (Differential equations); C5440
                 (Multiprocessing systems); C6110B (Software engineering
                 techniques); C6110P (Parallel programming); C6150N
                 (Distributed systems software)",
  corpsource =   "Sch. of Math. Sci., Tel Aviv Univ., Israel",
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
  keywords =     "algorithm; aligned bodies; alignment overhead;
                 architectures; coherent services; distributed memory
                 multiprocessor; efficient programming; elliptic
                 equations; explicitly parallel application programs;
                 for Multi-Processors software package; grid points;
                 memory multiprocessors; memory systems; message
                 passing; MIMD; MOS; multi-user shared memory
                 multiprocessors; multiprocessors; nonaligned bodies;
                 parallel architectures; parallel elliptic; parallel
                 elliptic multigrid solutions; parallel program writing;
                 parallel programming; partial differential equations;
                 performance; portable programming; Sequent Symmetry;
                 shared; single grid solution; single-user shared;
                 software packages; software portability; transputer
                 network; transputer systems; Virtual Machine",
  treatment =    "P Practical",
}

@InProceedings{Aversa:1994:PSH,
  author =       "R. Aversa and N. Mazzocca and U. Villano",
  title =        "{PS}: a simulator for heterogeneous computing
                 environments",
  crossref =     "Dekker:1994:MPP",
  pages =        "335--343",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dipartimento di Inf. e Sistemistica, Naples Univ.,
                 Italy",
  classification = "C6150N (Distributed systems software)",
  keywords =     "Distributed applications; Heterogeneous computing
                 environment simulator; Performance analysis;
                 Performance index accuracy; PS; PVM run-time system;
                 PVM Simulator; Simulation environment; Simulator
                 architecture",
  thesaurus =    "Parallel processing; Software performance evaluation",
}

@InProceedings{Bachem:1994:PCT,
  author =       "A. Bachem and W. Hochst{\"a}ttler and M. Malich",
  title =        "Simulated Trading --- a New Parallel Approach For
                 Solving Vehicle Routing Problems",
  crossref =     "Joubert:1994:PCT",
  pages =        "471--475",
  year =         "1994",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 Techreports/ZPR.Koeln.bib",
  annote =       "We present a parallel improvement heuristic for
                 solving vehicle routing problems with additional
                 constraints. The algorithm was implemented on a
                 parallel transputer machine and on a cluster of
                 workstations using PVM. The computational results
                 obtained with sequential and parallel Simulated Trading
                 show that our approach is superior compared to all
                 heuristics known to the authors by now.",
  crindex =      "29k,6,zpr92-125.ps.gz",
}

@Article{Bala:1994:IEU,
  author =       "V. Bala and J. Bruck and R. Bryant and R. Cypher and
                 P. {De Jong}",
  title =        "The {IBM} external user interface for scalable
                 parallel systems",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "20",
  number =       "4",
  pages =        "445--??",
  month =        apr,
  year =         "1994",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@InProceedings{Ballico:1994:PSP,
  author =       "M. Ballico and H. Lederer",
  title =        "{Plasmafusionsforschung: Serielles und paralleles
                 Rechnen mit nur einem Programmcode auf Cray YMP,
                 nCUBE2, Workstations mit PVM und KSR1}",
  crossref =     "Anonymous:1994:FWR",
  pages =        "232--234",
  year =         "1994",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Baltas:1994:CPC,
  author =       "N. D. Baltas and C. S. van den Berghe",
  title =        "Comparison of the porting of a computational fluid
                 dynamics application to {SIMD} and {MIMD} computers",
  crossref =     "Dekker:1994:MPP",
  pages =        "761--767",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "CHAM, London, UK",
  classification = "C6110B (Software engineering techniques); C6150N
                 (Distributed systems software); C7320 (Physics and
                 chemistry computing)",
  keywords =     "Computational fluid dynamics; DAP; ESPRIT III project;
                 Fortran-Plus; Maintainability; Massively parallel
                 architectures; Massively parallel computers;
                 Message-passing libraries; MIMD; MIMD Parsytec;
                 Parallel Software-Hardware Application; PARIX; PARMACS;
                 Parsytec model; PASHA; PHOENICS; Porting; Programming
                 models; PVM; Scalable code; SIMD; SIMD DAP",
  thesaurus =    "Message passing; Parallel architectures; Parallel
                 programming; Physics computing; Research initiatives;
                 Software maintenance; Software portability",
}

@InProceedings{Beguelin:1994:CMS,
  author =       "A. Beguelin and B. Bruegge",
  title =        "A configurable monitoring system for parallel
                 programming",
  crossref =     "IEEE:1994:PSI",
  pages =        "206",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  classification = "C0310F (Software development management); C5440
                 (Multiprocessor systems and techniques); C6110B
                 (Software engineering techniques); C6110P (Parallel
                 programming); C6150G (Diagnostic, testing, debugging
                 and evaluating systems)",
  keywords =     "BEE++; Configurable distributed monitoring system;
                 Distributed programs; Heterogeneous systems; Message
                 passing system; Parallel program debugging; Parallel
                 programming; Parallel virtual machine; PVM",
  thesaurus =    "Configuration management; Message passing; Parallel
                 programming; Program debugging; System monitoring",
}

@Article{Beguelin:1994:HHN,
  author =       "A. Beguelin and J. J. Dongarra and G. Al Geist and R.
                 Manchek and K. Moore",
  title =        "{HeNCE}: a heterogeneous network computing
                 environment",
  journal =      j-SCI-PROG,
  volume =       "3",
  number =       "1",
  pages =        "49--60",
  month =        "Spring",
  year =         "1994",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  classification = "C1160 (Combinatorial mathematics); C5620L (Local
                 area networks); C6110P (Parallel programming); C6115
                 (Programming support); C6150C (Compilers, interpreters
                 and other processors); C6150G (Diagnostic, testing,
                 debugging and evaluating systems); C6150N (Distributed
                 systems software)",
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
  keywords =     "Aggregate resources; Compilation; Data dependencies;
                 Data formats; Debugging; Directed graphs; Execution;
                 Graph nodes; Graphical language; HeNCE; Heterogeneous
                 network computing environment; Integrated graphical
                 environment; Local area network; Network computation;
                 Networked computers; Operating systems; Parallel
                 programs; Parallel virtual machine; Parallelism;
                 Supercomputer performance; Tracing; Writing",
  thesaurus =    "Directed graphs; Local area networks; Parallel
                 programming; Parallelising compilers; Program
                 debugging; Virtual machines",
}

@InProceedings{Beletsky:1994:OPV,
  author =       "V. Beletsky and T. Popova and A. Chemeris",
  title =        "Organization of a parallel virtual machine",
  crossref =     "Horiguchi:1994:ISP",
  pages =        "421--426",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C6150N (Distributed systems
                 software); C7430 (Computer engineering)",
  corpsource =   "Lab. of Parallel Comput., Acad. of Sci., Kiev,
                 Ukraine",
  keywords =     "compilers; dependence graph building; loop
                 parallelization; parallel architectures; parallel
                 machines; parallel virtual machine organization;
                 processor; program compilers; scheduling; scheduling
                 job programs; simulating programs; simulation; virtual
                 machines",
  sponsororg =   "Japan Advanced Inst. Sci. and Technol.; IEEE Comput.
                 Soc.; IEEE Comput. Soc. Tech. Committee on Comput.
                 Archit.; IEEE Comput. Soc. Tech. Committee on Parallel
                 Process.; IPSJ Tech. Committee on Algorithms; IPSJ
                 Tech. Committee on Comput. Archit.; IEICE Tech.
                 Committee on Comput. Syst",
  treatment =    "P Practical",
}

@MastersThesis{Biradar:1994:ADL,
  author =       "Umesh V. Biradar",
  title =        "Adaptive distributed load balancing model for parallel
                 virtual machine",
  type =         "Master of Science in Computer Science",
  school =       "Department of Computer Science, College of
                 Engineering, Lamar University",
  address =      "Beaumont, TX, USA",
  pages =        "viii + 44",
  year =         "1994",
  bibdate =      "Mon Jan 15 18:16:39 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Distributed operating systems (Computers); Electronic
                 data processing --- Distributed processing;
                 Multiprocessors; Parallel processing (Electronic
                 computers)",
}

@TechReport{Bischof:1994:CSM,
  author =       "Christian Bischof and {Institute for Defense
                 Analyses}",
  title =        "A Case study of {MPI}: portable and efficient
                 libraries",
  type =         "Technical report",
  number =       "SRC-TR-94-130",
  institution =  "Supercomputing Research Center: IDA",
  address =      "Lanham, MD, USA",
  pages =        "6",
  year =         "1994",
  bibdate =      "Sat Feb 24 09:43:12 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "``In this paper, we discuss the performance achieved
                 by several implementations of the recently defined
                 Message Passing Interface (MPI) standard. In
                 particular, performance results for different
                 implementations of the broadcast operation are analyzed
                 and compared on the Delta, Paragon, SP1 and CM5.''
                 Supported in part by the Applied and Computational
                 Mathematics Program, Advanced Research Projects Agency.
                 Supported in part by the Office of Scientific
                 Computing, U.S. Department of Energy.",
  acknowledgement = ack-nhfb,
  annote =       "This paper will appear in the proceedings of the
                 Seventh SIAM conference on Parallel Processing for
                 Scientific Computing, September 15, 1994.",
  keywords =     "Parallel processing (Electronic computers)",
}

@InProceedings{Boerger:1994:FSP,
  author =       "E. Boerger and U. Glaesser",
  title =        "A Formal Specification of the {PVM} Architecture",
  crossref =     "Pehrson:1994:IPP",
  pages =        "402--409",
  year =         "1994",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Borger:1994:AMP,
  author =       "E. B{\"o}rger and U. Gl{\"a}sser",
  title =        "An abstract model of the {Parallel Virtual Machine}
                 ({PVM})",
  crossref =     "Anonymous:1994:PDC",
  pages =        "308--309",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C6150N (Distributed
                 systems software)",
  corpsource =   "Dipartimento di Inf., Pisa Univ., Italy",
  keywords =     "abstract model; abstraction level; algebraic
                 specification; architecture; distributed memory
                 computer; distributed memory systems; formal;
                 heterogeneous distributed computing; message passing;
                 operating systems (computers); Parallel Virtual
                 Machine; PVM; specification; user view; virtual
                 machines",
  sponsororg =   "Int. Soc. Comput. and Their Appl.-ISCA; IEEE; Nat.
                 Supercomput. Centre for Energy and Environ.; Northern
                 Telecom; CRAY Res",
  treatment =    "P Practical",
}

@Article{Borger:1994:FSP,
  author =       "E. B{\"o}rger and U. Gl{\"a}sser",
  title =        "A formal specification of the {PVM} architecture",
  journal =      j-IFIP-TRANS-A,
  volume =       "A-51",
  pages =        "402--409",
  month =        "????",
  year =         "1994",
  CODEN =        "ITATEC",
  ISSN =         "0926-5473",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dipartimento di Inf., Pisa Univ., Italy",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5440 (Multiprocessing systems); C5440 (Multiprocessor
                 systems and techniques); C6110B (Software engineering
                 techniques); C6150N (Distributed systems software);
                 C6150N (Distributed systems)",
  conflocation = "Hamburg, Germany; 28 Aug.-2 Sept. 1994",
  conftitle =    "Technology and Foundations Information Processing '94.
                 IFIP 13th World Computer Congress",
  corpsource =   "Dipartimento di Inf., Pisa Univ., Italy",
  fjournal =     "IFIP Transactions. A. Computer Science and
                 Technology",
  keywords =     "concurrent evolving algebras; Concurrent evolving
                 algebras; data structures; Data structures; distributed
                 computing; distributed processing; formal
                 specification; Formal specification; heterogeneous;
                 Heterogeneous distributed computing; Machine; message;
                 message passing; Message passing; parallel machines;
                 Parallel Virtual; Parallel Virtual Machine; passing;
                 PVM architecture; virtual machines",
  pubcountry =   "Netherlands",
  thesaurus =    "Distributed processing; Formal specification; Message
                 passing; Parallel machines; Virtual machines",
  treatment =    "P Practical",
}

@InProceedings{Boryczko:1994:LGA,
  author =       "K. Boryczko and M. Bubak and J. Kitowski and J.
                 Mo{\'s}ci{\'n}ski and R. S{\l}ota",
  title =        "Lattice gas automata and molecular dynamics on a
                 network of computers",
  crossref =     "Gentzsch:1994:HPC",
  pages =        "177--180",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. Inf., Krakow, Poland",
  classification = "A0340G (Fluid dynamics: general mathematical
                 aspects); A0550 (Lattice theory and statistics; Ising
                 problems); A4710 (General fluid dynamics theory,
                 simulation and other computational methods); C4240P
                 (Parallel programming and algorithm theory); C5620L
                 (Local area networks); C6110P (Parallel programming);
                 C7320 (Physics and chemistry computing)",
  keywords =     "Computer network; CONVEX C3210; Express; Fluid flow
                 simulation; Lattice gas automata; Molecular dynamics;
                 Network Linda; P4; Parallel algorithms; Parallel
                 programs; PVM; Workstation network",
  thesaurus =    "Automata theory; Digital simulation; Flow simulation;
                 Hydrodynamics; Lattice gas; Local area networks;
                 Parallel algorithms; Parallel programming; Physics
                 computing; Workstations",
}

@InProceedings{Briley:1994:NNH,
  author =       {W. R. Briley and D. S. Reese and A. Skjellum and L. H.
                 Turcotte},
  title =        {{NHPDCC}: The {National High Performance Distributed
                 Computing Consortium}},
  crossref =     {IEEE:1994:PSP},
  pages =        {2--9},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {NSF Eng. Res. Center for Comput. Field Simulation,
                 Mississippi State Univ., MS, USA},
  classification = {C0200 (General computer topics); C6150N (Distributed
                 systems software)},
  keywords =     {Benchmarks; Consortium; High performance computing;
                 MPI message-passing; Multi-vendor; National High
                 Performance Distributed Computing Consortium; NHPDCC;
                 Scalable parallel libraries; Software projects},
  thesaurus =    {Distributed processing; Societies},
}

@InProceedings{Bubak:1994:EMD,
  author =       "M. Bubak and J. Mo{\'s}ci{\'n}ski and M. Pogoda and W.
                 Zdechlikiewicz",
  title =        "Efficient molecular dynamics simulation on networked
                 workstations",
  crossref =     "Gruber:1994:PJE",
  pages =        "191--194",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. of Comput. Sci, Cracow, Poland",
  classification = "A0260 (Numerical approximation and analysis); A6120J
                 (Computer simulation of static and dynamic liquid
                 behaviour); C4240C (Computational complexity); C4240P
                 (Parallel programming and algorithm theory); C6110P
                 (Parallel programming); C6150N (Distributed systems
                 software); C6185 (Simulation techniques); C7320
                 (Physics and chemistry computing)",
  keywords =     "Computational complexity; Efficient molecular dynamics
                 simulation; Large particle numbers; Link lists;
                 Neighbor lists; Networked workstations; Parallel 2D
                 molecular dynamics program; Parallel MD algorithm; PVM
                 programming environment; Sequential 2D molecular
                 dynamics program",
  thesaurus =    "Computational complexity; Digital simulation; List
                 processing; Local area networks; Molecular dynamics
                 method; Parallel algorithms; Parallel programming;
                 Physics computing; Workstations",
}

@InProceedings{Bubak:1994:FLG,
  author =       "M. Bubak and J. Mo{\'s}ci{\'n}ski and R. S{\l}ota",
  title =        "{FHP} lattice gas on networked workstations",
  crossref =     "Gruber:1994:PJE",
  pages =        "427--430",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. of Control Sci., Cracow, Poland",
  classification = "A0550 (Lattice theory and statistics; Ising
                 problems); A4710 (General fluid dynamics theory,
                 simulation and other computational methods); C4240P
                 (Parallel programming and algorithm theory); C6110P
                 (Parallel programming); C6150N (Distributed systems
                 software); C6185 (Simulation techniques); C7320
                 (Physics and chemistry computing)",
  keywords =     "Computer power; Domain decomposition; Dynamic load
                 balancing; FHP lattice gas; Fluid flows; Lattice gas
                 simulation; Networked workstations; Normal load;
                 Parallel algorithm; Parallel distributed program; PVM",
  thesaurus =    "Digital simulation; Flow simulation; Fluid dynamics;
                 Lattice gas; Local area networks; Operating systems
                 [computers]; Parallel algorithms; Parallel programming;
                 Physics computing; Resource allocation; Workstations",
}

@InProceedings{Bubak:1994:IPL,
  author =       "M. Bubak and J. Mo{\'s}ci{\'n}ski and R. S{\l}ota",
  title =        "Implementation of Parallel Lattice Gas Program on
                 Workstations under {PVM}",
  crossref =     "Dongarra:1994:PSC",
  pages =        "136--146",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. of Comput. Sci, Akademia Gorniczo-Hutnicza,
                 Cracow, Poland",
  classification = "A0270 (Computational techniques); A0545 (Theory and
                 models of chaotic systems); A0550 (Lattice theory and
                 statistics; Ising problems); A4710 (General fluid
                 dynamics theory, simulation and other computational
                 methods); C4220 (Automata theory); C4240P (Parallel
                 programming and algorithm theory); C6110P (Parallel
                 programming); C7320 (Physics and chemistry computing)",
  conflocation = "Lyngby, Denmark; 20-23 June 1994",
  conftitle =    "Proceedings of Workshop on Parallel Scientific
                 Computing",
  corpsource =   "Inst. of Comput. Sci, Akademia Gorniczo-Hutnicza,
                 Cracow, Poland",
  keywords =     "algorithms; balancing; cellular automata; domain
                 decomposition; Domain decomposition; dynamic load;
                 Dynamic load balancing; flow simulation; fluid flow
                 simulation; Fluid flow simulation; lattice gas; lattice
                 gas automata program; Lattice gas automata program;
                 parallel; parallel lattice gas program; Parallel
                 lattice gas program; physics computing",
  pubcountry =   "Germany",
  sponsororg =   "Danish Comput. Centre for Res. and Educ.; Inst. Math.
                 Modelling; Tech. Univ. Denmark",
  thesaurus =    "Cellular automata; Flow simulation; Lattice gas;
                 Parallel algorithms; Physics computing",
  treatment =    "T Theoretical or Mathematical",
}

@InProceedings{Bubak:1994:PDS,
  author =       "M. Bubak and J. Mo{\'s}ci{\'n}ski and M. Pogoda and W.
                 Zdechlikiewicz",
  title =        "Parallel distributed {2-D} short-range molecular
                 dynamics on networked workstations",
  crossref =     "Dongarra:1994:PSC",
  pages =        "127--135",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. of Comput. Sci, Akademia Gorniczo-Hutnicza,
                 Cracow, Poland",
  classification = "A0260 (Numerical approximation and analysis); A0270
                 (Computational techniques); A6120J (Computer simulation
                 of static and dynamic liquid behaviour); C4240C
                 (Computational complexity); C4240P (Parallel
                 programming and algorithm theory); C6110P (Parallel
                 programming); C7320 (Physics and chemistry computing)",
  keywords =     "Computational complexity; Execution time;
                 Lennard-Jones systems; Link lists; Memory requirements;
                 Neighbour lists; Networked workstations; Parallel
                 algorithm; Parallel distributed 2-D short-range
                 molecular dynamics; PVM programming environment",
  thesaurus =    "Computational complexity; Lennard-Jones potential;
                 Molecular dynamics method; Parallel algorithms; Physics
                 computing",
}

@InProceedings{Burrer:1994:RRB,
  author =       {C. Burrer and P. Remy},
  title =        {{RUBIS}: a runtime basic interface software on {TELMAT
                 T9000 TN} series},
  crossref =     {deGloria:1994:TAS},
  pages =        {63--78},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:20:45 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {TELMAT MULTINODE, Soultz, France},
  classification = {C6110P (Parallel programming); C6150C (Compilers,
                 interpreters and other processors); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C6150J
                 (Operating systems); C6150N (Distributed systems
                 software)},
  keywords =     {Code portability; Compiling; Configuration; Functional
                 flexibility; INMOS TOOLSET environment; Local resources
                 management; Micro-kernel; MPI prototype; Object
                 abstractions; Parallel programs; Parallel transputer
                 machines; PVM subsystem; RUBIS; Runtime basic interface
                 software; T.Paragraph post-mortem tool; TELMAT T9000 TN
                 series},
  thesaurus =    {Operating system kernels; Parallel programming;
                 Parallelising compilers; Program diagnostics; Software
                 performance evaluation; Software portability;
                 Transputer systems},
}

@InProceedings{Campanai:1994:EAS,
  author =       {M. Campanai and O. Morales and S. Viti and R. Trotta
                 and P. Viliani and M. {Lo Moro}},
  title =        {Experiences assessing software testing activities: the
                 adoption of {PVM}, a prediction and validation model},
  crossref =     {Anonymous:1994:SQC},
  pages =        {491--500},
  year =         {1994},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C0310F (Software development management); C6110S
                 (Software metrics); C6150G (Diagnostic, testing,
                 debugging and evaluating systems)},
  corpsource =   {CESVIT/CQ ware, Florence, Italy},
  keywords =     {activity optimization; improvement path; management;
                 managers; prediction and; program testing; program
                 verification; project leaders; PVM; software; software
                 development; software metrics; software system;
                 software testing; software testing activity assessment;
                 spatial applications; system monitoring; technicians;
                 telecommunications; validation model},
  pubcountry =   {Switzerland},
  treatment =    {P Practical},
}

@InProceedings{Casas:1994:ALM,
  author =       {J. Casas and R. Konuru and S. W. Otto and R. Prouty
                 and J. Walpole},
  title =        {Adaptive load migration systems for {PVM}},
  crossref =     {IEEE:1994:PSW},
  pages =        {390--399},
  year =         {1994},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://sc94.ameslab.gov/AP/contents.html},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Comput. Sci. and Eng., Oregon Graduate Inst.
                 of Sci. and Technol., Portland, OR, USA},
  classification = {C6150N (Distributed systems software)},
  conflocation = {Washington, DC, USA; 14-18 Nov. 1994},
  conftitle =    {Proceedings of Supercomputing '94},
  corpsource =   {Dept. of Comput. Sci. and Eng., Oregon Graduate Inst.
                 of Sci. and Technol., Portland, OR, USA},
  keywords =     {adaptive data movement; Adaptive data movement;
                 adaptive load distribution; Adaptive load distribution;
                 adaptive load migration systems; Adaptive load
                 migration systems; ADM; allocation; effectiveness;
                 Effectiveness; heterogeneous workstation network;
                 message passing; message passing system; Message
                 passing system; migratable PVM; Migratable PVM; MPVM;
                 parallel algorithms; parallel applications; Parallel
                 applications; performance; Performance; programming;
                 programming methodology; Programming methodology;
                 resource; shared; Shared heterogeneous workstation
                 network; transparent migration; Transparent migration;
                 Unix; Unix process; UPVM; usability; Usability;
                 user-level PVM; User-level PVM; virtual machines;
                 virtual processors; Virtual processors; workstation
                 environment changes; Workstation environment changes},
  sponsororg =   {IEEE Comput. Soc.; ACM; SIAM},
  thesaurus =    {Message passing; Parallel algorithms; Programming;
                 Resource allocation; Unix; Virtual machines},
  treatment =    {P Practical},
}

@InProceedings{Castagnera:1994:NEP,
  author =       {K. Castagnera and D. Cheng and R. Fatoohi and E. Hook
                 and B. Kramer and C. Manning and J. Musch and C.
                 Niggley and W. Saphir and D. Sheppard and M. Smith and
                 I. Stockdale and S. Welch and R. Williams and D. Yip},
  title =        {{NAS} experiences with a prototype cluster of
                 workstations},
  crossref =     {IEEE:1994:PSW},
  pages =        {410--419},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:20:45 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {NAS Syst. Div., NASA Ames Res. Center, Moffett Field,
                 CA, USA},
  classification = {C5430 (Microcomputers); C5470 (Performance
                 evaluation and testing); C5620 (Computer networks and
                 techniques); C6150N (Distributed systems software)},
  keywords =     {Aeroscience problems; Computational fluid dynamics;
                 Cycle recovery; Loosely coupled cluster; NAS; NAS
                 Parallel Benchmarks; OVERFLOW-PVM; Performance
                 evaluation; Primary system users; Prototype cluster;
                 Silicon Graphics; System management issues; Workstation
                 cluster},
  thesaurus =    {Distributed processing; Fluid dynamics; Message
                 passing; Performance evaluation; Physics computing;
                 Workstations},
}

@InProceedings{Cheng:1994:PDP,
  author =       {D. Cheng and R. Hood},
  title =        {A portable debugger for parallel and distributed
                 programs},
  crossref =     {IEEE:1994:PSW},
  pages =        {723--732},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:20:45 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Comput. Sci. Corp., NASA Ames Res. Center, Moffett
                 Field, CA, USA},
  classification = {C5620L (Local area networks); C6110P (Parallel
                 programming); C6150G (Diagnostic, testing, debugging
                 and evaluating systems); C6150N (Distributed systems
                 software)},
  keywords =     {Client portability; Client server interaction; Client
                 server model; Debugger code; Distributed programs;
                 Message passing implementations; Message passing
                 library; Message passing programs; MPI programs;
                 Parallel programs; Portable debugger; Process
                 abstractions; PVM; Server components; Tool generated
                 code; User interface},
  thesaurus =    {Client-server systems; Message passing; Parallel
                 programming; Program debugging; Software portability},
}

@Misc{Choudhary:1994:LCR,
  author =       "Alok Choudhary and Ian Foster and Geoffrey Fox and Ken
                 Kennedy and Carl Kesselman and Charles Koelbel and Joel
                 Saltz and Marc Snir",
  title =        "Languages, Compilers, and Runtime Systems Support for
                 Parallel Input-Output",
  year =         "1994",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 Parallel/Parallel.io.bib",
  note =         "Scalable I/O Initiative Working Paper Number 3. On WWW
                 at \path=http://www.ccsf.caltech.edu/SIO/SIO.html=.",
  URL =          "http://www.ccsf.caltech.edu/SIO/SIO.html",
  comment =      "Language extensions to support parallel I/O. Compiler
                 optimizations. Runtime library to support the compiler
                 and interface with the native file system. Compiler
                 would develop a mapping of data to the processor
                 memories and to the disks, and then decide on I/O
                 schedules to move data around, overlap I/O with
                 computation, even move computation around to best fit
                 what is available in memory at a given time. It can
                 also help with checkpointing. Compiler should pass info
                 to the runtime system, which in turn may need to pass
                 info to the file system, to help with optimization. I/O
                 scheduling includes reordering accesses; they even go
                 so far as to propose doing seek optimization in the
                 runtime library. Support for collective I/O. Extension
                 of MPI to I/O, to take advantage of its support for
                 asynchrony, scatter-gather, {\em etc}. On the way, they
                 hope to work with the FS people to decide on the
                 functional requirements of the file system. See also
                 poole:sio-survey, bagrodia:sio-character,
                 bershad:sio-os.",
  keywords =     "parallel I/O; multiprocessor file system; pario bib",
}

@InProceedings{Clarke:1994:MMP,
  author =       {L. Clarke and I. Glendinning and R. Hempel},
  title =        {The {MPI Message Passing Interface Standard}},
  crossref =     {Decker:1994:PEM},
  pages =        {213--218},
  year =         {1994},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Coelho:1994:EHC,
  author =       {F. Coelho},
  title =        {Experiments with {HPF} compilation for a network of
                 workstations},
  crossref =     {Gentzsch:1994:HPC},
  pages =        {423--428},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:20:45 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Centre de Recherche en Inf., Ecole des Mines de Paris,
                 Fontainebleau, France},
  classification = {C6110P (Parallel programming); C6140D (High level
                 languages); C6150C (Compilers, interpreters and other
                 processors)},
  keywords =     {Communication hardware; Data-parallel Fortran;
                 Distributed memory multiprocessors; High Performance
                 Fortran; HPF compilation; Optimizing compiler; PVM
                 3-based generated code; Scalable performance;
                 Workstation network},
  thesaurus =    {FORTRAN; Optimising compilers; Parallel languages;
                 Parallelising compilers},
}

@Article{Cooper:1994:CHF,
  author =       {M. D. Cooper and N. A. Burton and R. J. Hall and I. H.
                 Hillier},
  title =        {Combined {Hartree--Fock} and density functional
                 theory: a distributed memory parallel implementation},
  journal =      j-J-MOL-STRUCT-THEOCHEM,
  volume =       {121},
  pages =        {97--107},
  month =        dec,
  year =         {1994},
  CODEN =        {THEODJ},
  ISSN =         {0166-1280 (print), 1872-7999 (electronic)},
  ISSN-L =       {0166-1280},
  bibdate =      {Sun Dec 22 10:20:45 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Chem., Manchester Univ., UK},
  classification = {A3115 (General mathematical and computational
                 developments for atoms and molecules); A3120J (Local
                 density approximation (atoms and molecules)); C6110P
                 (Parallel programming); C7320 (Physics and chemistry
                 computing)},
  fjournal =     {Journal of molecular structure. Theochem},
  keywords =     {Ab initio code; Density functional theory; Direct SCF
                 energy evaluation; Distributed memory parallel
                 implementation; GAUSSIAN 92; Gradient evaluation;
                 Hartree--Fock theory; Hewlett--Packard 9000-7xx series;
                 Kohn--Sham density functional code; Parallel Virtual
                 Machine; Parallelisation; Portable communications
                 package; PRISM algorithm; PVM; Two-electron integrals;
                 UNIX workstations; Workstation cluster},
  pubcountry =   {Netherlands},
  thesaurus =    {Ab initio calculations; Density functional theory;
                 Distributed memory systems; HF calculations; Parallel
                 algorithms; Parallel programming; Physics computing;
                 SCF calculations},
}

@InProceedings{Cote:1994:PSA,
  author =       "J. C{\^o}t{\'e} and S. J. Thomas",
  title =        "Parallel Semi-{Lagrangian} Advection on the Sphere
                 Using {PVM}",
  crossref =     "Pierce:1994:PSH",
  pages =        "470--477",
  year =         "1994",
  bibdate =      "Mon Oct 26 07:49:42 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Div. de Recherche en Prevision Numerique, Environment
                 Canada, Dorval, Que., Canada",
  classification = "C1160 (Combinatorial mathematics); C4240P (Parallel
                 programming and algorithm theory); C4260 (Computational
                 geometry); C5220P (Parallel architecture); C5440
                 (Multiprocessor systems and techniques); C6150N
                 (Distributed systems)",
  keywords =     "Courant-Friedrichs-Lewy condition; Distributed MIMD
                 parallel algorithms; Eulerian methods; Intel iPSC/860;
                 Numerical methods; Parallel message-passing
                 implementation; Parallel performance; Parallel
                 semi-Lagrangian advection; Parallel virtual machine;
                 PVM; Semi-Lagrangian method; Shallow-water equations;
                 Sphere; Spherical geometry; Sub-grid dimensions",
  thesaurus =    "Computational geometry; Hypercube networks; Message
                 passing; Parallel algorithms; Parallel machines",
}

@InProceedings{Cote:1994:PSL,
  author =       "J. C{\^o}t{\'e} and S. J. Thomas",
  title =        "Parallel {semi-Lagrangian} advection on the sphere
                 using {PVM}",
  crossref =     "Dekker:1994:MPP",
  pages =        "801--808",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A0260 (Numerical approximation and analysis); A0340G
                 (Fluid dynamics: general mathematical aspects); A4710
                 (General fluid dynamics theory, simulation and other
                 computational methods); C1160 (Combinatorial
                 mathematics); C4160 (Numerical integration and
                 differentiation); C4240P (Parallel programming and
                 algorithm theory); C4260 (Computational geometry);
                 C5220P (Parallel architecture); C5440 (Multiprocessing
                 systems); C5440 (Multiprocessor systems and
                 techniques); C6150N (Distributed systems software);
                 C6150N (Distributed systems); C7320 (Physics and
                 chemistry computing)",
  corpsource =   "Div. de Recherche en Prevision Numerique, Environment
                 Canada, Dorval, Que., Canada",
  keywords =     "algorithms; computational fluid dynamics;
                 computational geometry; Courant Friedrichs Lewy
                 condition; Courant-Friedrichs-Lewy condition;
                 dimensions; distributed memory systems; distributed
                 MIMD implementation; distributed MIMD parallel
                 algorithms; Eulerian methods; evaluation; fluid
                 dynamics; hypercube networks; integration; Intel
                 iPSC/860; Lagrangian method; message passing; methods;
                 numerical; parallel; parallel algorithms; parallel
                 machines; parallel message-passing implementation;
                 parallel semi-Lagrangian advection; passive advection;
                 performance; physics computing; problem; processor;
                 PVM; scalable code; semi-; shallow-water equations;
                 software performance; sphere; spherical geometry;
                 sub-grid; sub-grid dimensions; time steps; transport
                 processes; virtual machine",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Supercomput.
                 Appl",
  treatment =    "P Practical",
  xxauthor =     "S. J. Thomas and J. C{\^o}t{\'e}",
}

@InProceedings{Cownie:1994:PPP,
  author =       "J. Cownie and A. Dunlop and S. Hellberg and A. J. G.
                 Hey and D. Pritchard",
  title =        "Portable parallel programming environments---the
                 {ESPRIT PPPE} project",
  crossref =     "Dekker:1994:MPP",
  pages =        "135--142",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Meiko Ltd., Bristol, UK",
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C6115 (Programming support)",
  keywords =     "ESPRIT PPPE project; European hardware manufacturers;
                 HPF mapper; Integrated tool environments; Large
                 distributed memory parallel computers; Large scale
                 scientific and engineering applications; Mainstream
                 applications development; Migration aids; Open system
                 standards; Parallel architectures; Parallel debugger;
                 PARMACS/MPI; PCTE; Performance monitor; Portable
                 parallel programming environments; Program debugger;
                 Run-time environment; Software houses",
  thesaurus =    "Parallel architectures; Parallel programming;
                 Programming environments; Research initiatives;
                 Software houses; Software portability; Standards",
}

@Article{daCunha:1994:PIR,
  author =       "Rudnei Dias {da Cunha} and Tim Hopkins",
  title =        "A parallel implementation of the restarted {GMRES}
                 iterative algorithm for nonsymmetric systems of linear
                 equations",
  journal =      j-ADV-COMPUT-MATH,
  volume =       "2",
  number =       "3",
  pages =        "261--277",
  month =        "????",
  year =         "1994",
  CODEN =        "ACMHEX",
  ISSN =         "1019-7168",
  ISSN-L =       "1019-7168",
  bibdate =      "Mon Oct 07 09:09:23 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "We describe the parallelisation of the GMRES$ (c) $
                 algorithm and its implementation on distributed-memory
                 architectures, using both networks of transputers and
                 networks of workstations under the PVM message-passing
                 system. The test systems of linear equations considered
                 are those derived from five-point finite-difference
                 discretisations of partial differential equations. A
                 theoretical model of the computation and communication
                 phases is presented which allows us to decide for which
                 values of the parameter $c$ our implementation executes
                 efficiently. The results show that for reasonably large
                 discretisation grids the implementations are effective
                 on a large number of processors.",
  acknowledgement = ack-nhfb,
  affiliation =  "Centro de Processamento de Dados, Univ. Federal do Rio
                 Grande do Sul, Brazil",
  classification = "C4130 (Interpolation and function approximation);
                 C4170 (Differential equations); C4240P (Parallel
                 programming and algorithm theory); C5440
                 (Multiprocessor systems and techniques)",
  fjournal =     "Advances in computational mathematics",
  keywords =     "(65F10) Numerical analysis; (65Y05) Numerical
                 analysis; Communication phases; Computer aspects of
                 numerical algorithms; Distributed-memory architectures;
                 Five-point finite-difference discretisations; Iterative
                 methods for linear systems (See also 65N22); Networks
                 of transputers; Networks of workstations; Nonsymmetric
                 systems of linear equations; Numerical linear algebra;
                 Parallel computation; Parallel implementation; Partial
                 differential equations; PVM message-passing system;
                 Restarted GMRES iterative algorithm",
  pubcountry =   "Switzerland",
  thesaurus =    "Distributed memory systems; Finite difference methods;
                 Iterative methods; Message passing; Parallel
                 algorithms; Partial differential equations",
}

@InProceedings{Damodaran-Kamal:1994:MSR,
  author =       {S. K. Damodaran-Kamal and J. M. Francioni},
  title =        {mdb: a semantic race detection tool for {PVM}},
  crossref =     {Pierce:1994:PSH},
  pages =        {702--709},
  year =         {1994},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Comput. Sci., Southwestern Louisiana Univ.,
                 Lafayette, LA, USA},
  classification = {C6110P (Parallel programming); C6150G (Diagnostic,
                 testing, debugging and evaluating systems)},
  corpsource =   {Dept. of Comput. Sci., Southwestern Louisiana Univ.,
                 Lafayette, LA, USA},
  keywords =     {C; C programs; debugging tool; Debugging tool;
                 detection; deterministic replay; Deterministic replay;
                 erroneous executions; Erroneous executions; error;
                 error detection; Error detection; expressions; Fortran
                 programs; hazards and race conditions; mdb; Mdb;
                 message; message passing parallel programs; Message
                 passing parallel programs; nondeterminism;
                 Nondeterminism; parallel programming; passing; program
                 debugging; programs; PVM; receive operation; Receive
                 operation; run-time detection; Run-time detection;
                 semantic; Semantic expressions; semantic race detection
                 tool; Semantic race detection tool; sequential debugger
                 invocation; Sequential debugger invocation},
  sponsororg =   {IEEE Comput. Soc. Tech. Committee on Supercomput.
                 Appl},
  thesaurus =    {Error detection; Hazards and race conditions; Message
                 passing; Parallel programming; Program debugging},
  treatment =    {P Practical},
}

@InProceedings{Damodaran-Kamal:1994:TRP,
  author =       "S. K. Damodaran-Kamal and J. M. Francioni",
  title =        "Testing races in parallel programs with an {OtOt}
                 strategy",
  crossref =     "Ostrand:1994:PIS",
  journal =      j-SIGSOFT,
  pages =        "216--227",
  year =         "1994",
  CODEN =        "SFENDP",
  ISSN =         "0163-5948",
  ISSN-L =       "0163-5948",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Department of Comput. Sci., Southwestern Louisiana
                 Univ., Lafayette, LA, USA",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C6110P (Parallel programming); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C7430
                 (Computer engineering)",
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  issue =        "spec. issue. p. 216-27",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
  keywords =     "Concurrent programs; Controlled execution; Debugging
                 tool; Exponential complexity; General-purpose run-time
                 testing technique; Mdb; Nondeterminism;
                 One-thread-at-one-time strategy; OtOt strategy;
                 Parallel Virtual Machine; Polynomial time complexity;
                 Race conditions specification; Race detection; Race
                 expressions; Race testing; Unrestricted message passing
                 parallel programs",
  thesaurus =    "Computational complexity; Hazards and race conditions;
                 Message passing; Parallel programming; Program
                 debugging; Program testing; Virtual machines",
}

@Article{Dean:1994:CPV,
  author =       "C. E. Dean and R. C. Denny and P. C. Stephenson and G.
                 J. Milne and E. Pantos",
  title =        "Computing with parallel virtual machines",
  journal =      j-J-PHYS-IV-COLLOQUE,
  volume =       "4",
  number =       "C9",
  pages =        "C9/445--448",
  month =        nov,
  year =         "1994",
  CODEN =        "JPICEI",
  ISSN =         "1155-4339",
  ISSN-L =       "1155-4339",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "SERC Daresbury Lab., Warrington, UK",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C6110P (Parallel programming); C7320 (Physics and
                 chemistry computing)",
  fjournal =     "Journal de physique. IV, Colloque",
  keywords =     "64-Node Intel iPSC/860 hypercube; Computing elements;
                 CPU performance; DALAI; LSQINT; Parallel execution;
                 Parallel virtual machines; PATTERN; Processing time;
                 PROJECT; Single program multiple data; Synchrotron
                 radiation",
  thesaurus =    "Parallel programming; Physics computing; Synchrotron
                 radiation",
}

@Article{DeKeyser:1994:RTL,
  author =       "J. DeKeyser and K. Lust and D. Roose",
  title =        "Run-time load balancing support for a parallel
                 multiblock {Euler\slash Navier--Stokes} code with
                 adaptive refinement on distributed memory computers",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "20",
  number =       "8",
  pages =        "1069--1088",
  month =        aug,
  year =         "1994",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Katholieke Univ., Leuven,
                 Belgium",
  classification = "A4710 (General fluid dynamics theory, simulation and
                 other computational methods); C4185 (Finite element
                 analysis); C4240P (Parallel programming and algorithm
                 theory); C6150N (Distributed systems software)",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  keywords =     "Adaptive mesh refinement; Distributed memory
                 computers; Euler/Navier--Stokes code; Load balance;
                 Load balancing; Mesh refinement; Parallel algorithm;
                 Parallel performance; Parallel programming library",
  pubcountry =   "Netherlands",
  thesaurus =    "Distributed memory systems; Finite element analysis;
                 Fluid dynamics; Navier--Stokes equations; Parallel
                 algorithms; Resource allocation",
}

@InProceedings{DeRoeck:1994:CFP,
  author =       "Y. H. {De Roeck} and R. E. Plessix",
  title =        "Combining {F90} and {PVM} to Construct Synthetic
                 Seismograms by Ray-Tracing",
  crossref =     "IEEE:1994:OOE",
  volume =       "2",
  pages =        "II-653--II-658",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "IFREMER, Brest, France",
  classification = "A9130R (Controlled source seismology); A9365 (Data
                 and information; acquisition, processing, storage and
                 dissemination in geophysics); A9385 (Instrumentation
                 and techniques for geophysical, hydrospheric and lower
                 atmosphere research); B6140C (Optical
                 information, image and video signal processing); B7710
                 (Geophysical techniques and equipment); C5260B
                 (Computer vision and image processing techniques);
                 C7340 (Geophysics computing)",
  conflocation = "Brest, France; 13-16 Sept. 1994",
  conftitle =    "Proceedings of OCEANS'94",
  corpsource =   "IFREMER, Brest, France",
  keywords =     "Energy attenuation; energy attenuation; Explosion
                 seismology; explosion seismology; F90; geophysical
                 prospecting; geophysical signal processing; geophysical
                 techniques; inverse problems; Marine reflection seismic
                 record; marine reflection seismic record; Measurement
                 technique; measurement technique; Modell; modell;
                 Multiple echo; multiple echo; Oceanic crust; oceanic
                 crust; Parallel Virtual Machine; Parallelisation;
                 parallelisation; profiling; Prospecting; prospecting;
                 PVM; Ray-conversion; ray-conversion; Ray-tracing;
                 ray-tracing; Seafloor; seafloor; seismic reflection;
                 Seismic reflection profiling; seismology; Synthetic
                 seismogram; synthetic seismogram; Vectorisation;
                 vectorisation",
  sponsororg =   "Oceanic Eng. Soc. IEEE; Soc. Electr. Electron. France;
                 Communaute Urbaine de Brest",
  thesaurus =    "Geophysical prospecting; Geophysical signal
                 processing; Geophysical techniques; Inverse problems;
                 Seismology",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@Article{Deshpande:1994:ADN,
  author =       "Manish Deshpande and Jinzhang Feng and Charles L.
                 Merkle and Ashish Deshpande",
  title =        "Application of a Distributed Network in Computational
                 Fluid Dynamic Simulations",
  journal =      j-IJSA,
  volume =       "8",
  number =       "1",
  pages =        "64--67",
  month =        "Spring",
  year =         "1994",
  CODEN =        "IJSAE9",
  ISSN =         "0890-2720",
  bibdate =      "Tue Feb 18 09:47:23 MST 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib; UnCover
                 library database",
  abstract =     "A general-purpose 3-D, incompressible Navier--Stokes
                 algorithm is implemented on a network of concurrently
                 operating workstations using PVM and compared with its
                 performance on a CRAY Y-MP and on an Intel iPSC\slash
                 860. The problem is relatively computationally
                 intensive, and has a communication structure based
                 primarily on nearest-neighbor communication, making it
                 ideally suited to message passing. Such problems are
                 frequently encountered in CFD, and their solution is
                 increasingly in demand. The communication structure is
                 explicitly coded in the implementation to fully exploit
                 the regularity in message passing in order to produce a
                 near-optimal solution. Results are presented for
                 various grid sizes using up to eight processors.",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Mech. Eng., Pennsylvania State Univ.",
  affiliationaddress = "University Park, PA, USA",
  classification = "631.1.1; 721.1; 722.4; 723.1; 723.2; 723.5",
  fjournal =     "International Journal of Supercomputer Applications",
  journalabr =   "Int J Supercomput Appl High Perform Comput",
  keywords =     "Algorithms; Communication structure; Computational
                 complexity; Computational fluid dynamic simulations;
                 Computer simulation; Computer workstations; Concurrent
                 operations; Data structures; Data transfer; Distributed
                 computer systems; Fluid dynamics; Incompressible
                 Navier--Stokes algorithm; Nearest neighbor
                 communication; Optimization; Three dimensional",
}

@InProceedings{Dikken:1994:DDL,
  author =       "L. Dikken and F. van der Linden and J. Vesseur and P.
                 Sloot",
  title =        "{DynamicPVM}: {Dynamic} load balancing on parallel
                 systems",
  crossref =     "Gentzsch:1994:HPC",
  pages =        "273--277",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Shell Nederland Informatieverwerking, Rijswijk,
                 Netherlands",
  classification = "C5440 (Multiprocessing systems); C6110P (Parallel
                 programming); C6150J (Operating systems); C6150N
                 (Distributed systems software)",
  keywords =     "DynamicPVM; Load balancing; Loosely coupled
                 processors; Migration; Multi tasking; Multiuser;
                 Parallel systems; Parallel Virtual Machine; Process
                 checkpointing; PVM; Restart mechanism; Scheduling",
  thesaurus =    "Message passing; Parallel programming; Processor
                 scheduling; Resource allocation; Virtual machines",
}

@InProceedings{Dykes:1994:CCP,
  author =       "S. G. Dykes and Xiaodong Zhang and Yan Zhou and Haixu
                 Yang",
  title =        "Communication and computation patterns of large scale
                 image convolutions on parallel architectures",
  crossref =     "Siegal:1994:PEI",
  pages =        "926--931",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "High Performance Comput. and Software Lab., Texas
                 Univ., San Antonio, TX, USA",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5220P (Parallel architecture); C5260B (Computer vision
                 and picture processing); C5440 (Multiprocessor systems
                 and techniques)",
  keywords =     "CM-5; Communication overhead; Computation patterns;
                 Convolution; Convolution calculations; Execution time;
                 Fast memory store; Image segmentation; Image processing
                 operations; IPSC/860; Large kernel convolutions; Large
                 scale image convolutions; Memory access demand;
                 Parallel algorithms; Parallel architectures; Processor
                 power; PVM distributed memory multicomputers; Texture
                 segmentation application",
  thesaurus =    "Distributed memory systems; Image segmentation; Image
                 texture; Parallel algorithms; Parallel machines",
}

@InProceedings{Elamvazuthi:1994:OPA,
  author =       "C. Elamvazuthi and G. A. Manson",
  title =        "{Occam}, {PVM} and the Alternative Construct",
  crossref =     "Miles:1994:PTO",
  pages =        "56--68",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Sheffield Univ., UK",
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6140D (High level languages); C6150N
                 (Distributed systems software)",
  corpsource =   "Dept. of Comput. Sci., Sheffield Univ., UK",
  keywords =     "alternation construct; Alternation construct; code;
                 Code generation; code mapping; Code mapping;
                 Communicating State Diagram; communication event;
                 Communication event; computer aided software
                 engineering; CSD; diagrammatic representation;
                 Diagrammatic representation; diagrammatic technique;
                 Diagrammatic technique; generation; Machine;
                 methodology; Methodology; model; Model process
                 behaviour; Occam; occam programming language; Occam
                 programming language; parallel; parallel architecture;
                 Parallel architecture; Parallel CASE tool; parallel
                 processing; Parallel processing; Parallel system
                 design; Parallel Virtual; Parallel Virtual Machine;
                 process behaviour; programming; PVM; software tool;
                 Software tool; software tools; system design; virtual
                 machines",
  pubcountry =   "Netherlands",
  thesaurus =    "Computer aided software engineering; Occam; Parallel
                 programming; Software tools; Virtual machines",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@Article{Eppstein:1994:CSP,
  author =       "M. J. Eppstein and D. E. Dougherty",
  title =        "A comparative study of {PVM} workstation cluster
                 implementations of a two-phase subsurface flow model",
  journal =      j-ADV-WATER-RESOURCES,
  volume =       "17",
  number =       "3",
  pages =        "181--??",
  month =        "????",
  year =         "1994",
  CODEN =        "AWREDI",
  ISSN =         "0309-1708 (print), 1872-9657 (electronic)",
  ISSN-L =       "0309-1708",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Advances in Water Resources",
}

@Article{Escaig:1994:PMD,
  author =       "Y. Escaig and G. Touzot and M. Vayssade",
  title =        "Parallelization of a multilevel domain decomposition
                 method",
  journal =      j-COMPUT-SYST-ENG,
  volume =       "5",
  number =       "3",
  pages =        "253--263",
  month =        jun,
  year =         "1994",
  CODEN =        "COSEEO",
  ISSN =         "0956-0521",
  ISSN-L =       "0956-0521",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "CRIHAN, Mont Saint Aignan, France",
  classification = "C4185 (Finite element analysis); C5220P (Parallel
                 architecture); C5440 (Multiprocessing systems); C5470
                 (Performance evaluation and testing); C7310
                 (Mathematics computing)",
  fjournal =     "Computing systems in engineering: an international
                 journal",
  keywords =     "CRAY Y-MP; Distributed memory machines; Distributed
                 memory systems; Ethernet network; Finite element
                 method; IBM RS/6000 workstations; Interface problem;
                 MIMD; Multilevel domain decomposition method; Multiple
                 instructions multiple data; Multiprocessor machines;
                 Parallel Virtual Machine; Performance; PVM; Shared
                 memory machine; Shared memory systems",
  pubcountry =   "UK",
  thesaurus =    "Cray computers; Distributed memory systems; Finite
                 element analysis; IBM computers; Mathematics computing;
                 Performance evaluation; Shared memory systems",
}

@Article{Ewing:1994:DCW,
  author =       "Richard E. Ewing and Robert C. Sharpley and Derek
                 Mitchum and P. O'Leary and J. S. Sochacki",
  title =        "Distributed Computation of Wave Propagation Models
                 Using {PVM}",
  journal =      j-IEEE-PAR-DIST-TECH,
  volume =       "2",
  number =       "1",
  pages =        "26--31",
  month =        "Spring",
  year =         "1994",
  CODEN =        "IPDTEX",
  DOI =          "https://doi.org/10.1109/88.281870",
  ISSN =         "1063-6552 (print), 1558-1861 (electronic)",
  ISSN-L =       "1063-6552",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib; UnCover
                 library database",
  abstract =     "The Parallel Virtual Machine lets researchers create a
                 powerful, inexpensive parallel system on which they can
                 solve large, sophisticated problems such as simulating
                 the propagation of seismic waves.",
  acknowledgement = ack-nhfb,
  affiliation =  "Texas A and M Univ., College Station, TX, USA",
  classification = "A9130F (Surface and body waves); C5440
                 (Multiprocessor systems and techniques); C5620
                 (Computer networks and techniques); C6110P (Parallel
                 programming); C6150N (Distributed systems); C7340
                 (Geophysics)",
  corpsource =   "Texas A and M Univ., College Station, TX, USA",
  fjournal =     "IEEE parallel and distributed technology: systems and
                 applications",
  keywords =     "C; Communication networks; communication networks;
                 Computational power; computational power; Cost
                 effectiveness; cost effectiveness; Distributed
                 computation; distributed computation; Ethernet; Fiber;
                 fiber; Fortran; geophysics computing; Hypercubes;
                 hypercubes; machines; Meshes; meshes; network operating
                 systems; parallel; parallel programming; Parallel
                 Virtual Machine; PVM; Remote procedural libraries;
                 remote procedural libraries; Rings; rings; Seismic wave
                 propagation simulation; seismic wave propagation
                 simulation; seismic waves; Wave propagation models;
                 wave propagation models",
  thesaurus =    "Geophysics computing; Network operating systems;
                 Parallel machines; Parallel programming; Seismic
                 waves",
  treatment =    "P Practical",
}

@Article{Femminella:1994:PBP,
  author =       "A. Femminella and A. Omodeo",
  title =        "{PVM-based} parallel computing: a case study on power
                 plant simulation",
  journal =      j-MICROPROC-MICROPROG,
  volume =       "40",
  number =       "10--12",
  pages =        "875--878",
  month =        dec,
  year =         "1994",
  CODEN =        "MMICDT",
  ISSN =         "0165-6074 (print), 1878-7061 (electronic)",
  ISSN-L =       "0165-6074",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Centro Ricerca di Autom., ENEL, Milan, Italy",
  classification = "B8110D (Power system planning and layout); B8200
                 (Generating stations and plants); C6110P (Parallel
                 programming); C6150N (Distributed systems software);
                 C7410B (Power engineering computing)",
  conflocation = "Liverpool, UK; Sept. 1994",
  conftitle =    "20th Annual Euromicro Conference. System Architecture
                 and Integration",
  corpsource =   "Centro Ricerca di Autom., ENEL, Milan, Italy",
  fjournal =     "Microprocessing and Microprogramming",
  keywords =     "case study; Case study; digital simulation;
                 distributed programming; Distributed programming;
                 distributed software platform; Distributed software
                 platform; heterogeneous workstation; Heterogeneous
                 workstation network; independently evolving;
                 Independently evolving processes; machines; message
                 exchange; Message exchange; network; open systems;
                 parallel programming; Parallel Virtual Machine;
                 periodic synchronization; Periodic synchronization;
                 power plant; Power plant simulation; power plants;
                 power system analysis computing; processes; PVM-based
                 parallel computing; simulation; Transputer network;
                 virtual; weakly-coupled processes; Weakly-coupled
                 processes",
  pubcountry =   "Netherlands",
  thesaurus =    "Digital simulation; Open systems; Parallel
                 programming; Power plants; Power system analysis
                 computing; Virtual machines",
  treatment =    "P Practical",
}

@InProceedings{Fineberg:1994:IMM,
  author =       "S. A. Fineberg",
  title =        "Implementing multidisciplinary and multi-zonal
                 applications using {MPI}",
  crossref =     "IEEE:1994:FSF",
  pages =        "496--503",
  year =         "1994",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Comput. Sci. Corp., NASA Ames Res. Center, Moffett
                 Field, CA, USA",
  classification = "C6110P (Parallel programming); C6150N (Distributed
                 systems software)",
  conftitle =    "Proceedings Frontiers '95. The Fifth Symposium on the
                 Frontiers of Massively Parallel Computation",
  corpsource =   "Comput. Sci. Corp., NASA Ames Res. Center, Moffett
                 Field, CA, USA",
  keywords =     "codes; Codes; message passing; Message Passing
                 Interface; multidisciplinary applications;
                 Multidisciplinary applications; multizonal
                 applications; Multizonal applications; parallel
                 programming; parallel programs; Parallel programs;
                 performance; Performance; point-to-point message
                 passing routines; Point-to-point message passing
                 routines; portable library; Portable library; single
                 program multiple data stream; Single program multiple
                 data stream; standard; Standard; standards",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Comput. Archit.;
                 NASA; Univ. Maryland Inst. Adv. Comput. Studies; George
                 Mason Univ",
  thesaurus =    "Message passing; Parallel programming; Standards",
  treatment =    "P Practical",
}

@Article{Flower:1994:EJM,
  author =       "Jon Flower and Adam Kolawa",
  title =        "{Express} is not just a message passing system:
                 current and future directions in {Express}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "20",
  number =       "4",
  pages =        "597--614",
  day =          "31",
  month =        apr,
  year =         "1994",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Aug 6 10:14:00 MDT 1999",
  bibsource =    "Compendex database;
                 http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1994&volume=20&issue=4;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1994&volume=20&issue=4&aid=860",
  abstract =     "We describe some of the features of Express and the
                 way that they were developed as a response to the needs
                 of application programmers. We show how currently
                 emerging computing platforms have led to new
                 application needs and show how these are satisfied with
                 Express features. We introduce a recently developed
                 programming style which greatly simplifies programming
                 as well as directly addressing complex issues such as
                 dynamic load balancing and fault tolerance. Finally, we
                 present a comparison of Express' features and
                 motivation to the Message Passing Interface (MPI)
                 standard currently being developed.",
  acknowledgement = ack-nhfb,
  affiliation =  "ParaSoft Corp",
  affiliationaddress = "Pasadena, CA, USA",
  classification = "C5440 (Multiprocessor systems and techniques);
                 C6110P (Parallel programming); C6150N (Distributed
                 systems)",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  journalabr =   "Parallel Comput",
  keywords =     "Application developers toolkit; Application needs;
                 Computer programming; Computing platforms; Dynamic load
                 balancing; Dynamics; Express; Express features; Fault
                 tolerance; Fault tolerance system; Message passing
                 Express; Message passing interface (MPI) standard;
                 Message passing programming style; Message passing
                 system; MPI standard",
  pubcountry =   "Netherlands",
  thesaurus =    "Message passing; Parallel programming; Resource
                 allocation",
}

@InProceedings{Franke:1994:EIM,
  author =       "H. Franke and P. Hochschild and P. Pattnaik and M.
                 Snir",
  title =        "An Efficient Implementation of {MPI}",
  crossref =     "Decker:1994:PEM",
  pages =        "219--230",
  year =         "1994",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Franke:1994:MEI,
  author =       "H. Franke and P. Hochschild and P. Pattnaik and M.
                 Snir",
  title =        "{MPI-F}: An Efficient Implementation of {MPI} on
                 {IBM-SP1}",
  crossref =     "Agrawal:1994:PIC",
  pages =        "III-197--III-201",
  year =         "1994",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5470 (Performance
                 evaluation and testing); C6150N (Distributed systems
                 software)",
  conftitle =    "Proceedings of 23rd Annual International Conference on
                 Parallel Processing",
  corpsource =   "IBM Thomas J. Watson Res. Center, Yorktown Heights,
                 NY, USA",
  keywords =     "distributed memory cluster; distributed memory
                 systems; IBM computers; IBM-SP1; message passing; MPI;
                 parallel architectures; performance; performance
                 evaluation",
  sponsororg =   "Pennsylvania State Univ",
  treatment =    "P Practical",
}

@InProceedings{Franke:1994:MMP,
  author =       "H. Franke and P. Hochschild and P. Pattnaik and J.-P.
                 Prost and M. Snir",
  title =        "{MPI-F}: an {MPI} Prototype Implementation on {IBM
                 SP1}",
  crossref =     "Dongarra:1994:PSW",
  pages =        "43--55",
  year =         "1994",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6150E (General utility programs); C6150N
                 (Distributed systems software); C6180 (User
                 interfaces)",
  conftitle =    "Proceedings of the Second Workshop on Environments and
                 Tools for Parallel Scientific Computing",
  corpsource =   "IBM Thomas J. Watson Res. Center, Yorktown Heights,
                 NY, USA",
  keywords =     "application program interfaces; distributed memory
                 systems; External User Interface; IBM 9076 Scalable
                 PowerPARALLEL 1 system; IBM computers; IBM SP1; message
                 passing; Message-Passing Interface; modifications;
                 MPI-F; native EUI library; parallel machines;
                 performance measurements; prototype implementation;
                 software libraries; software performance evaluation;
                 software prototyping; user interfaces",
  treatment =    "P Practical",
}

@Article{Freeman:1994:SMM,
  author =       "T. L. Freeman and J. M. Bull",
  title =        "Shared Memory and Message Passing Implementations of
                 Parallel Algorithms for Numerical Integration",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "879",
  pages =        "219--228",
  year =         "1994",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Sep 15 10:01:31 MDT 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs1994.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
  keywords =     "computing; heterogeneous network; PARA; parallel
                 scientific computing",
}

@InProceedings{Gajecki:1994:NAT,
  author =       "M. Gajecki and J. Moscinski",
  title =        "A new algorithm for the traveling salesman problem on
                 networked workstations",
  crossref =     "Dongarra:1994:PSC",
  pages =        "229--235",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. of Comput. Sci., Akademia Gorniczo-Hutnicza,
                 Cracow, Poland",
  classification = "C1160 (Combinatorial mathematics); C1180
                 (Optimisation techniques); C4240P (Parallel programming
                 and algorithm theory); C5620L (Local area networks);
                 C6150N (Distributed systems software)",
  keywords =     "Efficiency; Local optimization method; Networked
                 workstations; Parallel algorithm; PVM; SUN SPARCstation
                 IPX; Traveling salesman problem",
  thesaurus =    "Local area networks; Parallel algorithms; Travelling
                 salesman problems; Workstations",
}

@InProceedings{Geist:1994:CCW,
  author =       "G. A. Geist",
  title =        "Cluster computing: the wave of the future?",
  crossref =     "Dongarra:1994:PSC",
  pages =        "236--246",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Oak Ridge Nat. Lab., TN, USA",
  classification = "C0230 (Economic, social and political aspects of
                 computing); C5620L (Local area networks); C6150N
                 (Distributed systems software); C7300 (Natural sciences
                 computing)",
  keywords =     "Cluster computing; Distributed memory computer;
                 Heterogeneous network research project; Oak Ridge
                 National Laboratory; Parallel computers; Parallel
                 Virtual Machine; Portable robust software; PVM;
                 Research issues; Scientific problems; Serial computers;
                 Social issues; Software package; Standard; Tennessee
                 University; User defined computer collection; Vector
                 computers; Workstation clusters",
  thesaurus =    "Distributed memory systems; Local area networks;
                 Natural sciences computing; Social aspects of
                 automation; Software packages; Virtual machines;
                 Workstations",
}

@Book{Geist:1994:PPV,
  author =       "Al Geist and Adam Beguelin and Jack Dongarra and
                 Weicheng Jiang and Robert Manchek and Vaidyalingam S.
                 Sunderam",
  title =        "{PVM}: Parallel Virtual Machine: a Users' Guide and
                 Tutorial for Networked Parallel Computing",
  publisher =    pub-MIT,
  address =      pub-MIT:adr,
  pages =        "xvii + 279",
  year =         "1994",
  ISBN =         "0-262-57108-0 (paperback)",
  ISBN-13 =      "978-0-262-57108-1 (paperback)",
  LCCN =         "QA76.58 .P85 1994",
  bibdate =      "Thu Feb 29 17:35:15 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  price =        "US\$27.50",
  series =       "Scientific and engineering computation",
  URL =          "http://www.mitpress.com/book-home.tcl?isbn=0262571080",
  acknowledgement = ack-nhfb,
  keywords =     "Computer networks; Networks --- Parallel programming;
                 Parallel computers",
}

@mastersthesis{Grengbondai:1994:CPU,
  author = {Jules Crephat Grengbondai},
  title = {Concurrent processing under Parallel Virtual Machine
    ({PVM})},
  type = {M.S. thesis},
  school = {Department of Computer Science, Southern Illinois
    University at Carbondale},
  address = {Carbondale, IL, USA},
  pages = {vi + 97},
  year = {1994},
  bibdate = {Mon Jan 15 18:16:53 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Gropp:1994:MCL,
  author = {W. Gropp and E. Lusk},
  title = {The {MPI} communication library: its design and a
    portable implementation},
  crossref = {IEEE:1994:PSP},
  pages = {160--165},
  year = {1994},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
    USA},
  classification = {C5440 (Multiprocessing systems); C5440
    (Multiprocessor systems and techniques); C6110P
    (Parallel programming); C6150N (Distributed systems
    software); C6150N (Distributed systems)},
  conftitle = {Proceedings of Scalable Parallel Libraries
    Conference},
  corpsource = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
    USA},
  keywords = {advanced features; Advanced features; implementation
    strategy; Implementation strategy; message passing; MPI
    communication library; MPI standard; parallel
    programming; portable implementation; Portable
    implementation; software portability; standard
    message-passing interface; Standard message-passing
    interface; standards},
  sponsororg = {Mississippi State Univ.; Nat. Sci. Found},
  thesaurus = {Message passing; Parallel programming; Software
    portability; Standards},
  treatment = {P Practical},
}

@inproceedings{Gropp:1994:SEP,
  author = {W. Gropp and B. Smith},
  title = {Scalable, extensible, and portable numerical
    libraries},
  crossref = {IEEE:1994:PSP},
  pages = {87--93},
  year = {1994},
  bibdate = {Sun Dec 22 10:18:08 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
    USA},
  classification = {C6110B (Software engineering techniques); C6120
    (File organisation); C6180 (User interfaces)},
  keywords = {Aggressive data-structure-neutral implementation; Data
    structures; Implementation language; Meta-communication
    layer; Parallel communication technology; PETSc
    library; Portable Extensible Tools for Scientific
    computing; Portable numerical libraries; Software
    portability; Software technology; User interfaces;
    User-interface language},
  thesaurus = {Data structures; Software portability; User
    interfaces},
}

@Book{Gropp:1994:UMP,
  author =       "William Gropp and Ewing Lusk and Anthony Skjellum",
  title =        "Using {MPI}: Portable Parallel Programming with the
                 Message-Passing Interface",
  publisher =    pub-MIT,
  address =      pub-MIT:adr,
  pages =        "xx + 307",
  year =         "1994",
  ISBN =         "0-262-57104-8",
  ISBN-13 =      "978-0-262-57104-3",
  LCCN =         "QA76.642 .G76 1994",
  bibdate =      "Thu Feb 29 17:35:09 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  price =        "US\$24.95",
  series =       "Scientific and engineering computation",
  URL =          "http://www.mitpress.com/book-home.tcl?isbn=0262571048",
  acknowledgement = ack-nhfb,
  keywords =     "Computer interfaces; Parallel computers ---
                 Programming; Parallel programming; Parallel
                 programming (Computer science)",
}

@article{Gupta:1994:CTE,
  author = {M. Gupta and P. Banerjee},
  title = {Compile-time estimation of communication costs of
    programs},
  journal = j-J-PROGRAM-LANG,
  volume = {2},
  number = {3},
  pages = {191--225},
  month = sep,
  year = {1994},
  coden = {JPLAER},
  issn = {0963-9306},
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {IBM Thomas J. Watson Res. Center, Yorktown Heights,
    NY, USA},
  classification = {C6130 (Data handling techniques); C6150C (Compilers,
    interpreters and other processors); C6150G (Diagnostic,
    testing, debugging and evaluating systems)},
  fjournal = {Journal of Programming Languages},
  keywords = {Array references; Communication optimizations;
    Compile-time estimation; Compiler; Data distribution;
    Data movement; Data partitioning decisions; Distributed
    memory machines; Fortran programs; Global address
    space; High-level communication primitives; Loops;
    Paradigm compiler; Processors; Program analysis;
    Program communication costs; Ptran-II High-Performance
    Fortran prototype compiler; Traversal properties},
  pubcountry = {UK},
  thesaurus = {Data handling; Distributed memory systems; Optimising
    compilers; System monitoring},
}

@inproceedings{Haeuser:1994:RNS,
  author = {J. Haeuser and M. Spel and J. Muylaert and R. D.
    Williams},
  title = {Results for the {Navier--Stokes} Solver {ParNSS} on
    Workstation Clusters and {IBM SP1} Using {PVM}},
  crossref = {Wagner:1994:CFD},
  pages = {432--442},
  year = {1994},
  bibdate = {Thu Feb 29 17:59:11 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Hakula:1994:FEM,
  author = {H. Hakula and J. Malinen and P. Kallberg and P.
    Valve},
  title = {The finite element method applied to the exterior
    {Helmholtz} problem on the {IBM SP-1}},
  crossref = {Dongarra:1994:PSC},
  pages = {262--269},
  year = {1994},
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Helsinki Univ. of Technol., Espoo, Finland},
  classification = {A0230 (Function theory, analysis); A0260 (Numerical
    approximation and analysis); A4110H (Electromagnetic
    waves: theory); B0290P (Differential equations); B0290T
    (Finite element analysis); B5210 (Electromagnetic wave
    propagation); C4170 (Differential equations); C4185
    (Finite element analysis); C6110P (Parallel
    programming); C6150N (Distributed systems software);
    C7320 (Physics and chemistry computing)},
  keywords = {2D domains; Complex linear equations; Electromagnetic
    waves; Exterior Helmholtz problem; Finite element
    method; Helmholtz equation; IBM SP-1 machine; Irregular
    meshes; Monitoring facilities; Numerical analysis
    research; Parallel implementation; Parallel Virtual
    Machine environment; Performance; Quasi-minimal
    residual method; Regular meshes; Scattering problem;
    Single program multiple data model},
  thesaurus = {Electromagnetic wave scattering; Finite element
    analysis; Helmholtz equations; IBM computers; Parallel
    machines; Parallel programming; Physics computing;
    Software performance evaluation},
}

@TechReport{Hardwick:1994:PVL,
  author =       "Jonathan C. Hardwick",
  title =        "Porting a vector library: a comparison of {MPI},
                 {Paris}, {CMMD} and {PVM} (or, ``{I'll} never have to
                 port {CVL} again'')",
  type =         "Research paper",
  number =       "CMU-CS-94-200",
  institution =  inst-SCS-CMU,
  address =      inst-SCS-CMU:adr,
  pages =        "16",
  year =         "1994",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "This paper describes the design and implementation in
                 MPI of the parallel vector library CVL, which is used
                 as the basis for implementing nested data-parallel
                 languages such as NESL and Proteus. We compare the ease
                 of writing and debugging the portable MPI
                 implementation of CVL with our experiences writing
                 previous versions in CM-2 Paris, CM-5 CMMD, and PVM,
                 and give initial performance results for MPI CVL
                 running on an IBM SP-1, Intel Paragon, and TMC CM-5.",
  acknowledgement = ack-nhfb,
  annote =       "An earlier version of this paper appeared in
                 `Proceedings of the 2nd Scalable Parallel Libraries
                 Conference', Mississippi State University, Mississippi,
                 October 1994. November 1994. Supported in part by the
                 Wright Laboratory, Aeronautical Systems Center, Air
                 Force Materiel Command, USAF, and the Advanced Research
                 Projects Agency (ARPA). Supported in part by the
                 Pittsburgh Supercomputing Center. Supported in part by
                 the National Center for Supercomputing Applications.
                 Supported in part by the Argonne National Laboratory.",
  keywords =     "Parallel programming (Computer science)",
}

@article{Hellberg:1994:PPP,
  author = {S. A. Hellberg and E. Zaluska},
  title = {A portable parallel programming environment based
    around {PCTE}},
  journal = j-INFO-SOFTWARE-TECH,
  volume = {36},
  number = {7},
  pages = {419--425},
  month = jul,
  year = {1994},
  coden = {ISOTE7},
  issn = {0950-5849 (print), 1873-6025 (electronic)},
  issn-l = {0950-5849},
  bibdate = {Sun Dec 22 10:18:08 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Electron. and Comput. Sci., Southampton Univ., UK},
  classification = {C6110P (Parallel programming); C6115 (Programming
    support); C6150N (Distributed systems software)},
  fjournal = {Information and Software Technology},
  keywords = {Distributed-memory; End-user applications; High
    Performance Fortran; High-Performance Computing
    Community programming standards;
    Massively-parallel-processor; Message-Passing
    Interface; PCTE; PCTE-based integrated toolset;
    Performance; Portable common tool environment; Portable
    parallel programming environment},
  pubcountry = {UK},
  thesaurus = {Message passing; Parallel programming; Programming
    environments},
}

@InProceedings{Hempel:1994:MSM,
  author =       "R. Hempel",
  title =        "The {MPI} Standard for Message Passing",
  crossref =     "Gentzsch:1994:HPC",
  pages =        "247--252",
  year =         "1994",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "German Nat. Res. Center for Comput. Sci., St.
                 Augustin, Germany",
  classification = "C5220P (Parallel architecture); C6110P (Parallel
                 programming)",
  conftitle =    "High-Performance Computing and Networking
                 International Conference. Proceedings, Volume II:
                 Networking and Tools",
  corpsource =   "German Nat. Res. Center for Comput. Sci., St.
                 Augustin, Germany",
  keywords =     "message passing; Message passing; message-passing
                 interfaces; Message-passing interfaces; MPI standard;
                 parallel computing; Parallel computing; parallel
                 programming; standard; Standard; standards",
  thesaurus =    "Message passing; Parallel programming; Standards",
  treatment =    "P Practical",
}

@article{Henriksen:1994:PCF,
  author = {P. Henriksen and R. Keunings},
  title = {Parallel computation of the flow of integral
    viscoelastic fluids on a heterogeneous network of
    workstations},
  journal = j-INT-J-NUMER-METHODS-FLUIDS,
  volume = {18},
  number = {12},
  pages = {1167--1183},
  month = jun,
  year = {1994},
  coden = {IJNFDW},
  issn = {0271-2091},
  issn-l = {0271-2091},
  bibdate = {Sun Dec 22 10:18:08 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Univ. Catholique de Louvain, Belgium},
  classification = {A4750 (Non-Newtonian dynamics); C4240P (Parallel
    programming and algorithm theory); C7320 (Physics and
    Chemistry)},
  fjournal = {International Journal for Numerical Methods in
    Fluids},
  keywords = {Computational mechanics problems; Compute-intensive
    treatment; Deformation; Dynamic allocation; Fibre
    suspension flow; Flow; Heterogeneous network of
    workstations; Integral viscoelastic fluids; Internal
    variables; Load balancing; Parallel algorithms;
    Parallel computation; Parallel efficiency; POLYFLOW
    package; Public domain PVM software library; Static
    allocation; Viscoplastic solids},
  pubcountry = {UK},
  thesaurus = {Flow simulation; Non-Newtonian flow; Parallel
    algorithms; Physics computing},
}

@inproceedings{Hiranandani:1994:CTB,
  author = {S. Hiranandani and K. Kennedy and J. Mellor-Crummey
    and A. Sethi},
  title = {Compilation techniques for block-cyclic
    distributions},
  crossref = {ACM:1994:CPI},
  pages = {392--403},
  year = {1994},
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Dept. of Comput. Sci., Rice Univ., Houston, TX, USA},
  classification = {C6110P (Parallel programming); C6140D (High level
    languages); C6150C (Compilers, interpreters and other
    processors)},
  keywords = {Block-cyclic distributions; Code; Compilers; Data
    alignment; Data-parallel languages; Fortran D;
    High-Performance Fortran; Linear-time algorithm; Memory
    access sequence; MIMD distributed-memory machines;
    Nonunit strides; Symbolic array dimensions; Symbolic
    loop bounds},
  thesaurus = {FORTRAN; Parallel languages; Program compilers},
}

@InProceedings{Issman:1994:PME,
  author =       "E. Issman and G. Degrez and J. {De Keyser}",
  title =        "A Parallel Multiblock {Euler\slash Navier--Stokes}
                 Solver on a Cluster of Workstations Using {PVM}",
  crossref =     "Gentzsch:1994:HPC",
  volume =       "1",
  pages =        "157--162",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "CFD Group, Von Karman Inst. for Fluid Dynamics,
                 Rhode-St.-Genese, Belgium",
  classification = "A4710 (General fluid dynamics theory, simulation and
                 other computational methods); C5440 (Multiprocessing
                 systems); C6110P (Parallel programming); C6150J
                 (Operating systems); C6150N (Distributed systems
                 software); C7320 (Physics and chemistry computing)",
  corpsource =   "CFD Group, Von Karman Inst. for Fluid Dynamics,
                 Rhode-St.-Genese, Belgium",
  keywords =     "adaptive 2D-multiblock Euler/Navier--Stokes; Adaptive
                 2D-multiblock Euler/Navier--Stokes solver; automatic
                 load-; Automatic load-balancing; balancing; Block
                 distribution; block distribution; cluster; environment;
                 LOGO software library; Navier--Stokes equations;
                 parallel; Parallel computer; parallel computer;
                 parallel machines; Parallel multiblock
                 Euler/Navier--Stokes solver; parallel multiblock
                 Euler/Navier--Stokes solver; Parallel Virtual Machine
                 communication software; Parallelised; parallelised;
                 physics computing; Processors; processors; programming;
                 PVM; resource allocation; Run-time; run-time; solver;
                 Solver porting; solver porting; Unix; Unix workstation
                 cluster; Unix workstation cluster environment;
                 workstation; Workstation cluster",
  pubcountry =   "Germany",
  thesaurus =    "Navier--Stokes equations; Parallel machines; Parallel
                 programming; Physics computing; Resource allocation;
                 Unix",
  treatment =    "P Practical",
}

@article{Iwashita:1994:IPE,
  author = {S. Iwashita and K. Murakami},
  title = {Implementation and performances evaluation of {KU
    PVM3\slash AP1000}},
  journal = j-ENG-SCI-REP-KYUSHU,
  volume = {16},
  number = {3},
  pages = {345--352},
  month = dec,
  year = {1994},
  coden = {SRKHEK},
  issn = {0388-1717},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5440 (Multiprocessing systems); C5470 (Performance
    evaluation and testing); C6110P (Parallel programming);
    C6150N (Distributed systems software)},
  fjournal = {Engineering Sciences Reports, Kyushu University},
  keywords = {Basic communication intensive benchmarks; KU
    PVM3/AP1000; Network configurations; Parallel computer;
    Parallel programming library; Performance evaluation;
    Virtual workstation cluster; Workstation clusters},
  language = {Japanese},
  pubcountry = {Japan},
  thesaurus = {Parallel machines; Parallel programming; Performance
    evaluation; Software libraries; Software performance
    evaluation},
}

@inproceedings{Joubert:1994:PAL,
  author = {A. Joubert},
  title = {Parallel algorithms for linear and nonlinear equations
    derived from networks},
  crossref = {Joubert:1994:PCT},
  pages = {145--152},
  year = {1994},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {London Parallel Applications Centre, Queen Mary and
    Westfield Coll., UK},
  classification = {C4140 (Linear algebra); C4150 (Nonlinear and
    functional equations); C4240P (Parallel programming and
    algorithm theory)},
  keywords = {Linear equations; Load flow; Nonlinear equations;
    Power systems},
  thesaurus = {Graph theory; Linear algebra; Network analysis;
    Nonlinear equations; Parallel algorithms},
}

@InProceedings{Judd:1994:PIV,
  author =       "D. Judd and N. K. Ratha and P. K. McKinley and J. Weng
                 and A. K. Jain",
  title =        "Parallel implementation of vision algorithms on
                 workstation clusters",
  crossref =     "IEEE:1994:PIF",
  volume =       "3",
  pages =        "317--321",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Michigan State Univ., East
                 Lansing, MI, USA",
  classification = "B6140C (Optical information, image and video signal
                 processing); C1220 (Simulation, modelling and
                 identification); C5260B (Computer vision and image
                 processing techniques); C6110P (Parallel programming)",
  keywords =     "Distributed cluster platforms; Motion parameter
                 estimation algorithm; Sequential CLUSTER program;
                 Square-error data clustering method; Vision algorithms;
                 Workstation clusters",
  thesaurus =    "Computer vision; Parallel algorithms; Parameter
                 estimation",
}

@Article{Karamcheti:1994:SOM,
  author =       "Vijay Karamcheti and Andrew A. Chien",
  title =        "Software overhead in messaging layers: where does the
                 time go?",
  journal =      j-SIGPLAN,
  volume =       "29",
  number =       "11",
  pages =        "51--60",
  month =        nov,
  year =         "1994",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat May 1 15:50:17 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p51-karamcheti/",
  abstract =     "Despite improvements in network interfaces and
                 software messaging layers, software communication
                 overhead still dominates the hardware routing cost in
                 most systems. In this study, we identify the sources of
                 this overhead by analyzing software costs of typical
                 communication protocols built atop the active messages
                 layer on the CM-5. We show that up to 50--70\% of the
                 software messaging costs are a direct consequence of
                 the gap between specific network features such as
                 arbitrary delivery order, finite buffering, and limited
                 fault-handling, and the user communication requirements
                 of in-order delivery, end-to-end flow control, and
                 reliable transmission. However, virtually all of these
                 costs can be eliminated if routing networks provide
                 higher-level services such as in-order delivery,
                 end-to-end flow control, and packet-level
                 fault-tolerance. We conclude that significant cost
                 reductions require changing the constraints on
                 messaging layers: we propose designing networks and
                 network interfaces which simplify or replace software
                 for implementing user communication requirements.",
  acknowledgement = ack-nhfb,
  classification = "B6150M (Protocols); B6210L (Computer
                 communications); C5440 (Multiprocessing systems);
                 C5610N (Network interfaces); C5640 (Protocols); C6150N
                 (Distributed systems software)",
  conflocation = "San Jose, CA, USA; 4-7 Oct. 1994",
  conftitle =    "Sixth International Conference on Architectural
                 Support for Programming Languages and Operating Systems
                 (ASPLOS-VI)",
  corpsource =   "Dept. of Comput. Sci., Illinois Univ., Urbana, IL,
                 USA",
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "active messages layer; arbitrary delivery order finite
                 buffering; CM-5; communication protocols; cost
                 reductions; cost reductions packet level fault
                 tolerance; design; end-to-end flow control; hardware
                 routing cost; in-order delivery; limited fault
                 handling; measurement; message passing; messaging
                 layers; network features; network interfaces; packet
                 level fault tolerance; parallel machines; performance;
                 protocols; reliable transmission; software
                 communications overhead; software messaging costs;
                 software messaging layers; software overhead;
                 standardization; theory; user communication
                 requirements",
  sponsororg =   "ACM; IEEE Comput. Soc",
  subject =      "{\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Message sending. {\bf C.2.2}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Protocols. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS. {\bf C.2.1}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Architecture and Design. {\bf C.1.2}
                 Computer Systems Organization, PROCESSOR ARCHITECTURES,
                 Multiple Data Stream Architectures (Multiprocessors),
                 Parallel processors**.",
  treatment =    "P Practical",
}

@inproceedings{Karrels:1994:PAM,
  author = {E. Karrels and E. Lusk},
  title = {Performance Analysis of {MPI} Programs},
  crossref = {Dongarra:1994:PSW},
  pages = {195--200},
  year = {1994},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6115 (Programming support); C6150E (General utility
    programs); C6150G (Diagnostic, testing, debugging and
    evaluating systems); C6150N (Distributed systems
    software)},
  conftitle = {Proceedings of the Second Workshop on Environments and
    Tools for Parallel Scientific Computing},
  corpsource = {Dept. of Comput. Sci., Wisconsin Univ., Oshkosh, WI,
    USA},
  keywords = {application program interfaces; functions library;
    message passing; Message Passing Interface; MPI
    programs; parallel computation; parallel programming;
    performance analysis; portable publicly available
    implementation; profiling interface; profiling
    libraries; profiling tools; software libraries;
    software performance evaluation; specification},
  treatment = {P Practical},
}

@inproceedings{Knies:1994:SLL,
  author = {A. D. Knies and F. R. Barriuso and W. J. Harrod and G.
    B. {Adams, III}},
  title = {{SLICC}: a low latency interface for collective
    communications},
  crossref = {IEEE:1994:PSW},
  pages = {89--96},
  year = {1994},
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Purdue Univ., West Lafayette, IN, USA},
  classification = {C5220P (Parallel architecture); C6150N (Distributed
    systems software)},
  keywords = {Cray T3D; Directly memory access; Interprocessor
    communications; Low latency interface; Low-level
    collective communications interface; Parallel
    computers; Performance results; PVM; Referenced
    processing element; Shared address-space library
    interface; Shared distributed memory systems; SLICC;
    Software models},
  thesaurus = {Application program interfaces; Cray computers;
    Distributed memory systems; Message passing; Shared
    memory systems; Software libraries; Software
    performance evaluation},
}

@InProceedings{Konuru:1994:ULP,
  author =       "R. Konuru and J. Casas and R. Prouty and S. Otto and
                 J. Walpole",
  title =        "A user-level process package for {PVM}",
  crossref =     "Pierce:1994:PSH",
  pages =        "48--55",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5440 (Multiprocessor systems and techniques); C6110P
                 (Parallel programming); C6150J (Operating systems);
                 C6150N (Distributed systems)",
  corpsource =   "Dept. of Comput. Sci. and Eng., Oregon Graduate Inst.
                 of Sci. and Technol., Beaverton, OR, USA",
  keywords =     "dynamic load balancing; lightweight; message passing;
                 message-based; operating systems (computers); parallel;
                 parallel programming; parallel programs; performance
                 evaluation; processor; programming; PVM; resource
                 allocation; source-code compatible PVM interface;
                 SPMD-style PVM applications; standard PVM; UPVM;
                 user-level process package; virtual processors;
                 virtualization",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Supercomput.
                 Appl",
  treatment =    "T Theoretical or Mathematical",
  xxnote =       "Same paper as entry Konuru:1994:UPP (identical
                 authors, title, pages, and crossref); both keys are
                 retained so that existing citations of either one
                 still resolve.",
}

@InProceedings{Konuru:1994:UPP,
  author =       "R. Konuru and J. Casas and R. Prouty and S. Otto and
                 J. Walpole",
  title =        "A User-Level Process Package for {PVM}",
  crossref =     "Pierce:1994:PSH",
  pages =        "48--55",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci. and Eng., Oregon Graduate Inst.
                 of Sci. and Technol., Beaverton, OR, USA",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5440 (Multiprocessor systems and techniques); C6110P
                 (Parallel programming); C6150J (Operating systems);
                 C6150N (Distributed systems)",
  keywords =     "Dynamic load balancing; Lightweight virtual
                 processors; Message-based parallel programs; Parallel
                 programming; Processor virtualization; PVM; Source-code
                 compatible PVM interface; SPMD-style PVM applications;
                 Standard PVM; UPVM; User-level process package; Virtual
                 processors",
  thesaurus =    "Message passing; Operating systems [computers];
                 Parallel programming; Performance evaluation; Resource
                 allocation",
  xxnote =       "Check author order. Same paper as entry
                 Konuru:1994:ULP (identical authors, title, pages, and
                 crossref); both keys are retained so that existing
                 citations of either one still resolve.",
}

%%% Conference paper on the TRAPPER graphical programming environment;
%%% proceedings data come from crossref Becks:1994:NCT.  The citation
%%% key deliberately keeps the unaccented form of the first surname.
@InProceedings{Kramer-Fuhrmann:1994:TGP,
  author =       "O. Kr{\"a}mer-Fuhrmann and L. Sch{\"a}fers and C.
                 Scheidler",
  title =        "{TRAPPER} --- a graphical programming environment for
                 parallel systems",
  crossref =     "Becks:1994:NCT",
  pages =        "3--15",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "German Nat. Res. Center for Comput. Sci., St.
                 Augustin, Germany",
  classification = "C5440 (Multiprocessing systems); C6115 (Programming
                 support); C6130B (Graphics techniques); C6180G
                 (Graphical user interfaces)",
  keywords =     "Communicating sequential processes; Configtool;
                 Designtool; Graphical programming environment;
                 Graphical representation; Hybrid program development;
                 Machine independent message passing interfaces;
                 Parallel applications; Parallel Macros; Parallel
                 systems; Parallel Virtual Machine; PARMACS; Perftool;
                 Programming model; PVM; Sequential behavior; TRAPPER;
                 Vistool; Visualization",
  thesaurus =    "Communicating sequential processes; Graphical user
                 interfaces; Parallel processing; Programming
                 environments",
}

%%% Journal article on an emulator for SS7 networks; the keywords list
%%% Parallel Virtual Machine.  The DOI value is stored with its
%%% https://doi.org/ resolver prefix.
@Article{Lazar:1994:SRE,
  author =       "A. A. Lazar and K. H. Tseng and Koon Seng Lim and W.
                 Choe",
  title =        "A scalable and reusable emulator for evaluating the
                 performance of {SS7} networks",
  journal =      j-IEEE-J-SEL-AREAS-COMMUN,
  volume =       "12",
  number =       "3",
  pages =        "395--404",
  month =        apr,
  year =         "1994",
  CODEN =        "ISACEM",
  DOI =          "https://doi.org/10.1109/49.285300",
  ISSN =         "0733-8716 (print), 1558-0008 (electronic)",
  ISSN-L =       "0733-8716",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Electr. Eng., Columbia Univ., New York, NY,
                 USA",
  classification = "B6150C (Switching theory); B6210 (Telecommunication
                 applications); C5620 (Computer networks and
                 techniques); C5670 (Network performance); C7410F
                 (Communications)",
  fjournal =     "IEEE Journal on Selected Areas in Communications",
  keywords =     "ATM LAN; Emulator design; Engineering workstations;
                 Fault conditions; OSI Management Information Service
                 platform; OSIMIS; Parallel Virtual Machine; Performance
                 evaluation; Public domain software; Reusable emulator;
                 Scalable emulator; Singapore; SS7 networks; Unbalanced
                 loading conditions",
  thesaurus =    "Asynchronous transfer mode; Open systems; Performance
                 evaluation; Public domain software; Telecommunication
                 signalling; Telecommunications computing",
}

%%% Technical report on integrating Zipcode with PVM.  The institution
%%% value is built by concatenating a literal prefix with the inst-MSU
%%% string macro, which is defined outside this part of the file.
@TechReport{Lehman:1994:IZP,
  author =       "Li-wei Lehman",
  title =        "Integrating {Zipcode} and {PVM}: towards a
                 higher-level message-passing environment",
  type =         "Technical report",
  number =       "MSSU-EIRS-ERC 94-2",
  institution =  "Engineering Research Center for Computational Field
                 Simulation, " # inst-MSU,
  address =      inst-MSU:adr,
  pages =        "7",
  year =         "1994",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  keywords =     "PVM (Computer program); Telecommunications --- Message
                 processing; Zipcode (Computer program)",
}

%%% Conference paper on distributed network computing over local ATM
%%% networks; proceedings data come from crossref IEEE:1994:PSW.
@InProceedings{Lin:1994:DNC,
  author =       "Mengjou Lin and Jehwei Hsieh and D. H. C. Du and J. P.
                 Thomas and J. A. MacDonald",
  title =        "Distributed network computing over local {ATM}
                 networks",
  crossref =     "IEEE:1994:PSW",
  pages =        "154--163",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Minnesota Univ., Duluth, MN,
                 USA",
  classification = "C5620L (Local area networks); C6150E (General
                 utility programs); C6150J (Operating systems); C6150N
                 (Distributed systems software)",
  keywords =     "Application programming interfaces; ASX-100 ATM
                 Switch; Asynchronous transfer mode; BSD socket
                 programming interface; Communication protocol layer;
                 Distributed network computing; Distributed programming;
                 End-to-end communication; Fore Systems ATM API;
                 High-speed network standards; Local ATM networks;
                 Message passing library; Parallel matrix
                 multiplication; Parallel Virtual Machine; Performance
                 characteristics; Remote Procedure Call; Switch-based
                 high-speed local area networks; Workstations",
  thesaurus =    "Application program interfaces; Asynchronous transfer
                 mode; Local area networks; Matrix multiplication;
                 Message passing; Telecommunication standards",
}

%%% Conference paper on software reuse in the PCSC methodology; the
%%% keywords mention PVM C and Occam 2.  Proceedings data come from
%%% crossref deGloria:1994:TAS.
@InProceedings{Loh:1994:ISR,
  author =       "B. C. Loh and G. A. Manson",
  title =        "Incorporating software reuse into the {PCSC}
                 methodology",
  crossref =     "deGloria:1994:TAS",
  pages =        "929--941",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Sheffield Univ., UK",
  classification = "C6110B (Software engineering techniques); C6140D
                 (High level languages)",
  keywords =     "Abstract representations; Occam 2; Parallel
                 Communicating Sequential Code; PCSC methodology;
                 Programming languages; PVM C; Software component reuse;
                 Software reuse",
  thesaurus =    "Occam; Software reusability",
}

%%% Conference paper on migrating crash simulation to HPC systems;
%%% proceedings data come from crossref Dekker:1994:MPP.  Same author
%%% list as entry Lonsdale:1994:CRP.
@InProceedings{Lonsdale:1994:CMH,
  author =       "G. Lonsdale and J. Clinckemaillie and S. Vlachoutsis
                 and J. Dubois",
  title =        "Crash-simulation migration to {HPC} systems",
  crossref =     "Dekker:1994:MPP",
  pages =        "439--446",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "ESI GmbH, Eschborn, Germany",
  classification = "C4185 (Finite element analysis); C6110P (Parallel
                 programming); C7440 (Civil and mechanical engineering
                 computing); C7480 (Production engineering computing)",
  keywords =     "Algorithmic parallelization; Automatic
                 parallelization; Automobile crashworthiness simulation
                 program; Automobile testing; Body shell deformation
                 calculation; CAD; CAMAS; Car crash simulation; Computer
                 Aided Migration of Applications System; Digital
                 simulation; Distributed-memory; FAM; Finite element
                 model; High performance computing; MIMD; PAM-CRASH;
                 PAM-STAMP; Parallel programming; Software portability",
  thesaurus =    "Accidents; Automobile industry; Automobiles; CAD/CAM;
                 Computer aided engineering; Digital simulation; Finite
                 element analysis; Mechanical engineering computing;
                 Parallel programming; Product development; Safety;
                 Software portability; Testing",
}

%%% Conference paper on communication requirements in parallel
%%% crashworthiness simulation; proceedings data come from crossref
%%% Gentzsch:1994:HPC.
%%% NOTE(review): the keyword "PAM-CRASH industrial crashworthiness
%%% simulation program, PAM-CRASH" is joined by a comma and may really
%%% be two separate keywords --- verify against the source record.
@InProceedings{Lonsdale:1994:CRP,
  author =       "G. Lonsdale and J. Clinckemaillie and S. Vlachoutsis
                 and J. Dubois",
  title =        "Communication requirements in parallel crashworthiness
                 simulation",
  crossref =     "Gentzsch:1994:HPC",
  pages =        "55--61",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "ESI GmbH, Eschborn, Germany",
  classification = "C4185 (Finite element analysis); C5440
                 (Multiprocessing systems); C6150N (Distributed systems
                 software); C6185 (Simulation techniques); C7440 (Civil
                 and mechanical engineering computing)",
  keywords =     "Algorithmic features; Communication requirements;
                 Communications strategy design; Communications strategy
                 implementation; Distributed-memory MIMD machines;
                 Global communication; Message-passing; MPI standard;
                 Overhead minimisation; PAM-CRASH industrial
                 crashworthiness simulation program, PAM-CRASH; Parallel
                 crashworthiness simulation; Parallelization approach;
                 PARMACS; Portable message-passing interfaces; PVM",
  thesaurus =    "Application program interfaces; Digital simulation;
                 Distributed memory systems; Finite element analysis;
                 Message passing; Nonlinear dynamical systems; Parallel
                 processing; Structural engineering computing",
}

%%% Conference paper on system support for distributed computing; the
%%% keywords mention ELECTRA and PVM.  Proceedings data come from
%%% crossref Gentzsch:1994:HPC.
@InProceedings{Maffeis:1994:SSD,
  author =       "S. Maffeis",
  title =        "System support for distributed computing",
  crossref =     "Gentzsch:1994:HPC",
  pages =        "293--301",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Zurich Univ., Switzerland",
  classification = "C6110J (Object-oriented programming); C6115
                 (Programming support); C6150N (Distributed systems
                 software)",
  keywords =     "Distributed computing; Distributed failure-resilient
                 applications; Distributed parallel computing; ELECTRA;
                 LINDA systems; Object-groups; Object-oriented
                 communication; Object-oriented programming; PVM;
                 Reliable multicast; Toolkit; Transputer system",
  thesaurus =    "Distributed processing; Object-oriented programming;
                 Software fault tolerance; Software tools",
}

%%% Conference paper on performance analysis of the pC++ data-parallel
%%% programming system; proceedings data come from crossref
%%% Siegal:1994:PEI.
@InProceedings{Malony:1994:PAP,
  author =       "A. Malony and B. Mohr and P. Beckman and D. Gannon and
                 S. Yang and F. Bodin",
  title =        "Performance analysis of {pC++}: a portable
                 data-parallel programming system for scalable parallel
                 computers",
  crossref =     "Siegal:1994:PEI",
  pages =        "75--84",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. and Inf. Sci., Oregon Univ., Eugene,
                 OR, USA",
  classification = "C5440 (Multiprocessor systems and techniques);
                 C6110P (Parallel programming); C6120 (File
                 organisation); C6140D (High level languages); C6150G
                 (Diagnostic, testing, debugging and evaluating
                 systems)",
  keywords =     "C++ language extension; Concurrent aggregate
                 collection classes; Distributed data structures; Embar;
                 Fast Poisson solver; Memory hierarchy; NAS suite;
                 Nearest neighbor grid computation; Parallel execution
                 semantics; Parallel machine; PC++; Performance
                 analysis; Performance tools; Portable data-parallel
                 programming system; Scalability measurements; Scalable
                 parallel computers; Sparse codes",
  thesaurus =    "C language; Data structures; Parallel languages;
                 Parallel machines; Parallel programming; Performance
                 evaluation; Program testing; Software portability",
}

%%% Master's thesis on the design and implementation of PVM version 3.
%%% The school and address values are the inst-UTK string macros, which
%%% are defined outside this part of the file.
@MastersThesis{Manchek:1994:DIP,
  author =       "Robert J. Manchek",
  title =        "Design and implementation of {PVM} version 3",
  type =         "M.S. thesis",
  school =       inst-UTK,
  address =      inst-UTK:adr,
  pages =        "viii + 81",
  year =         "1994",
  bibdate =      "Mon Jan 15 18:16:58 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer networks; Computer software; Parallel
                 computers",
}

%%% Conference paper on genetic algorithms run with PVM on LAN-based
%%% message-passing architectures, applied to a routing problem;
%%% proceedings data come from crossref Davidor:1994:PPS.
%%% NOTE(review): the pages field records two candidate end pages
%%% (545 or 543) --- confirm against the published volume.
@InProceedings{Marin:1994:GAL,
  author =       "F. J. Marin and O. Trelles-Salazar and F. Sandoval",
  title =        "Genetic Algorithms on {LAN}-Message Passing
                 Architectures Using {PVM}: Application to the Routing
                 Problem",
  crossref =     "Davidor:1994:PPS",
  pages =        "534--545 (or 534--543??)",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. de Arquitectura y Tecnologia de Computadores y
                 Electronica, Malaga Univ., Spain",
  classification = "B6150P (Communication network design and planning);
                 B6210L (Computer communications); C1180 (Optimisation
                 techniques); C4240C (Computational complexity); C4240P
                 (Parallel programming and algorithm theory); C5220P
                 (Parallel architecture); C5620L (Local area networks);
                 C6115 (Programming support); C6150N (Distributed
                 systems software); C7410F (Communications computing);
                 C7430 (Computer engineering)",
  corpsource =   "Dept. de Arquitectura y Tecnologia de Computadores y
                 Electronica, Malaga Univ., Spain",
  keywords =     "allocation; area networks; combinatorial;
                 Combinatorial complexity; communication; Communication
                 latency; complexity; computational complexity; computer
                 architectures; computing; data communication; data
                 communications; Data communications; data-passing load;
                 Data-passing load; dynamic load balancing; Dynamic load
                 balancing; fault; fault tolerant; Fault tolerant
                 capabilities; genetic algorithm parallelization;
                 Genetic algorithm parallelization; genetic algorithms;
                 independent functions; Independent functions;
                 information; Information redistribution; integrated
                 software; integration tool; LAN-based message passing;
                 LAN-based message passing computer architectures;
                 latency; local; master node; Master node; message
                 passing; optimization problem; Optimization problem;
                 Parallel; parallel algorithms; parallel architectures;
                 Parallel Virtual Machine; partial results reporting;
                 Partial results reporting; public domain software;
                 Public domain software; PVM 3.1; redistribution;
                 resource; routing problem; Routing problem; server
                 processors; Server processors; sockets; Sockets;
                 software; Software integration tool; software tools;
                 telecommunication computing; telecommunication network
                 routing; tolerant capabilities; Virtual Machine;
                 virtual machines",
  pubcountry =   "Germany",
  thesaurus =    "Computational complexity; Data communication; Fault
                 tolerant computing; Genetic algorithms; Integrated
                 software; Local area networks; Message passing;
                 Parallel algorithms; Parallel architectures; Public
                 domain software; Resource allocation; Software tools;
                 Telecommunication computing; Telecommunication network
                 routing; Virtual machines",
  treatment =    "P Practical",
}

%%% Conference paper comparing the CPS, Linda, P4, PVM, POSYBL, and
%%% TCGMSG programming environments; proceedings data come from
%%% crossref Hesham:1994:PTS (volume II).
%%% NOTE(review): several keywords look like fragments of longer phrases
%%% split at semicolons (e.g. "four-" and "node communications tests");
%%% they are left as recorded.
@InProceedings{Mattson:1994:PEP,
  author =       "T. G. Mattson",
  title =        "Programming Environments for Parallel Computing: a
                 Comparison of {CPS}, {Linda}, {P4}, {PVM}, {POSYBL}, and
                 {TCGMSG}",
  crossref =     "Hesham:1994:PTS",
  volume =       "II",
  pages =        "586--594",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Intel Sci. Comput., Beaverton, OR, USA",
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support)",
  corpsource =   "Intel Sci. Comput., Beaverton, OR, USA",
  keywords =     "Communication times; communication times; Cooperative
                 Processes Software; CPS; Environment utility;
                 environment utility; environments; Ethernet network;
                 four-; Four-node communications tests; Linda; node
                 communications tests; P4; parallel programming;
                 performance evaluation; Portable parallel programming
                 environments; portable parallel programming
                 environments; POSYBL; programming; PVM;
                 Reproducibility; reproducibility; software portability;
                 SPARCstation 1; SPARCstation 1 workstations; TCGMSG;
                 Theoretical Chemistry Group Message-passing system;
                 Two-node communication benchmarks; two-node
                 communication benchmarks; workstations",
  sponsororg =   "IEEE; ACM; Univ. Hawaii; Univ. Hawaii Coll. Bus.
                 Admin",
  thesaurus =    "Parallel programming; Performance evaluation;
                 Programming environments; Software portability",
  treatment =    "P Practical; X Experimental",
}

%%% Journal article on D-CICADA, software for conformational potential
%%% energy surface elucidation on a network of workstations; the
%%% keywords list Parallel virtual machine.  The classification code
%%% A3520B has a description that itself contains a semicolon.
@Article{Matyska:1994:DCS,
  author =       "Lud{\v{e}}k Matyska and Jaroslav Ko{\v{c}}a",
  title =        "{D-CICADA}: a software for conformational {PES}
                 elucidation on network of workstations",
  journal =      j-J-COMPUT-CHEM,
  volume =       "15",
  number =       "9",
  pages =        "937--946",
  month =        sep,
  year =         "1994",
  CODEN =        "JCCHDD",
  DOI =          "https://doi.org/10.1002/jcc.540150904",
  ISSN =         "0192-8651 (print), 1096-987X (electronic)",
  ISSN-L =       "0192-8651",
  bibdate =      "Thu Nov 29 14:54:27 MST 2012",
  bibsource =    "http://www.interscience.wiley.com/jpages/0192-8651;
                 http://www.math.utah.edu/pub/tex/bib/jcomputchem1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. of Comput. Sci., Masaryk Univ., Brno, Czech
                 Republic",
  classification = "A3115 (General mathematical and computational
                 developments for atoms and molecules); A3190 (Other
                 topics in the theory of atoms and molecules); A3520B
                 (General molecular conformation and symmetry;
                 stereochemistry); C6110P (Parallel programming); C7320
                 (Physics and chemistry computing)",
  fjournal =     "Journal of Computational Chemistry",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1096-987X",
  keywords =     "CICADA; Conformational potential energy hypersurface;
                 Cyclohexane; D-CICADA software; DEC workstations;
                 Distributed environment; Parallel virtual machine;
                 Parallelization; Polynomial time; Sun workstations;
                 Terminally blocked alanine; Virtual machines",
  onlinedate =   "7 Sep 2004",
  thesaurus =    "Organic compounds; Organic molecule configurations;
                 Parallel programming; Physics computing; Potential
                 energy curves and surfaces of molecules; Virtual
                 machines",
}

%%% Conference paper on CRANIUM, a message-passing interface for
%%% adaptive packet routing networks; proceedings data come from
%%% crossref Bolding:1994:PCR.
@InProceedings{McKenzie:1994:CIM,
  author =       "N. R. McKenzie and K. Bolding and C. Ebeling and L.
                 Snyder",
  title =        "{CRANIUM}: An Interface for Message Passing on
                 Adaptive Packet Routing Networks",
  crossref =     "Bolding:1994:PCR",
  pages =        "266--280",
  year =         "1994",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "B6150C (Communication switching); B6210L (Computer
                 communications); C4230M (Multiprocessor
                 interconnection); C5220P (Parallel architecture);
                 C5610N (Network interfaces); C5620 (Computer networks
                 and techniques)",
  conftitle =    "Parallel Computer Routing and Communication. First
                 International Workshop, PCRCW '94",
  corpsource =   "Dept. of Comput. Sci. and Eng., Washington Univ.,
                 Seattle, WA, USA",
  keywords =     "adaptive packet routing networks; arbitrary sequence;
                 automatic-receive interface; buffer addresses;
                 Cranium; interconnection network; message passing;
                 multiprocessor interconnection networks; network
                 interface; network interfaces; packet serialization;
                 packet switching; physical node identifiers;
                 processor-initiated interface; processor-network
                 interface; user-level programs",
  treatment =    "P Practical",
}

%%% Journal article (conference summary, per the conftitle field) on
%%% using MCNP with PVM.
%%% NOTE(review): number and month hold the "????" placeholder, so the
%%% issue number and month are still unknown.
@Article{McKinney:1994:PGU,
  author =       "G. W. McKinney",
  title =        "A practical guide to using {MCNP} with {PVM}",
  journal =      j-TRANS-AM-NUCL-SOC,
  volume =       "71",
  number =       "????",
  pages =        "397--398",
  month =        "????",
  year =         "1994",
  CODEN =        "TANSAO",
  ISSN =         "0003-018X",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Los Alamos Nat. Lab., NM, USA",
  classification = "A2880F (Radiation monitoring and radiation
                 protection); C6150N (Distributed systems software);
                 C7470 (Nuclear engineering computing)",
  conflocation = "Washington, DC, USA; 13-17 Nov. 1994",
  conftitle =    "1994 Winter Meeting of American Nuclear Society
                 (papers in summary form only received)",
  corpsource =   "Los Alamos Nat. Lab., NM, USA",
  fjournal =     "Transactions of the American Nuclear Society",
  keywords =     "distributed memory systems; distributed-memory
                 multiprocessing; Distributed-memory multiprocessing;
                 engineering computing; MCNP; Monte Carlo methods;
                 nuclear; PVM; radiation protection",
  thesaurus =    "Distributed memory systems; Monte Carlo methods;
                 Nuclear engineering computing; Radiation protection",
  treatment =    "P Practical",
}

%%% Conference paper on the Paradyn parallel performance tools and PVM;
%%% proceedings data come from crossref Dongarra:1994:PSW.
%%% NOTE(review): entry Miller:1994:PPT has the same crossref and page
%%% range, so the two entries may describe the same paper --- verify.
%%% Some keywords look like fragments split at semicolons (e.g. "long-"
%%% and "running applications"); they are left as recorded.
@InProceedings{Miller:1994:PPP,
  author =       "B. P. Miller and J. K. Hollingsworth and M. D.
                 Callaghan",
  title =        "The {Paradyn} parallel performance tools and {PVM}",
  crossref =     "Dongarra:1994:PSW",
  pages =        "201--210",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6115 (Programming support); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C6150N
                 (Distributed systems software)",
  corpsource =   "Wisconsin Univ., Madison, WI, USA",
  keywords =     "applications; automated bottleneck searching; dynamic;
                 heterogeneous program measurement; instrumentation;
                 large-scale parallel applications; long-; machines;
                 native PVM; Paradyn; parallel performance tools;
                 parallel programming; performance problem causes;
                 production-sized data sets; program diagnostics;
                 running applications; software metrics; software
                 performance evaluation; software tools; Sun; Thinking
                 Machine CM-5; virtual; workstations",
  treatment =    "P Practical",
}

%%% Conference paper on the Paradyn performance tools and PVM;
%%% proceedings data come from crossref Dongarra:1994:PSW.
%%% NOTE(review): entry Miller:1994:PPP has the same crossref and page
%%% range, so this may be a second record of the same paper --- verify
%%% before merging, because both citation keys may be in use.
@InProceedings{Miller:1994:PPT,
  author =       "B. P. Miller and J. K. Hollingsworth and M. D.
                 Callaghan",
  title =        "The {Paradyn} Performance Tools and {PVM}",
  crossref =     "Dongarra:1994:PSW",
  pages =        "201--210",
  year =         "1994",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% Master's thesis on load balancing the heat equation in a
%%% heterogeneous environment with PVM.  School and address are given
%%% as literal strings rather than string macros.
@MastersThesis{Nemer-Preece:1994:LBH,
  author =       "Nicole Anne Nemer-Preece",
  title =        "Load balancing the heat equation in a heterogeneous
                 environment with {PVM}",
  type =         "M.S. thesis",
  school =       "University of Missouri, Rolla",
  address =      "Rolla, MO, USA",
  pages =        "viii + 52",
  year =         "1994",
  bibdate =      "Mon Jan 15 18:17:04 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% Conference paper on PVM-based software for parallel computation in
%%% computer algebra; proceedings data come from crossref
%%% Calmet:1994:RWC.
@InProceedings{Neun:1994:UPB,
  author =       "W. Neun",
  title =        "Using {PVM} based software for parallel computation in
                 Computer Algebra",
  crossref =     "Calmet:1994:RWC",
  pages =        "46--51",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Konrad-Zuse-Zentrum f{\"u}r Informationstech. Berlin,
                 Germany",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C7310 (Mathematics computing)",
  corpsource =   "Konrad-Zuse-Zentrum f{\"u}r Informationstech. Berlin,
                 Germany",
  keywords =     "computer algebra; Computer algebra; manipulation;
                 mathematics computing; network topology; Network
                 topology; parallel algorithms; Parallel algorithms;
                 parallel computation; Parallel computation; parallel
                 machines; Parallel machines; performance benefit;
                 Performance benefit; PVM based software; symbol",
  pubcountry =   "Germany",
  sponsororg =   "Univ. Karlsruhe",
  thesaurus =    "Mathematics computing; Parallel algorithms; Symbol
                 manipulation",
  treatment =    "P Practical",
}

%%% Conference paper on distributed computation of electromagnetic
%%% scattering problems with finite-difference time-domain
%%% decompositions; PVM is listed in the keywords.  Proceedings data
%%% come from crossref IEEE:1994:PTI.
@InProceedings{Nguyen:1994:DCE,
  author =       "S. T. Nguyen and B. J. Zook and Xiaodong Zhang",
  title =        "Distributed computation of electromagnetic scattering
                 problems using finite-difference time-domain
                 decompositions",
  crossref =     "IEEE:1994:PTI",
  pages =        "85--93",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Instrum. and Space Res., Southwest Res.
                 Inst., San Antonio, TX, USA",
  classification = "A0260 (Numerical approximation and analysis); A4110H
                 (Electromagnetic waves: theory); B0290P (Differential
                 equations); B5210 (Electromagnetic wave propagation);
                 C4170 (Differential equations); C4240P (Parallel
                 programming and algorithm theory); C7320 (Physics and
                 chemistry computing)",
  keywords =     "Communication pattern variations; Computing
                 performance; Distributed computation; Distributed
                 memory; Distributed workstation network;
                 Electromagnetic scattering problems; Finite-difference
                 time-domain decompositions; Load balancing; Numerical
                 method; Parallelism; Partial differential equations;
                 PVM; Scalability",
  thesaurus =    "Distributed algorithms; Distributed memory systems;
                 Electromagnetic wave scattering; Finite difference
                 time-domain analysis; Partial differential equations;
                 Physics computing",
}

%%% Conference paper on solving ordinary differential equations on
%%% parallel computers, applied to rolling-bearing simulation; PVM is
%%% listed in the keywords.  Proceedings data come from crossref
%%% Dongarra:1994:PSC.
@InProceedings{Nordling:1994:SOD,
  author =       "P. Nordling and P. Fritzson",
  title =        "Solving ordinary differential equations on parallel
                 computers --- applied to dynamic rolling bearings
                 simulation",
  crossref =     "Dongarra:1994:PSC",
  pages =        "397--415",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. and Inf. Sci., Link{\"o}ping Univ.,
                 Sweden",
  classification = "C4170 (Differential equations); C6150N (Distributed
                 systems software); C6185 (Simulation techniques); C7440
                 (Civil and mechanical engineering computing)",
  keywords =     "Dynamic rolling bearings simulation; Ethernet;
                 Fine-grained synchronization; Initial value problems;
                 LSODA; MIMD parallel computers; Ordinary differential
                 equation solution; Parallelism; PARIX operating system;
                 Parsytec GigaCube; PVM; Solaris 2.3; SPARC 10
                 workstation cluster; Speedup; Sun SPARCcenter 2000",
  thesaurus =    "Differential equations; Digital simulation; Initial
                 value problems; Machine bearings; Mechanical
                 engineering computing; Parallel processing",
}

%%% Conference paper on processor virtualization and migration for PVM;
%%% proceedings data come from crossref Dongarra:1994:PSW.
%%% NOTE(review): some keywords look like fragments split at semicolons
%%% (e.g. "Parallel Virtual" and "Machine"); they are left as recorded.
@InProceedings{Otto:1994:PVM,
  author =       "S. W. Otto",
  title =        "Processor Virtualization and Migration for {PVM}",
  crossref =     "Dongarra:1994:PSW",
  pages =        "66--75",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C6150N (Distributed
                 systems software); C7430 (Computer engineering)",
  corpsource =   "Dept. of Comput. Sci. and Eng., Oregon Graduate Inst.
                 of Sci. and Technol., Portland, OR, USA",
  keywords =     "context switch; distributed memory systems;
                 distributed scheduling systems; interoperability; local
                 communication speeds; Machine; machines; message
                 passing; Migratable PVM; multi; parallel; Parallel
                 Virtual; performance figures; process granularity;
                 process level MPVM; processor virtualization;
                 programming model; run realistic applications; semantic
                 restrictions; threaded version; times; transparent
                 migration; transparent migration mechanisms; virtual
                 machines; work migration",
  treatment =    "P Practical",
}

%%% Journal article on the completed double layer boundary element
%%% method, with distributed computing through PVM.
%%% NOTE(review): the electronic ISSN and the fjournal value were added
%%% from the journal's public record, not from this file --- confirm.
@Article{Phan-Thien:1994:CDL,
  author =       "N. Phan-Thien and D. Tullock",
  title =        "Completed double layer boundary element method in
                 elasticity and {Stokes} flow: Distributed computing
                 through {PVM}",
  journal =      j-COMP-MECH,
  volume =       "14",
  number =       "4",
  pages =        "370--383",
  month =        jul,
  year =         "1994",
  CODEN =        "CMMEEE",
  ISSN =         "0178-7675 (print), 1432-0924 (electronic)",
  ISSN-L =       "0178-7675",
  bibdate =      "Sat Apr 06 15:05:19 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computational Mechanics",
}

@Article{Pierce:1994:NMP,
  author =       "P. Pierce",
  title =        "The {NX} message passing interface",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "20",
  number =       "4",
  pages =        "463--480",
  month =        apr,
  year =         "1994",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessor systems and techniques);
                 C6110P (Parallel programming); C6115 (Programming
                 support); C6150N (Distributed systems)",
  corpsource =   "Intel Corp., Beaverton, OR, USA",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  keywords =     "communication model; design tradeoffs; distributed
                 memory systems; high performance; high performance
                 interface; Intel multicomputers; massively parallel
                 distributed memory supercomputers; message passing;
                 multicomputer message passing; NX interface; NX message
                 passing interface; parallel applications; parallel
                 programming; performance; programming environments;
                 typed send/receive model; usability; vendor-supplied
                 programming interface",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
}

@InProceedings{Pierce:1994:PIN,
  author          = {P. Pierce and G. Regnier},
  title           = {The {Paragon} implementation of the {NX} message
                     passing interface},
  crossref        = {Pierce:1994:PSH},
  pages           = {184--190},
  year            = {1994},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C5220P (Parallel architecture); C5440
                     (Multiprocessor systems and techniques); C6115
                     (Programming support); C6150J (Operating systems);
                     C6150N (Distributed systems)},
  conftitle       = {Proceedings of IEEE Scalable High Performance
                     Computing Conference},
  corpsource      = {Intel Supercomput. Syst. Div., Beaverton, OR, USA},
  keywords        = {hardware; high performance message passing; message
                     passing; message passing design; message passing
                     interface; NX; operating system; operating systems
                     (computers); OSF/1; Paragon; parallel architectures;
                     parallel machines; parallel supercomputer; performance;
                     programming environment; programming environments},
  sponsororg      = {IEEE Comput. Soc. Tech. Committee on Supercomput.
                     Appl},
  treatment       = {P Practical; T Theoretical or Mathematical},
}

@InProceedings{Pozo:1994:FTE,
  author          = {R. Pozo and K. Remington},
  title           = {Fast three-dimensional elliptic solvers on distributed
                     network clusters},
  crossref        = {Joubert:1994:PCT},
  pages           = {201--208},
  year            = {1994},
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Comput. Sci., Tennessee Univ., Knoxville, TN,
                     USA},
  classification  = {C4130 (Interpolation and function approximation);
                     C4170 (Differential equations); C4240P (Parallel
                     programming and algorithm theory)},
  keywords        = {Distributed network clusters; Elliptic solvers;
                     Object-oriented message passing interface; Parallel
                     architecture; Parallel architectures; Spline
                     collocation},
  thesaurus       = {Distributed algorithms; Elliptic equations; Splines
                     [mathematics]},
}

@InProceedings{Puthukattukaran:1994:DIP,
  author          = {J. Puthukattukaran and S. Chalasani and P. Senapathy},
  title           = {Design and implementation of parallel algorithms for
                     gene-finding},
  crossref        = {IEEE:1994:PTI},
  pages           = {186--193},
  year            = {1994},
  bibdate         = {Sun Dec 22 10:18:08 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Electr. and Comput. Eng., Wisconsin Univ.,
                     Madison, WI, USA},
  classification  = {C4240P (Parallel programming and algorithm theory);
                     C5440 (Multiprocessing systems); C7330 (Biology and
                     medical computing)},
  keywords        = {CM-5 multicomputer; DNA sequences; Gene-finding; HP
                     Apollo workstations; Human Genome project; Parallel
                     algorithm; Parallel algorithm design; Parallel
                     gene-finding algorithm; Parallel Virtual Machine; PVM;
                     Serial algorithm; Software package},
  thesaurus       = {Biology computing; Cellular biophysics; DNA; Parallel
                     algorithms; Parallel machines},
}

@Article{Reale:1994:PCU,
  author =       "F. Reale and F. Bocchino and S. Sciortino",
  title =        "Parallel computing on {Unix} workstation arrays",
  journal =      j-COMP-PHYS-COMM,
  volume =       "83",
  number =       "2--3",
  pages =        "130--140",
  month =        dec,
  year =         "1994",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Istituto e Osservatorie Astron., Palermo, Italy",
  classification = "A9575P (Mathematical and computer techniques in
                 astronomy); C5620L (Local area networks); C6110P
                 (Parallel programming); C6115 (Programming support);
                 C6150J (Operating systems); C6150N (Distributed systems
                 software); C7350 (Astronomy and astrophysics
                 computing)",
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
  keywords =     "2D hydrodynamic code; Alpha processors; Astrophysical
                 flows; Data-domain decomposition; DECstations 3000/400;
                 DECstations 5000/200; Dedicated MIMD parallel system;
                 Ethernet LAN; FDDI LAN; Intel i860 processors; Massive
                 parallel computations; Meiko Computing Surface; MIMD
                 systems; Network bandwidth; Nondedicated parallel
                 systems; Parallel computing; Parallelization library;
                 Processor power; PVM software toolset; Software; Unix
                 workstation arrays",
  pubcountry =   "Netherlands",
  thesaurus =    "Astronomy computing; Astrophysical fluid dynamics;
                 Local area networks; Message passing; Parallel
                 programming; Protocols; Software packages; Unix;
                 Workstations",
}

@InProceedings{Rolfe:1994:PAP,
  author          = {T. J. Rolfe},
  title           = {{PVM}: An Affordable Parallel Processing Environment},
  crossref        = {Anonymous:1994:SCC},
  pages           = {118--125},
  year            = {1994},
  bibdate         = {Thu Feb 29 17:59:11 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Saarinen:1994:EES,
  author          = {S. Saarinen},
  title           = {{EASYPVM} --- An Enhanced Subroutine Library for
                     {PVM}},
  crossref        = {Gentzsch:1994:HPC},
  volume          = {2},
  pages           = {267--272},
  year            = {1994},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Center for Sci. Comput., Espoo, Finland},
  classification  = {C5440 (Multiprocessing systems); C6110B (Software
                     engineering techniques); C6110P (Parallel programming);
                     C6150N (Distributed systems software); C7430 (Computer
                     engineering)},
  corpsource      = {Center for Sci. Comput., Espoo, Finland},
  keywords        = {approach; clear message passing programming; Clear
                     message passing programming approach; EASYPVM; Enhanced
                     subroutine library; enhanced subroutine library; global
                     communication; Global communication routines; library;
                     message passing; Message passing calls; message passing
                     calls; Message passing library; parallel machines;
                     parallel programming; Parallel virtual machine;
                     parallel virtual machine; PICL/ParaGraph message
                     tracing postprocessor; Process creation; process
                     creation; PVM; PVM message passing; PVM message passing
                     syntax; routines; software libraries; syntax; virtual
                     machines},
  pubcountry      = {Germany},
  thesaurus       = {Message passing; Parallel machines; Parallel
                     programming; Software libraries; Virtual machines},
  treatment       = {P Practical},
}

@InProceedings{Scales:1994:DES,
  author          = {D. J. Scales and M. S. Lam},
  title           = {The design and evaluation of a shared object system
                     for distributed memory machines},
  crossref        = {USENIX:1994:PFU},
  pages           = {101--114},
  year            = {1994},
  bibdate         = {Sun Dec 22 10:20:45 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Comput. Syst. Lab., Stanford Univ., CA, USA},
  classification  = {C5440 (Multiprocessing systems); C6120 (File
                     organisation); C6150N (Distributed systems software)},
  keywords        = {Automatic caching; CM-5; Data access; Data prefetch;
                     Distributed memory machines; Global name space; High
                     communication overheads; IBM SP1; Intel iPSC/860;
                     Paragon; Parallel algorithm; Performance; Portable
                     run-time system; Remote processors; SAM; Scientific
                     algorithms; Shared data; Shared object system;
                     Synchronization; System design; System evaluation;
                     Workstations},
  thesaurus       = {Cache storage; Distributed memory systems; Operating
                     systems [computers]; Parallel algorithms; Parallel
                     machines; Shared memory systems; Synchronisation},
}

@Article{Schmidt:1994:EAO,
  author          = {B. K. Schmidt and V. S. Sunderam},
  title           = {Empirical analysis of overheads in cluster
                     environments},
  journal         = j-CPE,
  volume          = {6},
  number          = {1},
  pages           = {1--32},
  month           = feb,
  year            = {1994},
  CODEN           = {CPEXEI},
  ISSN            = {1040-3108},
  ISSN-L          = {1040-3108},
  bibdate         = {Sun Dec 22 10:18:08 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
                     GA, USA},
  classification  = {C4230M (Multiprocessor interconnection); C5220P
                     (Parallel architecture); C5470 (Performance evaluation
                     and testing)},
  fjournal        = {Concurrency, practice and experience},
  keywords        = {Cluster environments; Communication delay; Concurrent
                     computing; Heterogeneous processing elements; Load
                     imbalance; Parallelism model; Partitioning strategies;
                     Performance; PVM network computing system; Throughput},
  pubcountry      = {UK},
  thesaurus       = {Multiprocessing systems; Multiprocessor
                     interconnection networks; Performance evaluation},
}

@InProceedings{Schmidt:1994:IAP,
  author          = {M. Schmidt and R. Hanisch},
  title           = {Implementation of an air pollution transport model on
                     parallel hardware},
  crossref        = {Dekker:1994:MPP},
  pages           = {277--284},
  year            = {1994},
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {GMD-FIRST, Berlin, Germany},
  classification  = {C5440 (Multiprocessing systems); C6150N (Distributed
                     systems software); C6185 (Simulation techniques); C7320
                     (Physics and chemistry computing)},
  keywords        = {Air pollution analysis; Air pollution transport model;
                     Berlin; Complex numerical models; Conurbations; MANNA
                     computer; Operational management; Parallel computer;
                     Parallel hardware; Programming interface; PVM; Run time
                     measurements; Simulation environment; Simulation
                     system; Smog situations; Urban planning},
  thesaurus       = {Air pollution; Digital simulation; Flow simulation;
                     Message passing; Parallel machines; Parallel
                     programming; Physics computing; Town and country
                     planning; Transport processes},
}

@TechReport{Schneenman:1994:DSS,
  author =       "Richard D. Schneeman",
  title =        "Distributed supercomputing software: experiences with
                 the parallel virtual machine --- {PVM}",
  number =       "NISTIR 5381",
  institution =  "U.S. Dept. of Commerce, National Institute of
                 Standards and Technology",
  address =      "Gaithersburg, MD, USA",
  pages =        "vi + 18",
  year =         "1994",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  annote =       "March 1994.",
}

@Article{Schoinas:1994:FGA,
  author          = {Ioannis Schoinas and Babak Falsafi and Alvin R. Lebeck
                     and Steven K. Reinhardt and James R. Larus and David A.
                     Wood},
  title           = {Fine-grain access control for distributed shared
                     memory},
  journal         = j-SIGPLAN,
  volume          = {29},
  number          = {11},
  pages           = {297--306},
  month           = nov,
  year            = {1994},
  CODEN           = {SINODQ},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  bibdate         = {Sat May 1 15:50:17 MDT 1999},
  bibsource       = {http://www.acm.org/pubs/toc/;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p297-schoinas/},
  abstract        = {This paper discusses implementations of fine-grain
                     memory access control, which selectively restricts
                     reads and writes to cache-block-sized memory regions.
                     Fine-grain access control forms the basis of efficient
                     cache-coherent shared memory. This paper focuses on
                     low-cost implementations that require little or no
                     additional hardware. These techniques permit efficient
                     implementation of shared memory on a wide range of
                     parallel systems, thereby providing shared-memory codes
                     with a portability previously limited to message
                     passing. This paper categorizes techniques based on
                     where access control is enforced and where access
                     conflicts are handled. We incorporated three techniques
                     that require no additional hardware into Blizzard, a
                     system that supports distributed shared memory on the
                     CM-5. The first adds a software lookup before each
                     shared-memory reference by modifying the program's
                     executable. The second uses the memory's error
                     correcting code (ECC) as cache-block valid bits. The
                     third is a hybrid. The software technique ranged from
                     slightly faster to two times slower than the ECC
                     approach. Blizzard's performance is roughly comparable
                     to a hardware shared-memory machine. These results
                     argue that clusters of workstations or personal
                     computers with networks comparable to the CM-5's will
                     be able to support the same shared-memory interfaces as
                     supercomputers.},
  acknowledgement = ack-nhfb,
  classification  = {C5220P (Parallel architecture); C5320G
                     (Semiconductor storage); C5440 (Multiprocessing
                     systems); C6120 (File organisation)},
  conflocation    = {San Jose, CA, USA; 4-7 Oct. 1994},
  conftitle       = {Sixth International Conference on Architectural
                     Support for Programming Languages and Operating Systems
                     (ASPLOS-VI)},
  corpsource      = {Dept. of Comput. Sci., Wisconsin Univ., Madison, WI,
                     USA},
  fjournal        = {ACM SIG{\-}PLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  keywords        = {access conflicts; Blizzard; block-sized memory
                     regions; cache block valid bits; cache storage;
                     cache-block-sized memory regions; cache-coherent shared
                     memory; CM-5; design; distributed memory systems;
                     distributed shared memory; ECC approach; error
                     correcting code; fine-grain access control; low-cost
                     implementations; measurement; parallel machines;
                     parallel systems; performance; portability; security;
                     shared memory codes; shared memory interfaces; shared
                     memory reference; shared memory systems; software
                     lookup; storage management; supercomputers; theory},
  sponsororg      = {ACM; IEEE Comput. Soc},
  subject         = {{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                     Styles, Shared memory. {\bf D.4.2} Software, OPERATING
                     SYSTEMS, Storage Management, Distributed memories. {\bf
                     D.4.6} Software, OPERATING SYSTEMS, Security and
                     Protection, Access controls.},
  treatment       = {P Practical},
}

@InProceedings{Seyfarth:1994:GEE,
  author          = {B. R. Seyfarth and J. L. Bickham and M. R. Fernandez},
  title           = {Glenda: an environment for easy parallel programming},
  crossref        = {Pierce:1994:PSH},
  pages           = {637--641},
  year            = {1994},
  bibdate         = {Sun Dec 22 10:18:08 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Comput. Sci., Southern Mississippi Univ.,
                     Hattiesburg, MS, USA},
  classification  = {C6110P (Parallel programming); C6115 (Programming
                     support); C7320 (Physics and Chemistry); C7430
                     (Computer engineering)},
  keywords        = {Benchmark; C programming language; Communication
                     functions; Coordination language; Glenda; Global tuple
                     space; Parallel programming environment; Parallel
                     Virtual Machine; Preprocessor; PVM message passing
                     functions; Software package; Tuple server process;
                     Underwater acoustic modeling},
  thesaurus       = {Acoustic analysis; File servers; Message passing;
                     Parallel programming; Physics computing; Programming
                     environments; Underwater sound; Virtual machines},
}

@InProceedings{Shee:1994:DMA,
  author =       "Jang Chung Shee and Chao Chin Wu and Lin Wen You and
                 Cheng Chen",
  title =        "Design of a multithread architecture and its parallel
                 simulation and evaluation environment",
  crossref =     "Anonymous:1994:ICS",
  volume =       "1",
  pages =        "69--76",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. of Comput. Sci. and Inf. Eng., Nat. Chiao Tung
                 Univ., Hsinchu, Taiwan",
  classification = "C5220P (Parallel architecture); C6115 (Programming
                 support); C6185 (Simulation techniques)",
  keywords =     "Context switch; Integrated multiprocessing simulation
                 environment; Multithread architecture; Parallel
                 simulation; Parallel simulation and evaluation
                 environment; Parallel Virtual Machine; SUN SPARC
                 workstations; Thread-related instructions",
  thesaurus =    "Digital simulation; Parallel architectures;
                 Programming environments",
}

@InProceedings{Shelton:1994:FPS,
  author          = {W. A. Shelton and G. M. Stocks and F. J. Pinski and R.
                     G. Jordan and Y. Liu and L. Qui and J. B. Staunton and
                     D. D. Johnson and B. Ginatempo},
  title           = {First principles simulation of materials properties},
  crossref        = {Pierce:1994:PSH},
  pages           = {103--110},
  year            = {1994},
  bibdate         = {Sun Dec 22 10:18:08 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Oak Ridge Nat. Lab., TN, USA},
  classification  = {A3100 (Theory of atoms and molecules); C5440
                     (Multiprocessor systems and techniques); C6110P
                     (Parallel programming); C6185 (Simulation techniques);
                     C7320 (Physics and Chemistry)},
  keywords        = {Ag-Mg alloy system; Electronic origin; Electronic
                     structure; First principles simulation; High
                     performance workstations; Local computer environment;
                     Materials properties; Order-disorder temperature;
                     Ordered materials; Parallel computer code; Physics
                     computing; PVM3 3; Short-range order intensity;
                     Substitutionally disordered materials; Vector
                     supercomputers},
  thesaurus       = {Digital simulation; Fermi surface; Materials
                     properties; Molecular electronic states; Parallel
                     machines; Parallel programming; Physics; Physics
                     computing},
}

@InProceedings{Shing:1994:UPC,
  author          = {C.-C. Shing},
  title           = {Use {PVM} on computation of analysis of repeated
                     measurement designs},
  crossref        = {Sall:1994:CIS},
  pages           = {139--142},
  year            = {1994},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C1140Z (Other topics in statistics); C5440
                     (Multiprocessing systems); C6110P (Parallel
                     programming); C7310 (Mathematics computing)},
  corpsource      = {Dept. of Comput. Sci., Radford Univ., VA, USA},
  keywords        = {computation; concurrent; designed experiments;
                     distributed memory; distributed memory systems;
                     heterogeneous network; parallel computer; parallel
                     programming; parallelized sweep operator; PVM;
                     regression; repeated measurement designs; software
                     package; statistical analysis; sweep operation},
  sponsororg      = {Interface Found. North America},
  treatment       = {P Practical},
}

@Article{Skjellum:1994:DEZ,
  author          = {A. Skjellum and S. G. Smith and N. E. Doss and A. P.
                     Leung and M. Morari},
  title           = {The design and evolution of {Zipcode}},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {20},
  number          = {4},
  pages           = {565--596},
  day             = {31},
  month           = mar,
  year            = {1994},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Sun Dec 22 10:18:08 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Comput. Sci., Mississippi State Univ., MS,
                     USA},
  classification  = {C5440 (Multiprocessor systems and techniques);
                     C6110P (Parallel programming); C6120 (File
                     organisation); C6150N (Distributed systems)},
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
  keywords        = {Collective operations; Communication contexts;
                     Gather-send; Homogeneous computer networks; Large-scale
                     multicomputer software; Mailer data structure; Message
                     passing; MPI standard; Multicomputers; Point-to-point
                     communication; Process-management system;
                     Receive-scatter semantics; Runtime optimizations;
                     Static process groups; Virtual topologies; Zipcode},
  pubcountry      = {Netherlands},
  thesaurus       = {Data structures; Message passing; Multiprocessing
                     systems; Parallel programming},
}

@InProceedings{Skjellum:1994:WLM,
  author          = {A. Skjellum and N. E. Doss and P. V. Bangalore},
  title           = {Writing libraries in {MPI}},
  crossref        = {IEEE:1994:PSP},
  pages           = {166--173},
  year            = {1994},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Comput. Sci., Mississippi State Univ., MS,
                     USA},
  classification  = {C4140 (Linear algebra); C6110P (Parallel
                     programming); C6150N (Distributed systems software);
                     C6150N (Distributed systems)},
  conftitle       = {Proceedings of Scalable Parallel Libraries
                     Conference},
  corpsource      = {Dept. of Comput. Sci., Mississippi State Univ., MS,
                     USA},
  keywords        = {cluster; Cluster; code fragments; Code fragments;
                     linear algebra; linear algebra library; Linear algebra
                     library; message passing; Message passing;
                     message-passing systems; Message-passing systems; MPI;
                     multicomputer; Multicomputer; parallel libraries;
                     Parallel libraries; parallel programming; standard;
                     Standard; subroutines; virtual topology; Virtual
                     topology},
  sponsororg      = {Mississippi State Univ.; Nat. Sci. Found},
  thesaurus       = {Linear algebra; Message passing; Parallel programming;
                     Subroutines},
  treatment       = {P Practical},
}

@InProceedings{Sloot:1994:CIO,
  author          = {P. M. A. Sloot and A. G. Hoekstra and L. O.
                     Hertzberger},
  title           = {A comparison of the {Iserver-Occam}, {Parix},
                     {Express}, and {PVM} programming environments on a
                     {Parsytec GCel}},
  crossref        = {Gentzsch:1994:HPC},
  volume          = {2},
  pages           = {253--259},
  year            = {1994},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C4240C (Computational complexity); C6110P (Parallel
                     programming); C6115 (Programming support)},
  corpsource      = {Dept. of Comput. Syst., Amsterdam Univ., Netherlands},
  keywords        = {communication capabilities; computational complexity;
                     development time; environments; Express; floating point
                     performance; global communication times; Iserver-Occam;
                     parallel programming; Parix; Parsytec GCel; point to
                     point communication; portability; programmability;
                     programming; PVM; software performance evaluation; time
                     complexity analysis; times},
  pubcountry      = {Germany},
  treatment       = {P Practical},
}

@InProceedings{Sloot:1994:CIP,
  author =       "P. M. A. Sloot and A. G. Hoekstra and L. O.
                 Hertzberger",
  title =        "A Comparison of the {Iserver-Occam}, {Parix},
                 {Express}, and {PVM} Programming Environments on a
                 {Parsytec GCel}",
  crossref =     "Gentzsch:1994:HPC",
  volume =       "2",
  pages =        "253--259",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Syst., Amsterdam Univ., Netherlands",
  classification = "C4240C (Computational complexity); C6110P (Parallel
                 programming); C6115 (Programming support)",
  keywords =     "Communication capabilities; Development time; Express;
                 Floating point performance; Global communication times;
                 Iserver-Occam; Parallel programming environments;
                 Parix; Parsytec GCel; Point to point communication
                 times; Portability; Programmability; PVM; Time
                 complexity analysis",
  thesaurus =    "Computational complexity; Parallel programming;
                 Programming environments; Software performance
                 evaluation",
}

@InProceedings{Stephens:1994:PBT,
  author          = {R. Stephens},
  title           = {Parallel benchmarks on the {Transtech Paramid}
                     supercomputer},
  crossref        = {deGloria:1994:TAS},
  pages           = {136--146},
  year            = {1994},
  bibdate         = {Sun Dec 22 10:20:45 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C5440 (Multiprocessing systems); C5470 (Performance
                     evaluation and testing)},
  keywords        = {Application fields; Distributed memory parallel
                     supercomputer; GENESIS; Intel i860-XP processors; NAS
                     suites; Parallel benchmarks; PARMACS codes; Portable
                     parallel codes; Portable PVM; Transtech Paramid
                     supercomputer; Workstation clusters},
  thesaurus       = {Distributed memory systems; Parallel processing;
                     Performance evaluation},
}

@Article{Still:1994:PPC,
  author          = {C. H. Still},
  title           = {Portable parallel computing via the {MPI1}
                     message-passing standard},
  journal         = j-COMPUT-PHYS,
  volume          = {8},
  number          = {5},
  pages           = {533--536, 538--539},
  month           = sep # {--} # oct,
  year            = {1994},
  CODEN           = {CPHYE2},
  ISSN            = {0894-1866 (print), 1558-4208 (electronic)},
  ISSN-L          = {0894-1866},
  bibdate         = {Sun Dec 22 10:18:08 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Lasers and Energy Comput. Div., Lawrence Livermore
                     Nat. Lab., CA, USA},
  classification  = {C6110P (Parallel programming); C6140D (High level
                     languages); C6150N (Distributed systems software)},
  fjournal        = {Computers in Physics},
  keywords        = {C languages binding; C++ bindings; Communicator;
                     Fortran binding; Functionality; Hardware;
                     Message-passing routine library; MPI1 message-passing
                     standard; Portable parallel computing; Receive routine;
                     Send routine; Vendor-independent message-passing
                     library},
  thesaurus       = {C language; FORTRAN; Message passing; Object-oriented
                     languages; Parallel programming},
}

@article{Stone:1994:PSO,
  author =       {L. C. Stone and S. B. Shukla and B. Neta},
  title =        {Parallel satellite orbit prediction using a
                 workstation cluster},
  journal =      j-COMPUT-MATH-APPL,
  volume =       {28},
  number =       {8},
  pages =        {1--8},
  month =        oct,
  year =         {1994},
  coden =        {CMAPDK},
  issn =         {0898-1221 (print), 1873-7668 (electronic)},
  issn-l =       {0898-1221},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Naval Postgraduate Sch., Monterey, CA, USA},
  classification = {A9385 (Instrumentation and techniques for
                 geophysical, hydrospheric and lower atmosphere
                 research); A9575P (Mathematical and computer
                 techniques); C5440 (Multiprocessor systems and
                 techniques); C7350 (Astronomy and astrophysics)},
  fjournal =     {Computers and Mathematics with Applications},
  keywords =     {Function decomposition techniques; Parallel computing;
                 Parallel satellite orbit prediction; Parallel Virtual
                 Machine; Performance metric; SUN workstations;
                 Workstation cluster},
  pubcountry =   {UK},
  thesaurus =    {Artificial satellites; Astronomy computing; Parallel
                 processing; Workstations},
}

@article{Strok:1994:NJI,
  author =       {Dale C. Strok},
  title =        {In the News: {Jupiter} impacts: Resolution makes a big
                 difference. Supercomputer farming down under. {HPF
                 Forum} welcomes comments. {Smithsonian Awards} honor
                 computational scientists. Low-life computer viruses.
                 {PVM} developers get {R\&D-100} award. The eyes have
                 it. Neural nets detect breast cancer. Better cars
                 through cooperation. Parallel version of global climate
                 model. {Lockheed} to run {Idaho National Engineering
                 Lab}. Public-private partners: new drugs, new
                 software},
  journal =      j-IEEE-COMPUT-SCI-ENG,
  volume =       {1},
  number =       {3},
  pages =        {88--90},
  month =        {Fall},
  year =         {1994},
  coden =        {ISCEE4},
  issn =         {1070-9924 (print), 1558-190X (electronic)},
  issn-l =       {1070-9924},
  bibdate =      {Sat May 25 13:29:25 MDT 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computational Science \& Engineering},
  journal-url =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99},
}

@inproceedings{Sunderam:1994:GPP,
  author =       {V. Sunderam},
  title =        {General Purpose Parallel Computing with {PVM}},
  crossref =     {Anonymous:1994:PPC},
  pages =        {185--198},
  year =         {1994},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Sunderam:1994:MSH,
  author =       {V. S. Sunderam},
  title =        {Methodologies and systems for heterogeneous concurrent
                 computing},
  crossref =     {Joubert:1994:PCT},
  pages =        {29--45},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:19:23 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
                 GA, USA},
  classification = {C4240P (Parallel programming and algorithm theory);
                 C6150N (Distributed systems software)},
  keywords =     {Heterogeneous concurrent computing; Parallel
                 algorithm; Parallel processing; Partitioning;
                 Performance aspects; PVM system; Scheduling},
  thesaurus =    {Parallel algorithms; Scheduling},
}

@article{Sunderam:1994:PCC,
  author =       {V. S. Sunderam and G. A. Geist and J. Dongarra and R.
                 Manchek},
  title =        {The {PVM} concurrent computing system: Evolution,
                 experiences, and trends},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {20},
  number =       {4},
  pages =        {531--545},
  day =          {31},
  month =        mar,
  year =         {1994},
  coden =        {PACOEJ},
  issn =         {0167-8191 (print), 1872-7336 (electronic)},
  issn-l =       {0167-8191},
  bibdate =      {Fri Aug 6 10:14:00 MDT 1999},
  bibsource =    {http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1994&volume=20&issue=4;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1994&volume=20&issue=4&aid=861},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
                 GA, USA},
  classification = {B6210L (Computer communications); C5620 (Computer
                 networks and techniques); C6110P (Parallel
                 programming); C6150N (Distributed systems)},
  corpsource =   {Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
                 GA, USA},
  fjournal =     {Parallel Computing},
  journal-url =  {http://www.sciencedirect.com/science/journal/01678191},
  keywords =     {auxiliary facilities; Auxiliary facilities; case
                 studies; Case studies; climate modeling; Climate
                 modeling; communication overheads; Communication
                 overheads; computer networks; computing model;
                 Computing model; environmental science; Environmental
                 science; experimental enhancements; Experimental
                 enhancements; heterogeneous; heterogeneous concurrent
                 computing; Heterogeneous concurrent computing;
                 Heterogeneous networked computing platforms; interface;
                 large scale scientific supercomputing; Large scale
                 scientific supercomputing; materials science; Materials
                 science; message passing; message passing model;
                 Message passing model; MPP; MPP support; network
                 computing; Network computing; networked computing
                 platforms; networked environments; Networked
                 environments; parallel processing; Parallel processing;
                 parallel programming; process groups; Process groups;
                 programming; Programming interface; PVM concurrent
                 computing system; software framework; Software
                 framework; support},
  pubcountry =   {Netherlands},
  thesaurus =    {Computer networks; Message passing; Parallel
                 programming},
  treatment =    {P Practical},
}

@inproceedings{Sydow:1994:PSA,
  author =       {A. Sydow},
  title =        {Parallel simulation of air pollution},
  crossref =     {Pehrson:1994:IPP},
  journal =      j-IFIP-TRANS-A,
  volume =       {A-52},
  pages =        {605--612},
  year =         {1994},
  coden =        {ITATEC},
  issn =         {0926-5473},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {GMD-Res. Inst. for Comput. Archit. and Software
                 Technol., Berlin, Germany},
  classification = {A8670G (Atmosphere); A9260T (Air quality and air
                 pollution); A9365 (Data acquisition, processing and
                 storage); C6110P (Parallel programming); C7340
                 (Geophysics)},
  fjournal =     {IFIP Transactions. A. Computer Science and
                 Technology},
  keywords =     {Air pollutant transport models; Air pollution
                 modelling; Berlin-Brandenburg area, Germany; CM-5;
                 Equations; Eulerian models; FORGE; Lagrangian models;
                 MANNA; Meteorological models; Model domain
                 decomposition method; Model parallelization; Numerical
                 algorithms; Parallel hardware; Parallel simulation;
                 PARMACS; PVM; Runtime measurements; Software tools;
                 Transputer system; Workstation cluster},
  thesaurus =    {Air pollution; Digital simulation; Environmental
                 science computing; Geophysics computing; Numerical
                 analysis; Parallel processing; Software tools},
}

@InProceedings{Thomas:1994:PSA,
  author =       "S. J. Thomas and J. C{\^o}t{\'e}",
  title =        "Parallel {Semi-Lagrangian} Advection using {PVM}",
  crossref =     "Dekker:1994:MPP",
  pages =        "801--808",
  year =         "1994",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Div. de Recherche en Pr{\'e}vision Num{\'e}rique,
                 Environnement Canada, Dorval, Que., Canada",
  classification = "A0260 (Numerical approximation and analysis); A0340G
                 (Fluid dynamics: general mathematical aspects); A4710
                 (General fluid dynamics theory, simulation and other
                 computational methods); C4160 (Numerical integration
                 and differentiation); C4240P (Parallel programming and
                 algorithm theory); C5440 (Multiprocessing systems);
                 C6150N (Distributed systems software); C7320 (Physics
                 and chemistry computing)",
  keywords =     "Computational fluid dynamics; Courant Friedrichs Lewy
                 condition; Distributed MIMD implementation; Eulerian
                 methods; Intel iPSC/860; Parallel algorithms; Parallel
                 performance; Parallel semi-Lagrangian advection;
                 Passive advection problem; Processor; PVM; Scalable
                 code; Sub-grid dimensions; Time steps",
  thesaurus =    "Distributed memory systems; Fluid dynamics;
                 Integration; Parallel algorithms; Physics computing;
                 Software performance evaluation; Transport processes",
}

@inproceedings{Thomsen:1994:RTS,
  author =       {P. G. Thomsen},
  title =        {Real time simulation in a cluster computing
                 environment},
  crossref =     {Dongarra:1994:PSC},
  pages =        {493--497},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:20:45 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Inst. for Math. Modelling, Tech. Univ. Denmark,
                 Lyngby, Denmark},
  classification = {C5620L (Local area networks); C6150J (Operating
                 systems); C6185 (Simulation techniques); C7460
                 (Aerospace engineering computing); C7810C
                 (Computer-aided instruction)},
  keywords =     {Airplane flying; Cluster computing environment;
                 Differential algebraic equations; FDDI-ring;
                 Mathematical problem; Personnel training; PVM; Real
                 time simulation; Real time update; Ship manoeuvering;
                 Simulator design; Systems variables; Workstation
                 cluster},
  thesaurus =    {Aerospace simulation; Aircraft; Computer based
                 training; Digital simulation; FDDI; Local area
                 networks; Operating systems [computers]; Personnel;
                 Real-time systems; Ships; Workstations},
}

@inproceedings{Trefftz:1994:DPE,
  author =       {C. Trefftz and C. C. Huang and P. K. McKinley and T.
                 Y. Li and Z. Zeng},
  title =        {Design and performance evaluation of a distributed
                 eigenvalue solver on a workstation cluster},
  crossref =     {IEEE:1994:IPN},
  pages =        {608--615},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Comput. Sci., Michigan State Univ., East
                 Lansing, MI, USA},
  classification = {C4140 (Linear algebra); C4240P (Parallel programming
                 and algorithm theory); C5470 (Performance evaluation
                 and testing); C6110P (Parallel programming)},
  keywords =     {Bisection algorithm; Distributed eigenvalue solver;
                 High-performance workstations; Interprocess
                 communication packages; Laguerre iteration; P4;
                 Parallel algorithm; Parallel scientific computing;
                 Parallel split-merge; Performance evaluation;
                 Performance study; PVM; Rank two splitting; Separation
                 property; Split-merge technique; Standard matrix types;
                 Symmetric tridiagonal matrices; Workstation cluster},
  thesaurus =    {Eigenvalues and eigenfunctions; Parallel algorithms;
                 Parallel programming; Performance evaluation;
                 Workstations},
}

@inproceedings{Trelles-Salazar:1994:MSS,
  author =       {O. Trelles-Salazar and E. L. Zapata and J.-M. Carazo},
  title =        {Mapping strategies for sequential sequence comparison
                 algorithms on {LAN-based} message passing
                 architectures},
  crossref =     {Gentzsch:1994:HPC},
  pages =        {197--202},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:20:45 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Comput. Archit., Malaga Univ., Spain},
  classification = {C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C5620L (Local area
                 networks); C5630 (Networking equipment); C6110B
                 (Software engineering techniques); C6110P (Parallel
                 programming); C6150N (Distributed systems software);
                 C6160Z (Other DBMS); C7330 (Biology and medical
                 computing)},
  keywords =     {Communication latency; Dynamic load balancing; Fault
                 tolerant capabilities; File server; Guided self
                 scheduling; LAN-based message passing architectures;
                 Mapping strategies; Overall data-passing load;
                 Public-domain PVM 3.1 system; Sequential sequence
                 comparison algorithms; Simple workstation clusters;
                 Socket to socket communications; Software-integration
                 tool},
  thesaurus =    {Biology computing; Computer architecture; File
                 servers; Local area networks; Message passing; Parallel
                 programming; Resource allocation; Scheduling;
                 Sequences; Software fault tolerance; Software
                 portability; Very large databases; Workstations},
}

@inproceedings{Uhl:1994:PCC,
  author =       {A. Uhl},
  title =        {Parallel Compact Coding of Satellite Images with
                 Wavelet Packets using {PVM}},
  crossref =     {Kumar:1994:PPI},
  pages =        {382--387},
  year =         {1994},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Varadarajan:1994:FDT,
  author =       "V. Varadarajan and R. Mittra",
  title =        "Finite-difference time-domain ({FDTD}) analysis using
                 distributed computing",
  journal =      j-IEEE-MICROW-GUIDED-WAVE-LETT,
  volume =       "4",
  number =       "5",
  pages =        "144--145",
  month =        may,
  year =         "1994",
  CODEN =        "IMGLE3",
  DOI =          "https://doi.org/10.1109/75.289515",
  ISSN =         "1051-8207 (print), 1558-2329 (electronic)",
  ISSN-L =       "1051-8207",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Electromagnetic Commun. Lab., Illinois Univ.,
                 Champaign, IL, USA",
  classification = "B0290Z (Other numerical methods); B5100 (Electric
                 and magnetic fields); B5200 (Electromagnetic waves,
                 antennas and propagation); C4190 (Other numerical
                 methods); C7310 (Mathematics); C7410D (Electronic
                 engineering)",
  fjournal =     "IEEE Microwave and Guided Wave Letters",
  keywords =     "Electromagnetics; FDTD calculations; Finite-difference
                 time-domain analysis; Linear speedup; Parallel
                 distributed computing; Parallel Virtual Machine; PVM
                 3.2; Three-dimensional rectangular cavity",
  thesaurus =    "Cavity resonators; Distributed processing;
                 Electromagnetic field theory; Electronic engineering
                 computing; Finite difference time-domain analysis;
                 Mathematics computing; Parallel algorithms",
}

@inproceedings{Vaughan:1994:MPM,
  author =       {P. L. Vaughan and A. Skjellum and D. S. Reese and
                 Fei-Chen Cheng},
  title =        {Migrating from {PVM} to {MPI}. {I}. The {Unify}
                 system},
  crossref =     {IEEE:1994:FSF},
  pages =        {488--495},
  year =         {1994},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {NSF Eng. Res. Center for Computational Field
                 Simulation, Mississippi State Univ., MS, USA},
  classification = {C6110P (Parallel programming); C6150N (Distributed
                 systems software)},
  corpsource =   {NSF Eng. Res. Center for Computational Field
                 Simulation, Mississippi State Univ., MS, USA},
  keywords =     {evolution path; Evolution path; message passing;
                 message Passing Interface; Message Passing Interface;
                 Message passing system; MPI; parallel libraries;
                 Parallel libraries; parallel programming; portability
                 system; Portability system; PVM; software portability;
                 standard notation; Standard notation; system; Unify
                 system},
  sponsororg =   {IEEE Comput. Soc. Tech. Committee on Comput. Archit.;
                 NASA; Univ. Maryland Inst. Adv. Comput. Studies; George
                 Mason Univ},
  thesaurus =    {Message passing; Parallel programming; Software
                 portability},
  treatment =    {P Practical},
}

@inproceedings{vonHanxleden:1994:VDF,
  author =       {R. von Hanxleden and K. Kennedy and J. Saltz},
  title =        {Value-based distributions in {Fortran D}},
  crossref =     {Gentzsch:1994:HPC},
  pages =        {434--440},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:20:45 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Comput. Sci., Rice Univ., Houston, TX, USA},
  classification = {C6110P (Parallel programming); C6120 (File
                 organisation); C6140D (High level languages); C6150C
                 (Compilers, interpreters and other processors)},
  keywords =     {Access locality; Access patterns; Data-parallel
                 language; Fortran D; Index-based distributions;
                 Inter-processor locality; Intra-processor locality;
                 Irregular applications; Scalability; Sequential data
                 structures; Value-based distributions},
  thesaurus =    {Data structures; FORTRAN; Parallel languages;
                 Parallelising compilers},
}

@Article{Walker:1994:DSM,
  author =       "David W. Walker",
  title =        "The design of a standard message passing interface for
                 distributed memory concurrent computers",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "20",
  number =       "4",
  pages =        "657--673",
  day =          "31",
  month =        mar,
  year =         "1994",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1994&volume=20&issue=4;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "See erratum \cite{Walker:1994:EDS}.",
  URL =          "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1994&volume=20&issue=4&aid=865;
                 http://www.epm.ornl.gov/~walker/mpi/papers/parcomp94.ps.Z",
  acknowledgement = ack-nhfb,
  affiliation =  "Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA",
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessor systems and techniques); C5610N
                 (Network interfaces)",
  corpsource =   "Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  keywords =     "application topologies; Application topologies;
                 collective communication; communication contexts;
                 Communication contexts; communication routines;
                 Communication routines; distributed memory concurrent
                 computers; Distributed memory concurrent computers;
                 distributed memory systems; message passing; MIMD; MPI;
                 network interfaces; point-to-point communication;
                 process groups; Process groups; standard message
                 passing interface; Standard message passing interface;
                 standards",
  pubcountry =   "Netherlands",
  thesaurus =    "Distributed memory systems; Message passing; Network
                 interfaces; Standards",
  treatment =    "P Practical",
}

@article{Walker:1994:EDS,
  author =       {David W. Walker},
  title =        {Erratum to: {``The design of a standard message
                 passing interface for distributed memory concurrent
                 computers''}},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {20},
  number =       {8},
  pages =        {1215--1215},
  month =        aug,
  year =         {1994},
  coden =        {PACOEJ},
  issn =         {0167-8191 (print), 1872-7336 (electronic)},
  issn-l =       {0167-8191},
  bibdate =      {Sat Apr 06 15:06:32 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {See \cite{Walker:1994:DSM}.},
  fjournal =     {Parallel Computing},
  journal-url =  {http://www.sciencedirect.com/science/journal/01678191},
  keywords =     {application topologies; collective communication;
                 communication contexts; distributed memory concurrent
                 computers; message passing; point-to-point
                 communication; process groups; standards},
}

@inproceedings{Wark:1994:PIR,
  author =       {P. Wark and J. Holt},
  title =        {{PVM} Implementation of a Repeated Matching Heuristic
                 For Vehicle Routing},
  crossref =     {Arnold:1994:PCT},
  pages =        {207--216 (or 207--214??)},
  year =         {1994},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Dept. of Math. and Comput., Univ. of Southern
                 Queensland, Toowoomba, Qld., Australia},
  classification = {C4240P (Parallel programming and algorithm theory);
                 C6120 (File organisation); C7430 (Computer
                 engineering)},
  corpsource =   {Dept. of Math. and Comput., Univ. of Southern
                 Queensland, Toowoomba, Qld., Australia},
  keywords =     {Benchmark problems; benchmark problems; computational
                 complexity; MIMD parallel computer; NP-hard problem;
                 parallel algorithms; Parallel Virtual Machine; PVM
                 implementation; Repeated matching heuristic; repeated
                 matching heuristic; routing; Software package PVM;
                 software package PVM; structure; structures; SUN
                 workstations; tree; tree data; Tree structure; vehicle;
                 Vehicle routing; virtual machines},
  pubcountry =   {Netherlands},
  thesaurus =    {Computational complexity; Parallel algorithms; Tree
                 data structures; Virtual machines},
  treatment =    {A Application; P Practical},
}

@article{Welch:1994:PVM,
  author =       {L. R. Welch},
  title =        {A Parallel Virtual Machine for Programs Composed of
                 Abstract Data Types},
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       {43},
  number =       {11},
  pages =        {1249--1261},
  month =        nov,
  year =         {1994},
  coden =        {ITCOB4},
  doi =          {https://doi.org/10.1109/12.324558},
  issn =         {0018-9340 (print), 1557-9956 (electronic)},
  issn-l =       {0018-9340},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110J (Object-oriented programming); C6110P
                 (Parallel programming); C6120 (File organisation);
                 C6150N (Distributed systems)},
  corpsource =   {Dept. of Comput. and Inf. Sci., New Jersey Inst. of
                 Technol., Newark, NJ, USA},
  fjournal =     {IEEE Transactions on Computers},
  journal-url =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
  keywords =     {abstract data types; ADTs; ARC; Asynchronous Remote
                 Procedure Call; automatic parameter restoration; data
                 structures; data synchronization; database management;
                 dynamic load balancing; languages; machines;
                 modularity; multiprocessing programs; parallel
                 programming; parallel virtual machine; programming;
                 remote procedure calls; reuse; software reusability;
                 system development; systems; virtual},
  treatment =    {P Practical},
}

@inproceedings{White:1994:VVC,
  author =       {R. White},
  title =        {{VCMON} --- the {VM\slash ESA Connectivity Monitor}},
  crossref =     {Anonymous:1994:PSE},
  pages =        {783--792},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:20:45 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Velocity Software Inc., Boston, MA, USA},
  classification = {C6150G (Diagnostic, testing, debugging and
                 evaluating systems); C6150J (Operating systems); C6150N
                 (Distributed systems software)},
  keywords =     {ADSM; AVS; Computer architecture; Connectivity
                 product; LFS; PVM; RSCS; TCP/IP; VCMON; Virtual
                 machine; VM; VM/ESA Connectivity Monitor; VTAM},
  thesaurus =    {Client-server systems; Open systems; Operating systems
                 [computers]; System monitoring; Virtual machines},
}

@phdthesis{Wilhelms:1994:DAL,
  author =       {Gerhard Wilhelms},
  title =        {{Dynamische adaptive Lastverteilung f{\"u}r PVM
                 mittels unscharfer Benutzerprofile -- $ \mbox {PVM}^+ $
                 (English: Dynamic adaptive load distribution for PVM by
                 blurred user profiles -- $ \mbox {PVM}^+ $ ).}},
  type =         {Dissertation},
  school =       {Math.-Naturwiss. Fakult{\"a}t, Universit{\"a}t
                 Augsburg},
  address =      {Augsburg, Germany},
  pages =        {iv + 74},
  year =         {1994},
  bibdate =      {Sat Apr 06 15:01:28 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@inproceedings{Yan:1994:PTA,
  author =       {J. C. Yan},
  title =        {Performance tuning with {AIMS} --- an {Automated
                 Instrumentation and Monitoring System} for
                 multicomputers},
  crossref =     {Hesham:1994:PTS},
  pages =        {625--633},
  year =         {1994},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {RECOM Technol., NASA Ames Res. Center, Moffett Field,
                 CA, USA},
  classification = {C5470 (Performance evaluation and testing); C6110P
                 (Parallel programming); C6130B (Graphics techniques);
                 C6150G (Diagnostic, testing, debugging and evaluating
                 systems); C6150N (Distributed systems)},
  keywords =     {AIMS; Automated Instrumentation and Monitoring System;
                 C programs; Data collection overhead; Event recorders;
                 FORTRAN programs; Multicomputers; Multiprocessors;
                 Parallel program execution; Parallel programming
                 paradigm; Performance data collection; Performance
                 evaluation; Performance tuning; PVM; Resource
                 allocation algorithms; Run-time performance-monitoring
                 library; Scalable multiprocessor; Software toolkit;
                 Source-code instrumentor; Trace post-processor;
                 Trace-file analysis; Trace-file animation},
  thesaurus =    {Computer animation; Computerised instrumentation;
                 Computerised monitoring; Data acquisition;
                 Multiprocessing systems; Parallel programming;
                 Performance evaluation; Resource allocation; System
                 monitoring; Tuning},
}

@Article{Yi:1994:PID,
  author =       "Sung Yi and K. H. Pierson and M. F. Ahmad",
  title =        "Parallel implementation of dynamic simulation to
                 filamentary composite structures with general rate
                 dependent damping",
  journal =      j-COMPUT-SYST-ENG,
  volume =       "5",
  number =       "4--6",
  pages =        "469--477",
  month =        aug # "--" # dec,
  year =         "1994",
  CODEN =        "COSEEO",
  ISSN =         "0956-0521",
  ISSN-L =       "0956-0521",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Nat. Center for Supercomput. Applications, Illinois
                 Univ., Urbana, IL, USA",
  classification = "C4130 (Interpolation and function approximation);
                 C4185 (Finite element analysis); C6110P (Parallel
                 programming); C7440 (Civil and mechanical engineering
                 computing)",
  fjournal =     "Computing systems in engineering: an international
                 journal",
  keywords =     "CM-5; Conjugate gradient algorithm; Dynamic
                 simulation; Dynamic viscoelastic finite element
                 algorithm; Filamentary composite structures; Generic
                 message passing library; PVM master/slave visco-elastic
                 finite element program; Rate dependent damping;
                 Scalable distributed parallel environment",
  thesaurus =    "Conjugate gradient methods; Damping; Digital
                 simulation; Finite element analysis; Message passing;
                 Parallel programming; Structural engineering computing;
                 Viscoelasticity",
}

@inproceedings{Zdetsis:1994:PMD,
  author =       {A. D. Zdetsis and R. Biswas},
  title =        {A Parallel Molecular Dynamics Strategy For {PVM}},
  crossref =     {Turchi:1994:SDA},
  pages =        {713--718},
  year =         {1994},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Zemla:1994:WTC,
  author =       "A. Zemla",
  title =        "Wavelet transforms computing on {PVM}",
  crossref =     "Dongarra:1994:PSC",
  pages =        "534--546",
  year =         "1994",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Parallel computing methods are developed especially in
                 centers that have expensive multiprocessor computers.
                 The advantage of PVM (Parallel Virtual Machine) is that
                 it permits a network of heterogeneous Unix computers to
                 be used as a single large parallel computer. Thus large
                 computational problems can be solved by using the
                 aggregate power of many computers. We present some PVM
                 computational experiments of wavelet transforms in
                 image processing. Some PVM experiments were carried out
                 on an IBM PC 486 working under the LINUX system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. of Math., Polish Acad. of Sci., Warsaw, Poland",
  classification = "B0230 (Integral transforms); B0290Z (Other numerical
                 methods); B6140C (Optical information, image and video
                 signal processing); C1130 (Integral transforms); C1250
                 (Pattern recognition); C4190 (Other numerical methods);
                 C5260B (Computer vision and image processing
                 techniques); C6110P (Parallel programming); C6150J
                 (Operating systems); C6150N (Distributed systems
                 software)",
  corpsource =   "Inst. of Math., Polish Acad. of Sci., Warsaw, Poland",
  keywords =     "aggregate power; Aggregate power; IBM PC; image
                 processing; Image processing; large parallel computer;
                 LINUX system; machines; methods; microcomputer
                 applications; multiprocessor computers; Multiprocessor
                 computers; network; network of heterogeneous Unix
                 computers; Network of heterogeneous Unix computers;
                 operating systems; Parallel computing; Parallel
                 computing methods; parallel programming; Parallel
                 Virtual Machine; PVM; single; Single large parallel
                 computer; Unix; virtual; wavelet transforms; Wavelet
                 transforms computing",
  pubcountry =   "Germany",
  sponsororg =   "Danish Comput. Centre for Res. and Educ.; Inst. Math.
                 Modelling; Tech. Univ. Denmark",
  thesaurus =    "Image processing; Microcomputer applications; Network
                 operating systems; Parallel programming; Unix; Virtual
                 machines; Wavelet transforms",
  treatment =    "P Practical",
}

@InProceedings{Zielinski:1994:PPS,
  author          = {K. Zielinski and M. Gajecki and G. Czajkowski},
  title           = {Parallel programming systems for {LAN} distributed
                     computing},
  crossref        = {IEEE:1994:IPN},
  pages           = {600--607},
  year            = {1994},
  bibdate         = {Sun Dec 22 10:18:08 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Inst. of Comput. Sci., Univ. of Min. and Metall.,
                     Cracow, Poland},
  classification  = {B6210L (Computer communications); C5620L (Local area
                     networks); C6110P (Parallel programming); C6140D (High
                     level languages)},
  keywords        = {ANSA; Communication tests; Distributed computing
                     environments; Distributed programming systems
                     construction; Experimental results; LAN distributed
                     computing; Linda; P4; Parallel programming systems;
                     Processor farm model efficiency; PVM; Run time
                     efficiency; SR; Strand},
  thesaurus       = {Local area networks; Parallel languages; Parallel
                     programming; Software packages},
}

@InProceedings{Zu:1994:OSM,
  author =       "Hong Xu and Ya-Dong Gui and L. M. Ni",
  title =        "Optimal software multicast in wormhole-routed
                 multistage networks",
  crossref =     "IEEE:1994:PSW",
  pages =        "703--712",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inf. Sci. Inst., Univ. of Southern California, Marina
                 del Rey, CA, USA",
  classification = "C4230M (Multiprocessor interconnection); C5220P
                 (Parallel architecture); C5440 (Multiprocessing
                 systems); C6110P (Parallel programming)",
  keywords =     "64-Node SP-1; Application level broadcast; Collective
                 communication; IBM SP-1; Interconnection architecture;
                 Meiko CS-2; Multistage interconnection networks;
                 Optimal multicast algorithm; Optimal software
                 multicast; Public domain MPI; Scalable parallel
                 computers; Switching technology; System level multicast
                 service; TMC CM-5; Wormhole routed multistage networks;
                 Wormhole-routed multistage networks",
  thesaurus =    "Multistage interconnection networks; Parallel
                 algorithms; Parallel machines",
  xxnote =       "The first author's surname is Xu. The citation key
                 retains the earlier misspelling (Zu) so that existing
                 citations continue to resolve.",
}

@InProceedings{Almeida:1995:CST,
  author =       "F. Almeida and F. Garcia and J. Roda and D. Morales
                 and C. Rodriguez",
  title =        "A comparative study of two distributed systems: {PVM}
                 and transputers",
  crossref =     "Cook:1995:TAS",
  pages =        "244--258",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C1160 (Combinatorial mathematics); C1180
                 (Optimisation techniques); C4240P (Parallel programming
                 and algorithm theory); C5220P (Parallel architecture);
                 C5440 (Multiprocessing systems); C6110P (Parallel
                 programming); C6115 (Programming support)",
  corpsource =   "Univ. de La Laguna, Tenerife, Spain",
  keywords =     "algorithms; branch and bound; distributed system;
                 divide and conquer methods; divide and conquer parallel
                 heapsort algorithm; dynamic programming; environment;
                 Inmos language; integer knapsack problem; LAN; load
                 balancing; operations research; parallel; parallel
                 algorithm; parallel development environment; parallel
                 virtual machine; programming environments; PVM;
                 quicksort algorithm; software; sorting; sorting
                 problem; system; systems; transputer; transputer links;
                 travelling salesman problem; travelling salesman
                 problems",
  pubcountry =   "Netherlands",
  sponsororg =   "Transputer Consortium; World occam and Transputer User
                 Group; et al",
  treatment =    "P Practical",
}

@InProceedings{Aloisio:1995:UPW,
  author          = {G. Aloisio and M. A. Bochicchio},
  title           = {The use of {PVM} with workstation clusters for
                     distributed {SAR} data processing},
  crossref        = {Hertzberger:1995:HPM},
  pages           = {570--581},
  year            = {1995},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Fac. di Ingegneria, Lecce Univ., Italy},
  classification  = {B5230 (Electromagnetic compatibility and
                     interference); B6140C (Optical information, image and
                     video signal processing); B6320 (Radar equipment,
                     systems and applications); C1250 (Pattern recognition);
                     C5260B (Computer vision and image processing
                     techniques)},
  corpsource      = {Fac. di Ingegneria, Lecce Univ., Italy},
  keywords        = {active sensor; Active sensor; backscattered echo
                     signals; Backscattered echo signals; cluster of
                     workstations; Cluster of workstations; digital
                     processing; Digital processing; distributed SAR data
                     processing; Distributed SAR data processing; echo; high
                     resolution ground; High resolution ground images; IBM
                     RISC; IBM RISC System 6000/350; image focusing
                     algorithm; Image focusing algorithm; image processing;
                     images; PVM; radar; remote sensing; Remote sensing;
                     synthetic aperture; System 6000/350; workstation
                     clusters; Workstation clusters},
  pubcountry      = {Germany},
  thesaurus       = {Echo; Image processing; Remote sensing; Synthetic
                     aperture radar},
  treatment       = {A Application; P Practical},
}

@InProceedings{Alves:1995:WPC,
  author          = {A. Alves and L. Silva and J. Carreira and J. G.
                     Silva},
  title           = {{WPVM}: parallel computing for the people},
  crossref        = {Hertzberger:1995:HPM},
  pages           = {582--587},
  year            = {1995},
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. de Engenharia Inf., Coimbra Univ., Portugal},
  classification  = {C5440 (Multiprocessing systems); C6150J (Operating
                     systems); C6180 (User interfaces); C7430 (Computer
                     engineering)},
  keywords        = {Microsoft Windows Operating System; MS Windows;
                     Parallel machine; PC LANs; PVM implementation; Windows
                     Parallel Virtual Machine},
  thesaurus       = {Operating systems [computers]; Parallel machines; User
                     interfaces; Virtual machines},
}

@InProceedings{Ancona:1995:PAD,
  author =       "M. Ancona and M. {De Benedetto}",
  title =        "A parallel algorithm for `document segmentation'",
  crossref =     "IEEE:1995:PEW",
  pages =        "516--521",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dipartimento di Inf. e Scienza dell'Inf., Genoa Univ.,
                 Italy",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5260B (Computer vision and image processing
                 techniques); C6130D (Document processing techniques)",
  keywords =     "Data parallel approach; Divide and conquer
                 implementation; Document segmentation; Parallel
                 algorithm; PVM3 system; Xy-tree; Xy-trees",
  thesaurus =    "Divide and conquer methods; Document image processing;
                 Image segmentation; Parallel algorithms; Tree data
                 structures",
}

@Article{Anonymous:1995:BRPb,
  author          = {Anonymous},
  title           = {Book Review: {{\booktitle{PVM: Parallel virtual
                     machine: a users' guide and tutorial for networked
                     parallel computing}}: By Al Geist, Adam Beguelin, Jack
                     Dongarra, Weicheng Jiang, Robert Manchek and Vaidy
                     Sunderam. MIT Press, Cambridge, MA. (1994). 279 pages.
                     \$19.95}},
  journal         = j-COMPUT-MATH-APPL,
  volume          = {30},
  number          = {9},
  pages           = {122--122},
  month           = nov,
  year            = {1995},
  CODEN           = {CMAPDK},
  ISSN            = {0898-1221 (print), 1873-7668 (electronic)},
  ISSN-L          = {0898-1221},
  bibdate         = {Wed Mar 1 21:48:22 MST 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/computmathappl1990.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/0898122195901973},
  acknowledgement = ack-nhfb,
  fjournal        = {Computers and Mathematics with Applications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/08981221},
}

@Article{Anonymous:1995:BRU,
  author          = {Anonymous},
  title           = {Book Review: {{\booktitle{Using MPI: Portable parallel
                     programming with the message-passing interface}}: By
                     William Gropp, Ewing Lusk and Anthony Skjellum. MIT
                     Press, Cambridge, MA. (1994). 307 pages. \$24.95}},
  journal         = j-COMPUT-MATH-APPL,
  volume          = {30},
  number          = {9},
  pages           = {122--122},
  month           = nov,
  year            = {1995},
  CODEN           = {CMAPDK},
  ISSN            = {0898-1221 (print), 1873-7668 (electronic)},
  ISSN-L          = {0898-1221},
  bibdate         = {Wed Mar 1 21:48:22 MST 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/computmathappl1990.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/089812219590199X},
  acknowledgement = ack-nhfb,
  fjournal        = {Computers and Mathematics with Applications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/08981221},
}

@InProceedings{Anonymous:1995:UPH,
  author          = {Anonymous},
  title           = {Using {PVM} to Host {CLIPS} in Distributed
                     Environments},
  crossref        = {Anonymous:1995:CCS},
  pages           = {203--211},
  year            = {1995},
  bibdate         = {Thu Feb 29 17:59:11 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Appiani:1995:PSI,
  author          = {E. Appiani and M. Bologna and M. Corvi and M.
                     Iardella},
  title           = {{PVM} in a shared-memory industrial multiprocessor},
  crossref        = {Hertzberger:1995:HPM},
  pages           = {588--593},
  year            = {1995},
  bibdate         = {Thu Feb 29 17:59:11 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Res. and Dev. Services, Elsag Bailey, Genova, Italy},
  classification  = {C5220P (Parallel architecture); C5440
                     (Multiprocessing systems); C7430 (Computer
                     engineering)},
  keywords        = {EMMA2E; ESPRIT project; Message-passing environment;
                     Performance; Portable parallel applications; PVM;
                     Shared-memory environment; Shared-memory industrial
                     multiprocessor},
  thesaurus       = {Parallel processing; Shared memory systems; Virtual
                     machines},
}

@InProceedings{Appiani:1995:PSM,
  author =       "E. Appiani and M. Bologna and M. Corvi and M.
                 Iardella",
  title =        "{PVM} in a shared-memory industrial multiprocessor",
  crossref =     "Hertzberger:1995:HPM",
  pages =        "588--593",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C5220P (Parallel
                 architecture); C7430 (Computer engineering)",
  corpsource =   "Res. and Dev. Services, Elsag Bailey, Genova, Italy",
  keywords =     "EMMA2E; ESPRIT project; message-passing environment;
                 parallel processing; performance; portable parallel
                 applications; PVM; shared memory systems;
                 shared-memory environment; shared-memory industrial
                 multiprocessor; virtual machines",
  pubcountry =   "Germany",
  treatment =    "A Application; P Practical",
  xxnote =       "Same paper as entry Appiani:1995:PSI (identical
                 authors, title, crossref, and pages); both keys are
                 retained so that existing citations continue to
                 resolve.",
}

@InProceedings{Arioli:1995:PSB,
  author          = {M. Arioli and A. Drummond and I. S. Duff and D. Ruiz},
  title           = {A parallel scheduler for block iterative solvers in
                     heterogeneous computing environments},
  crossref        = {Bailey:1995:PSS},
  pages           = {460--465},
  year            = {1995},
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Istituto di Analisi Numerica, CNR, Pavia, Italy},
  classification  = {B0290F (Interpolation and function approximation);
                     C4130 (Interpolation and function approximation);
                     C4240P (Parallel programming and algorithm theory);
                     C5440 (Multiprocessing systems)},
  keywords        = {Block iterative solvers; Cimmino method; Communication
                     networks; Heterogeneous computing environments;
                     Heterogeneous processors; Parallel scheduler; PVM 3},
  thesaurus       = {Iterative methods; Parallel algorithms; Parallel
                     machines; Scheduling; Telecommunication networks},
}

@InProceedings{Arnow:1995:DLB,
  author          = {D. M. Arnow},
  title           = {{DP}: a library for building portable, reliable
                     distributed applications},
  crossref        = {USENIX:1995:PUT},
  pages           = {235--247},
  year            = {1995},
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Comput. and Inf. Sci., Brooklyn Coll., NY,
                     USA},
  classification  = {C6110P (Parallel programming); C6115 (Programming
                     support)},
  keywords        = {Asynchronous delivery; Communication tool; Disjunctive
                     programming; Distributed processing; Distributed
                     programming; DP; Failure tolerance; Integer goal
                     programming code; Interrupt generating message;
                     Library; Message operation; Message operations; Monte
                     Carlo; Portable software building; Process creation;
                     Process management; Reliable distributed application;
                     Software package; Software portability; Software
                     support},
  thesaurus       = {Application generators; Authoring systems; Distributed
                     processing; Software fault tolerance; Software
                     libraries; Software packages; Software portability},
}

@InProceedings{Asenjo:1995:SLF,
  author =       "R. Asenjo and E. L. Zapata",
  title =        "Sparse {LU} factorization on the {Cray T3D}",
  crossref =     "Hertzberger:1995:HPM",
  pages =        "690--696",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Archit., Malaga Univ., Spain",
  classification = "C4140 (Linear algebra); C4240P (Parallel programming
                 and algorithm theory); C6110P (Parallel programming);
                 C6150N (Distributed systems software)",
  keywords =     "Compressed row storage; Cray T3D; Cyclic distribution;
                 Distributed memory machines; Dynamic data movement;
                 Fill-in; Local storage schemes; Overall efficiency;
                 Parallel algorithm; Processor mesh; PVM message passing
                 interface; Semi-ordered linked list; Sparse codes;
                 Sparse LU factorization; Sparse matrices; SPMD
                 programming model; Two-dimensional linked list",
  thesaurus =    "Cray computers; Distributed memory systems; Message
                 passing; Parallel algorithms; Parallel programming;
                 Sparse matrices",
}

@InProceedings{Ashby:1995:PPG,
  author          = {S. F. Ashby and R. D. Falgout and S. G. Smith and A.
                     F. B. Tompson},
  title           = {The parallel performance of a groundwater flow code on
                     the {Cray T3D}},
  crossref        = {Bailey:1995:PSS},
  pages           = {131--136},
  year            = {1995},
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Center for Comput. Sci. and Eng., Lawrence Livermore
                     Nat. Lab., CA, USA},
  classification  = {A0260 (Numerical approximation and analysis); A0270
                     (Computational techniques); A4755M (Flow through porous
                     media); A9240K (Groundwater); C4240P (Parallel
                     programming and algorithm theory); C6110P (Parallel
                     programming); C7340 (Geophysics computing)},
  keywords        = {Computational kernels; Cray T3D; Distributed memory
                     MIMD machines; Groundwater flow code; Parallel
                     performance; PVM message-passing library;
                     Three-dimensional heterogeneous porous media},
  thesaurus       = {Flow through porous media; Geophysics computing;
                     Groundwater; Message passing; Numerical analysis;
                     Parallel programming},
}

@InProceedings{Ayguade:1995:DUA,
  author          = {E. Ayguade and J. Garcia and M. Girones and J. Labarta
                     and J. Torres and M. Valero},
  title           = {Detecting and using affinity in an automatic data
                     distribution tool},
  crossref        = {Pingali:1995:LCP},
  pages           = {61--75},
  year            = {1995},
  bibdate         = {Sun Dec 22 10:20:45 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. d'Arquitectura de Computadors, Univ. Politecnica
                     de Catalunya, Barcelona, Spain},
  classification  = {C6110P (Parallel programming); C6115 (Programming
                     support); C6150N (Distributed systems software)},
  keywords        = {Affinity; Alignment; Alignment functions; Alignment
                     preferences; Arrays; Automatic data distribution tool;
                     Data Distribution Tool; Fortran77; Loop reference
                     patterns; Perfect Club benchmarks; Programs; Reference
                     pattern analysis; SPEC benchmarks; Static functions;
                     Tool phases},
  thesaurus       = {Parallel programming; Software tools},
}

@InProceedings{Bakhtiari:1995:APL,
  author          = {S. Bakhtiari and R. Safavi-Naini},
  title           = {Application of {PVM} to linear cryptanalysis},
  crossref        = {Gray:1995:PCT},
  pages           = {278--279},
  year            = {1995},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Wollongong Univ., NSW, Australia},
  classification  = {C4240P (Parallel programming and algorithm theory);
                     C6130S (Data security)},
  corpsource      = {Wollongong Univ., NSW, Australia},
  keywords        = {attack; Attack; block cipher algorithms; Block cipher
                     algorithms; cryptography; Data; Data Encryption
                     Standard; Encryption Standard; linear cryptanalysis;
                     Linear cryptanalysis; parallel algorithms; PVM; virtual
                     machines},
  pubcountry      = {Netherlands},
  thesaurus       = {Cryptography; Parallel algorithms; Virtual machines},
  treatment       = {T Theoretical or Mathematical},
}

@InProceedings{Barbour:1995:PIG,
  author          = {A. E. Barbour and M. F. Gabre},
  title           = {Parallel Implementation of {Gauss--Seidel} and
                     Conjugate Gradient For Solving System of Linear
                     Equations {$ A x = b $} Using {PVM}},
  crossref        = {Aityan:1995:PFI},
  pages           = {33--36},
  year            = {1995},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Math. and Comput. Sci., Georgia Southern
                     Univ., Statesboro, GA, USA},
  classification  = {B0290F (Interpolation and function approximation);
                     B0290H (Linear algebra); C4130 (Interpolation and
                     function approximation); C4140 (Linear algebra); C4240P
                     (Parallel programming and algorithm theory); C5440
                     (Multiprocessing systems)},
  corpsource      = {Dept. of Math. and Comput. Sci., Georgia Southern
                     Univ., Statesboro, GA, USA},
  keywords        = {algorithms; conjugate gradient methods; Conjugate
                     gradient methods; definite band matrix; equations;
                     Gauss--Seidel method; iterations; Iterations; linear;
                     Linear equations; matrix algebra; parallel; parallel
                     implementation; Parallel implementation; parallel
                     machines; positive; Positive definite band matrix; PVM;
                     solution vector; Solution vector; systematic behavior;
                     Systematic behavior},
  thesaurus       = {Conjugate gradient methods; Matrix algebra; Parallel
                     algorithms; Parallel machines},
  treatment       = {A Application; P Practical},
}

@Article{Beaumont:1995:DPG,
  author          = {P. M. Beaumont and P. T. Bradshaw},
  title           = {A distributed parallel genetic algorithm for solving
                     optimal growth models},
  journal         = j-COMP-ECONOMICS,
  volume          = {8},
  number          = {3},
  pages           = {159--179},
  month           = aug,
  year            = {1995},
  CODEN           = {CNOMEL},
  ISSN            = {0927-7099},
  ISSN-L          = {0927-7099},
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Econ., Florida State Univ., Tallahassee, FL,
                     USA},
  classification  = {C1180 (Optimisation techniques); C1290D (Systems
                     theory applications in economics and business); C4130
                     (Interpolation and function approximation); C4150
                     (Nonlinear and functional equations); C4180 (Integral
                     equations); C6150N (Distributed systems software);
                     C7120 (Financial computing); C7310 (Mathematics
                     computing)},
  fjournal        = {Computational Economics},
  keywords        = {Agent discounted utility maximization; Chebyshev
                     polynomial series expansion; Competing nodes;
                     Distributed parallel genetic algorithm; Economic
                     growth; Exact Euler equation; Finite horizon;
                     First-order conditions; Function topology;
                     Generalization; Infinite horizon; Multiple state
                     problems; Nonlinear integral equation; Optimal function
                     fitting; Parameter space searching; PVM; Single-state
                     deterministic optimal growth model; State variable
                     range; State-space searching; Taylor-Uhlig problem;
                     Workstation cluster},
  pubcountry      = {Netherlands},
  thesaurus       = {Chebyshev approximation; Distributed algorithms;
                     Economic cybernetics; Financial data processing;
                     Genetic algorithms; Integral equations; Mathematics
                     computing; Nonlinear equations; Polynomials;
                     State-space methods},
}

@Article{Beguelin:1995:REP,
  author =       "Adam Beguelin and Jack Dongarra and Al Geist and
                 Robert Manchek and Vaidy Sunderam",
  title =        "Recent Enhancements to {PVM}",
  journal =      j-IJSAHPC,
  volume =       "9",
  number =       "2",
  pages =        "108--127",
  month =        "Summer",
  year =         "1995",
  CODEN =        "IJSCFG",
  ISSN =         "1078-3482",
  ISSN-L =       "1078-3482",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib; UnCover
                 library database",
  abstract =     "This paper presents new features of PVM, a popular
                 standard for writing parallel programs that execute
                 over networks of heterogeneous machines. Although PVM
                 has become an important infrastructure for parallel
                 programmers, we continue to develop the system based
                 both on user feedback and our own research interests.
                 In this paper we present new communications routines
                 and briefly characterize their performance. We describe
                 new extensible services that allow advanced users to
                 customize certain aspects of the default PVM
                 functionality. An overview of shared-memory PVM
                 optimizations is presented. PVM's new tracing facility
                 and a graphical console that utilizes this capability
                 are described. Finally, we discuss future extensions to
                 PVM now under investigation.",
  acknowledgement = ack-nhfb,
  affiliation =  "Carnegie Mellon Univ",
  affiliationaddress = "Pittsburgh, PA, USA",
  classification = "722.1; 722.2; 722.4; 723.1; 723.1.1; 921.5; C5440
                 (Multiprocessing systems); C6110P (Parallel
                 programming); C6115 (Programming support); C6150N
                 (Distributed systems software)",
  corpsource =   "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  fjournal =     "International Journal of Supercomputer Applications
                 and High Performance Computing",
  journalabr =   "Int J Supercomput Appl High Perform Comput",
  keywords =     "Advanced users; advanced users; communications;
                 Communications routines; Computer architecture;
                 Computer networks; Computer programming languages;
                 Computer software; console; Data communication systems;
                 Data storage equipment; evaluation; Extensible
                 services; extensible services; graphical; Graphical
                 console; Heterogeneous machines; heterogeneous
                 machines; Message passing; Optimization; parallel
                 machines; Parallel processing systems; Parallel
                 programmers; parallel programmers; parallel
                 programming; Parallel programs; parallel programs;
                 Parallel virtual machine; Parallel Virtual Machine;
                 Performance; performance; PVM; routines; Shared memory;
                 shared memory; Shared-memory PVM optimizations;
                 shared-memory PVM optimizations; software libraries;
                 software performance; software standards; Standard;
                 standard; systems; Tracing facility; tracing facility;
                 User feedback; user feedback; virtual machines",
  thesaurus =    "Parallel machines; Parallel programming; Shared memory
                 systems; Software libraries; Software performance
                 evaluation; Software standards; Virtual machines",
  treatment =    "A Application; P Practical",
}

@InProceedings{Bendrider:1995:SME,
  author =       "M. Bendrider and J.-M. Leclercq",
  title =        "Second-Order {M{\o}ller--Plesset} and {Epstein--Nesbet}
                 Corrections to the Molecular Charge Density:
                 Distributed Computing on a Cluster of Heterogeneous
                 Workstations with the {PVM} System",
  crossref =     "Bernardi:1995:CCE",
  pages =        "73--??",
  year =         "1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Benkner:1995:VFA,
  author          = {S. Benkner},
  title           = {{Vienna Fortran 90} --- an advanced data parallel
                     language},
  crossref        = {Malyshkin:1995:PCT},
  pages           = {142--156},
  year            = {1995},
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Inst. for Software Technol., Wien Univ., Austria},
  classification  = {C6110P (Parallel programming); C6120 (File
                     organisation); C6140D (High level languages)},
  keywords        = {Advanced data parallel language; Data distribution;
                     Distributed memory parallel computers; Explicit user
                     control; Pointer objects; Shared memory programming
                     paradigm; User defined data structures; Vienna Fortran
                     90},
  thesaurus       = {FORTRAN; Message passing; Parallel languages; Shared
                     memory systems; Storage management},
}

@Article{Berendsen:1995:GMP,
  author =       "H. J. C. Berendsen and D. van der Spoel and R. van
                 Drunen",
  title =        "{GROMACS}: a message-passing parallel molecular
                 dynamics implementation",
  journal =      j-COMP-PHYS-COMM,
  volume =       "91",
  number =       "1--3",
  pages =        "43--56",
  month =        sep,
  year =         "1995",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Bioson Res. Inst., Groningen Univ., Netherlands",
  classification = "A3115 (General mathematical and computational
                 developments for atoms and molecules); A3420
                 (Interatomic and intermolecular potentials and forces);
                 A3425 (Intramolecular energy transfer; intramolecular
                 dynamics; dynamics of van der Waals molecules); A3520D
                 (Interatomic distances and angles in molecules); A6120J
                 (Computer simulation of static and dynamic liquid
                 behaviour); A8710 (General, theoretical, and
                 mathematical biophysics); A8715 (Molecular biophysics);
                 C6110P (Parallel programming); C7320 (Physics and
                 chemistry computing); C7330 (Biology and medical
                 computing)",
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
  keywords =     "Analysis tools; Aqueous environment;
                 Biomacromolecules; Buckingham potentials; Charge
                 groups; Conversion programs; Cosine power series
                 interactions; Coulomb potentials; Custom-designed
                 32-processor ring GROMACS; Dihedral angles; Energy
                 minimization program; Fixed bonded interactions;
                 GROMACS software; GROningen MAchine for Chemical
                 Simulation; Interprocessor communication; Lennard-Jones
                 potentials; Message-passing parallel molecular dynamics
                 implementation; Molecular dynamics program; Parallel
                 message-passing implementation; Parallel system;
                 Particle decomposition; Pressure scaling; Rectangular
                 periodic boundary conditions; Temperature scaling;
                 Variable nonbonded pair interactions",
  pubcountry =   "Netherlands",
  thesaurus =    "Biology computing; Bond angles; Chemistry computing;
                 Digital simulation; Electric potential; Lennard-Jones
                 potential; Molecular biophysics; Molecular dynamics
                 method; Parallel programming",
}

@Article{Bernaschi:1995:DRP,
  author =       "Massimo Bernaschi and Giorgio Richelli",
  title =        "Development and results of {PVMe} on the {IBM 9076
                 SP1}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "29",
  number =       "1",
  pages =        "75--83",
  day =          "15",
  month =        aug,
  year =         "1995",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1995.1107",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:58 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1995.1107/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1995.1107/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C6115 (Programming
                 support)",
  corpsource =   "IBM Eur. Center for Sci. and Eng. Comput., Rome,
                 Italy",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "IBM 9076 SP1; IBM's AIX implementation; message
                 passing; parallel machines; PARMACS; programming
                 environments; PVM message passing programming model;
                 PVMe",
  treatment =    "A Application; P Practical",
}

@InProceedings{Bernaschi:1995:PEI,
  author =       "M. Bernaschi and G. Richelli",
  title =        "{PVMe}: an enhanced implementation of {PVM} for the
                 {IBM 9076 SP2}",
  crossref =     "Hertzberger:1995:HPM",
  pages =        "461--471",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "IBM Eur. Center for Sci. and Eng. Comput., Rome,
                 Italy",
  classification = "C6110P (Parallel programming); C6150N (Distributed
                 systems software)",
  corpsource =   "IBM Eur. Center for Sci. and Eng. Comput., Rome,
                 Italy",
  keywords =     "IBM 9076 SP2; IBM's AIX implementation; message
                 passing; Message passing programming model; message
                 passing programming model; parallel programming; PVMe",
  pubcountry =   "Germany",
  thesaurus =    "Message passing; Parallel programming",
  treatment =    "P Practical",
}

@InProceedings{Bickham:1995:POM,
  author = {J. L. Bickham},
  title = {Parallel ocean modeling using {Glenda}},
  crossref = {ACM:1995:PAS},
  pages = {58--63},
  year = {1995},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Univ. of Southern Mississippi, Hattiesburg, MS, USA},
  classification = {C6110P (Parallel programming); C6150G (Diagnostic,
    testing, debugging and evaluating systems); C7340 (Geophysics
    computing)},
  keywords = {Array dependency; Debugging methods; Glenda; Ocean dynamics;
    Parallel ocean modeling; Parallel version; Parallelization process;
    PVM; SWEM},
  thesaurus = {Geophysics computing; Oceanographic techniques; Parallel
    programming; Program debugging},
}

@InProceedings{Bischof:1995:CSM,
  author = {C. Bischof and S. Huss-Lederman and Xiaobai Sun and A. Tsao
    and T. Turnbull},
  title = {A Case Study of {MPI}: Portable and Efficient Libraries},
  crossref = {Bailey:1995:PSS},
  pages = {728--733},
  year = {1995},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
    USA},
  classification = {C6110B (Software engineering techniques); C6110P
    (Parallel programming); C6150N (Distributed systems software)},
  conftitle = {Proceedings of the Seventh SIAM Conference on Parallel
    Processing for Scientific Computing},
  corpsource = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
    USA},
  keywords = {1 dimensional broadcast; 1 Dimensional broadcast; ANL/MS MPI
    implementation; Argonne National Laboratory/Mississippi State;
    broadcasting; case study; Case study; CM5; Delta; efficient
    libraries; Efficient libraries; Intel Delta; message passing;
    Message Passing Interface standard; MPI; MPI based implementations;
    MPI broadcast collective operation; native NX message passing
    systems; Native NX message passing systems; optimized versions;
    Optimized versions; Paragon; parallel programming; parallel
    programming system; Parallel programming system; portable public
    domain version; Portable public domain version; software libraries;
    software portability; software standards; SP1},
  thesaurus = {Broadcasting; Message passing; Parallel programming;
    Software libraries; Software portability; Software standards},
  treatment = {P Practical},
}

@InProceedings{Bjorge:1995:ISS,
  author =       "D. Bj{\o}rge",
  title =        "Implementation of the semi-implicit scheme in a
                 message passing version of {HIRLAM} (weather
                 forecasting)",
  crossref =     "Hoffmann:1995:CAP",
  pages =        "75--90",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Norwegian Meteorol. Inst., Oslo, Norway",
  classification = "A9260X (Weather analysis and prediction); C4185
                 (Finite element analysis); C6110P (Parallel
                 programming); C7340 (Geophysics computing)",
  keywords =     "Algorithms; Atmosphere; Cray T3D SHMEM; DNMI; HIRLAM;
                 Intel NX; Message passing; Meteorology; MPP; Numerical
                 model; Parallel iterative Helmholtz solver; Parallel
                 programming; PVM; Semi-implicit scheme; Semiimplicit
                 scheme; Time integration scheme; Weather forecasting",
  thesaurus =    "Digital simulation; Finite element analysis; Iterative
                 methods; Message passing; Numerical analysis; Parallel
                 processing; Parallel programming; Weather forecasting",
}

@InProceedings{Blaszczyk:1995:PCE,
  author = {A. Blaszczyk and Z. Andjelic and P. Levin and A. Ustundag},
  title = {Parallel computation of electric fields in a heterogeneous
    workstation cluster},
  crossref = {Hertzberger:1995:HPM},
  pages = {606--611},
  year = {1995},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Corp. Res., Asea Brown Boveri AG, Heidelberg, Germany},
  classification = {C5440 (Multiprocessing systems); C7310 (Mathematics
    computing); C7320 (Physics and chemistry computing); C7430 (Computer
    engineering)},
  keywords = {3D boundary element code; Benchmark problems; Design
    process; Dynamic load balancing; Electric fields; Heterogeneous
    workstation cluster; Parallel computation; PVM communication
    software},
  thesaurus = {Electric fields; Mathematics computing; Parallel
    processing; Physics computing; Virtual machines},
}

@InProceedings{Boianov:1995:DLC,
  author = {L. Boianov and I. Jelly},
  title = {Distributed logic circuit simulation on a network of
    workstations},
  crossref = {IEEE:1995:PEW},
  pages = {304--310},
  year = {1995},
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Lab. for Distributed Syst. and Comput. Networks, Acad. of
    Sci., Sofia, Bulgaria},
  classification = {B1130B (Computer-aided circuit analysis and design);
    C5210B (Computer-aided logic design); C6150N (Distributed systems
    software); C7410D (Electronic engineering computing)},
  keywords = {Digital circuits; Distributed digital logic simulation;
    Logic circuit simulation; Logical simulation algorithms; Parallel
    Virtual Machine},
  thesaurus = {Digital simulation; Distributed processing; Logic CAD},
}

@InProceedings{Boryczko:1995:NIC,
  author = {I. Boryczko and J. Kitowski and J. Moscinski and A.
    Leszczynski},
  title = {Numerically intensive computing as a benchmark for parallel
    computer architectures},
  crossref = {Hertzberger:1995:HPM},
  pages = {118--123},
  year = {1995},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Inst. of Comput. Sci., Cracow, Poland},
  classification = {C4100 (Numerical analysis); C5220P (Parallel
    architecture); C5440 (Multiprocessing systems); C5470 (Performance
    evaluation and testing); C5620L (Local area networks); C5670
    (Network performance); C7320 (Physics and chemistry computing)},
  keywords = {Computer network; Execution time; Multiprocessors;
    Numerically intensive computing; Parallel architectures; Parallel
    computer architecture benchmark; PVM environment; Vector
    supercomputers},
  thesaurus = {Local area networks; Molecular dynamics method;
    Multiprocessing systems; Parallel architectures; Performance
    evaluation; Physics computing; Vector processor systems},
}

@InProceedings{Branca:1995:CBH,
  author = {A. Branca and M. Ianigro and A. Distante},
  title = {A comparison between {HPF} and {PVM} for data parallel
    algorithms on a cluster of workstations using a high speed network},
  crossref = {Hertzberger:1995:HPM},
  pages = {930--931},
  year = {1995},
  bibdate = {Thu Feb 29 17:59:11 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Briscolini:1995:PID,
  author =       "M. Briscolini",
  title =        "A parallel implementation of a {3-D} pseudospectral
                 based code on the {IBM 9076} scalable {POWERparallel}
                 system",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "21",
  number =       "11",
  pages =        "1849--1862",
  day =          "29",
  month =        nov,
  year =         "1995",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Apr 14 12:05:41 MDT 1997",
  bibsource =    "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1995&volume=21&issue=11;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1995&volume=21&issue=11&aid=1027",
  acknowledgement = ack-nhfb,
  classification = "A4725 (Turbulent flows, convection, and heat
                 transfer); B0290Z (Other numerical methods); C4190
                 (Other numerical methods); C5440 (Multiprocessing
                 systems); C7310 (Mathematics computing); C7320 (Physics
                 and chemistry computing)",
  corpsource =   "IBM ECSEC, Eur. Center for Sci. and Eng. Comput.,
                 Roma, Italy",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  keywords =     "3-D; 3-D pseudospectral based code; computational
                 kernels; fast Fourier transforms; high intensive
                 numerical simulations; homogeneous turbulent flows; IBM
                 9076 scalable POWERparallel system; implementations;
                 mathematics computing; message; message passing; MPL;
                 numerical analysis; parallel 3-D FFTs; parallel
                 distributed memory architecture; parallel
                 implementation; parallel interfaces; parallel
                 processing; passing; physics computing; PVMe;
                 turbulence",
  treatment =    "A Application; P Practical",
}

@TechReport{Bruck:1995:EMPa,
  author =       "Jehoshua Bruck",
  title =        "Efficient message passing interface ({MPI}) for
                 parallel computing on clusters of workstations",
  type =         "Research report",
  number =       "RJ 9925 (87305)",
  institution =  inst-IBM-WATSON,
  address =      inst-IBM-WATSON:adr,
  pages =        "31",
  year =         "1995",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Parallel computing on clusters of workstations and
                 personal computers has very high potential, since it
                 leverages existing hardware and software. Parallel
                 programming environments offer the user a convenient
                 way to express parallel computation and communication.
                 In fact, recently, a Message Passing Interface (MPI)
                 has been proposed as an industrial standard for writing
                 `portable' message-passing parallel programs. The
                 communication part of MPI consists of the usual
                 point-to-point communication as well as collective
                 communication. However, existing implementations of
                 programming environments for clusters are built on top
                 of a point-to-point communication layer (send and
                 receive) over local area networks (LANs) and, as a
                 result, suffer from poor performance in the collective
                 communication part. In this paper, we present an
                 efficient design and implementation of the collective
                 communication part in MPI that is optimized for
                 clusters of workstations. Our system consists of two
                 main components: the MPI-CCL layer that includes the
                 collective communication functionality of MPI and a
                 User-level Reliable Transport Protocol (URTP) that
                 interfaces with the LAN Data-link layer and leverages
                 the fact that the LAN is a broadcast medium. Our system
                 is integrated with the operating system via an
                 efficient kernel extension mechanism that we developed.
                 The kernel extension significantly improves the
                 performance of our implementation as it can handle part
                 of the communication overhead without involving user
                 space. We have implemented our system on a collection
                 of IBM RS/6000 workstations connected via a 10Mbit
                 Ethernet LAN. Our performance measurements are taken
                 from real scientific applications that run in a
                 parallel mode by means of the MPI. The hypothesis
                 behind our design is that system's performance will be
                 bounded by interactions between the kernel and user
                 space rather than by the bandwidth delivered by the LAN
                 Data-Link Layer. Our results indicate that the
                 performance of our MPI Broadcast (on top of Ethernet)
                 is about twice as fast as a recently published software
                 implementation of broadcast on top of ATM.",
  acknowledgement = ack-nhfb,
  annote =       "December 13, 1995.",
  institutes =   "IBM Research Division",
  keywords =     "Computer interfaces",
}

@InProceedings{Bruck:1995:EMPb,
  author =       "Jehoshua Bruck and Danny Dolev and Ching-Tien Ho and
                 Marcel-Catalin Rosu and Ray Strong",
  title =        "Efficient {Message Passing Interface} ({MPI}) for
                 Parallel Computing on Clusters of Workstations",
  crossref =     "ACM:1995:SAA",
  pages =        "64--73",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "An efficient design and implementation of the
                 collective communication part in a Message Passing
                 Interface (MPI) that is optimized for clusters of
                 workstations is described. The system which consists of
                 two main components, the MPI-CCL layer and a User-level
                 Reliable Transport Protocol (URTP), is integrated with
                 the operating system via an efficient kernel extension
                 mechanism. The system is then implemented on a
                 collection of IBM RS\slash 6000 workstations connected
                 via a 10Mbit Ethernet LAN. Results indicate that the
                 performance of the MPI Broadcast (on top of Ethernet)
                 is about twice as fast as a recently published software
                 implementation of broadcast on top of ATM.",
  acknowledgement = ack-nhfb,
  affiliation =  "California Inst of Technology",
  affiliationaddress = "Pasadena, CA, USA",
  classification = "716.1; 722.2; 722.3; 722.4; 723.1; C5470
                 (Performance evaluation and testing); C5610N (Network
                 interfaces); C5620L (Local area networks); C5640
                 (Protocols); C5670 (Network performance); C6110P
                 (Parallel programming); C6115 (Programming support);
                 C6150N (Distributed systems software)",
  conference =   "Proceedings of the 7th Annual ACM Symposium on
                 Parallel Algorithms and Architectures, SPAA'95",
  conftitle =    "Proceedings of Seventh Annual ACM Symposium on
                 Parallel Algorithms and Architectures",
  corpsource =   "California Inst. of Technol., Pasadena, CA, USA",
  journalabr =   "Annu ACM Symp Parallel Algorithms Archit",
  keywords =     "10 Mbit/s; application program interfaces; broadcast
                 medium; Broadcast medium; collective communication;
                 Collective communication; communication overhead;
                 Communication overhead; Computer operating systems;
                 Computer software portability; Computer systems
                 programming; Computer workstations; Data communication
                 systems; Ethernet; Ethernet LAN; IBM RS/6000
                 workstations; industrial standard; Industrial standard;
                 Interfaces (computer); kernel extension mechanism;
                 Kernel extension mechanism; LAN Data link-layer; LAN
                 data-link layer; Local area networks; local area
                 networks; message passing; message passing interface;
                 Message passing interface; MPI CCL layer; MPI-CCL
                 layer; network interfaces; Network protocols; operating
                 system; Operating system; parallel computing; Parallel
                 computing; Parallel processing systems; Parallel
                 programming; parallel programming; performance;
                 Performance; performance evaluation; Personal
                 computers; point-to-point communication; Point-to-point
                 communication; portable message-passing parallel
                 programs; Portable message-passing parallel programs;
                 Program processors; programming environments;
                 Programming environments; scientific programs;
                 Scientific programs; software libraries; software
                 portability; Systems analysis; transport protocols;
                 URTP; user space; User space; user-level reliable
                 transport protocol; User-level reliable transport
                 protocol; workstation clusters; Workstation clusters;
                 workstations",
  meetingaddress = "Santa Barbara, CA, USA",
  meetingdate =  "Jul 17--19 1995",
  meetingdate2 = "07/17--19/95",
  numericalindex = "Bit rate 1.0E+07 bit/s",
  sponsor =      "ACM SIGACT; ACM SIGARCH; EATCS",
  sponsororg =   "ACM; EATCS",
  thesaurus =    "Application program interfaces; Local area networks;
                 Message passing; Network interfaces; Parallel
                 programming; Performance evaluation; Programming
                 environments; Software libraries; Software portability;
                 Transport protocols; Workstations",
  treatment =    "P Practical",
}

@InProceedings{Bubeck:1995:DSC,
  author =       "T. Bubeck and M. Hiller and W. K{\"u}chlin and W.
                 Rosenstiel",
  title =        "Distributed symbolic computation with {DTS}",
  crossref =     "Ferreira:1995:PAI",
  pages =        "231--248",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Wilhelm-Schickard-Inst. f{\"u}r Inf., T{\"u}bingen
                 Univ., Germany",
  classification = "C4130 (Interpolation and function approximation);
                 C4240P (Parallel programming and algorithm theory);
                 C6110P (Parallel programming); C6115 (Programming
                 support); C6130S (Data security); C6150N (Distributed
                 systems software)",
  keywords =     "Anonymous compute servers; Asynchronous RPC
                 abstraction; C threads interface; Cryptosystem;
                 Distributed symbolic computation; Distributed threads
                 system; DTS; Fork/join parallel programming; Highly
                 data-dependent algorithm parallelisation; Irregular
                 algorithm parallelisation; Multiprocessor workstation;
                 Multithreading; Parallel long integer multiplication;
                 Parallel multi-variate polynomial resultant
                 computation; Performance results; Programming
                 environment; PVM; Shared memory threads",
  thesaurus =    "Arithmetic; Cryptography; Distributed memory systems;
                 Multiprocessing programs; Multiprocessing systems;
                 Parallel algorithms; Parallel programming; Polynomials;
                 Programming environments; Remote procedure calls;
                 Shared memory systems; Software performance evaluation;
                 Symbol manipulation; Workstations",
}

@Article{Bunge:1995:MCM,
  author = {Hans-Peter Bunge and John R. Baumgardner},
  title = {Mantle convection modeling on parallel virtual machines},
  journal = j-COMPUT-PHYS,
  volume = {9},
  number = {2},
  pages = {207--??},
  month = mar,
  year = {1995},
  CODEN = {CPHYE2},
  DOI = {https://doi.org/10.1063/1.168525},
  ISSN = {0894-1866 (print), 1558-4208 (electronic)},
  ISSN-L = {0894-1866},
  bibdate = {Wed Apr 10 08:45:53 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/computphys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  URL = {https://aip.scitation.org/doi/10.1063/1.168525},
  acknowledgement = ack-nhfb,
  ajournal = {Comput. Phys},
  fjournal = {Computers in Physics},
  journal-URL = {https://aip.scitation.org/journal/cip},
}

@InProceedings{Carreira:1995:DEL,
  author = {J. Carreira and L. Silva and J. G. Silva},
  title = {On the design of {Eilean}: a {Linda-like} library for {MPI}},
  crossref = {IEEE:1995:PSP},
  pages = {175--184},
  year = {1995},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Coimbra Univ., Portugal},
  classification = {C6110B (Software engineering techniques); C6110P
    (Parallel programming); C6115 (Programming support); C6140D (High
    level languages)},
  conftitle = {Proceedings Scalable Parallel Libraries Conference},
  corpsource = {Coimbra Univ., Portugal},
  keywords = {access policies; Access policies; communication system;
    Communication system; distribution policies; Distribution policies;
    Eilean; hierarchical distribution; Hierarchical distribution;
    hierarchical partitioning scheme; Hierarchical partitioning scheme;
    Linda; Linda-like library; message passing; message passing
    standard; Message passing standard; MPI; parallel languages;
    parallel library; Parallel library; parallel programming;
    programming paradigm; Programming paradigm; run-time system;
    Run-time system; software libraries; software library; Software
    library; software portability; Software portability; tuple mapping
    task; Tuple mapping task; tuple space; Tuple space},
  sponsororg = {Mississippi State Univ.; NSF},
  thesaurus = {Message passing; Parallel languages; Parallel programming;
    Software libraries; Software portability},
  treatment = {P Practical},
}

@TechReport{Casanova:1995:PPM,
  author = {Henri Casanova and Jack Dongarra and Weicheng Jiang},
  title = {The Performance of {PVM} on {MPP} Systems},
  type = {Technical report},
  institution = inst-UTK,
  address = inst-UTK:adr,
  month = aug,
  year = {1995},
  bibdate = {Tue Feb 26 10:10:44 2002},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.netlib.org/utk/papers/pvmmpp.ps;
    http://www.netlib.org/utk/papers/pvmmpp/pvmmpp.html;
    http://www.netlib.org/utk/people/JackDongarra/pdf/pvmmpp.pdf},
  acknowledgement = ack-nhfb,
}

@Article{Casas:1995:MMT,
  author =       "Jeremy Casas and Dan L. Clark and Ravi Konuru and
                 Steve W. Otto and Robert M. Prouty and Jonathan
                 Walpole",
  title =        "{MPVM}: a Migration Transparent Version of {PVM}",
  journal =      j-COMP-SYS,
  volume =       "8",
  number =       "2",
  pages =        "171--216",
  month =        "Spring",
  year =         "1995",
  CODEN =        "CMSYE2",
  ISSN =         "0895-6340",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Oregon Graduate Inst. of Sci. and Technol., Beaverton,
                 OR, USA",
  classification = "C5440 (Multiprocessing systems); C6150N (Distributed
                 systems software); C7430 (Computer engineering)",
  corpsource =   "Oregon Graduate Inst. of Sci. and Technol., Beaverton,
                 OR, USA",
  fjournal =     "Computing Systems",
  keywords =     "Dynamic process migration; dynamic process migration;
                 General-purpose workstation environments;
                 general-purpose workstation environments; Idle-cycles;
                 idle-cycles; message passing; Message-passing parallel
                 machine; message-passing parallel machine; Migratable
                 PVM; Migration-transparent version;
                 migration-transparent version; MPVM; Off-loading;
                 off-loading; Parallel computations; parallel
                 computations; parallel machines; parallel programming;
                 Parallel Virtual Machine; Performance; performance;
                 software performance evaluation; Unix; UNIX-based
                 computers; virtual machines; workstations",
  thesaurus =    "Message passing; Parallel machines; Parallel
                 programming; Software performance evaluation; Unix;
                 Virtual machines; Workstations",
  treatment =    "P Practical",
}

@InProceedings{Cavender:1995:APN,
  author =       "Mark E. Cavender and Xiaodong Zhang",
  title =        "Asynchronous {PVM} Network Computing",
  crossref =     "Bailey:1995:PSS",
  pages =        "772--773",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "High Performance Comput. and Software Lab., Texas
                 Univ., San Antonio, TX, USA",
  classification = "C5620L (Local area networks); C5640 (Protocols);
                 C6150N (Distributed systems software)",
  corpsource =   "High Performance Comput. and Software Lab., Texas
                 Univ., San Antonio, TX, USA",
  keywords =     "asynchronous PVM network computing; Asynchronous PVM
                 network computing; blocking; Blocking; incoming message
                 buffer; Incoming message buffer; interrupt; Interrupt;
                 local area networks; message passing; Message passing;
                 operation; performance penalty; Performance penalty;
                 processors; Processors; program; Program; protocols;
                 PVM daemon; receiver; Receiver; stop and; Stop and wait
                 protocol; synchronized; Synchronized operation; user
                 program; User program; wait protocol",
  thesaurus =    "Local area networks; Message passing; Protocols",
  treatment =    "T Theoretical or Mathematical",
}

@InProceedings{Cavender:1995:SSA,
  author =       "Mark E. Cavender and Xiaodong Zhang",
  title =        "Software support for asynchronous computing across
                 networks",
  crossref =     "IEEE:1995:PNA",
  pages =        "376--382",
  year =         "1995",
  CODEN =        "PSICD2",
  ISSN =         "0730-6512",
  bibdate =      "Fri May 24 09:58:00 MDT 1996",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 95CB35838.",
  abstract =     "This paper describes the design and implementation of
                 asynchronous communication library routines for
                 distributed computing across networks of workstations.
                 The new system is based on modifications of the
                 existing PVM message-passing environment. An intensive
                 and comparative study of synchronous, asynchronous and
                 non-blocking communication protocols is addressed in
                 terms of their design, implementation and applications.
                 Experimental performance comparisons of an application
                 program using the three communication protocols on a
                 network of workstations, are also presented. The
                 experimental results show the power of the asynchronous
                 communication library and the effective enhancements of
                 the PVM message-passing environment.",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of Texas at San Antonio",
  affiliationaddress = "San Antonio, TX, USA",
  classification = "722.2; 722.4; 723.1; 723.5; C5620L (Local area
                 networks); C5640 (Protocols); C6110B (Software
                 engineering techniques); C6115 (Programming support);
                 C6150N (Distributed systems software)",
  conference =   "Proceedings of the 19th Annual International Computer
                 Software and Applications Conference COMPSAC '95",
  journalabr =   "Proc IEEE Comput Soc Int Comput Software Appl Conf",
  keywords =     "Application program; Asynchronous communication
                 library routine design; Asynchronous communication
                 library routines; Asynchronous communication protocols;
                 Asynchronous computing; Computer aided software
                 engineering; Computer networks; Computer workstations;
                 Data communication systems; Distributed computer
                 systems; Distributed computing; Modified PVM
                 message-passing environment; Network protocols;
                 Nonblocking communication protocols; Parallel virtual
                 machine (PVM) message passing environment; Performance
                 comparisons; Software support; Synchronous
                 communication protocols; Workstation network",
  meetingaddress = "Dallas, TX, USA",
  meetingdate =  "Aug 9--11 1995",
  meetingdate2 = "08/09--11/95",
  sponsor =      "IEEE",
  thesaurus =    "Local area networks; Message passing; Network
                 operating systems; Operating systems [computers];
                 Protocols; Software libraries; Software performance
                 evaluation; Workstations",
}

@InProceedings{Chamaret:1995:PFE,
  author =       "B. Chamaret and H. Cherifi and S. Ubeda",
  title =        "Parallel filter estimation maximisation algorithm for
                 segmentation on a {LAN} of workstation",
  crossref =     "Bailey:1995:PSS",
  pages =        "68--69",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "TSI Lab., Univ. Jean-Monnet, Saint-Etienne, France",
  classification = "B6140C (Optical information, image and video signal
                 processing); B6210L (Computer communications); C1250
                 (Pattern recognition); C4240P (Parallel programming and
                 algorithm theory); C5260B (Computer vision and image
                 processing techniques); C5620L (Local area networks)",
  keywords =     "Bayesian segmentation algorithm; Grey level images;
                 Image segmentation; LAN of workstation; Parallel filter
                 estimation maximisation algorithm; Parallel Virtual
                 Machine package; Portable parallel application",
  thesaurus =    "Bayes methods; Image segmentation; Local area
                 networks; Parallel algorithms",
}

@InProceedings{Chang:1995:EPCa,
  author =       "S.-L. Chang and D. H. C. Du and J. Hsieh and M. Lin",
  title =        "Enhanced {PVM} Communications Over a High-Speed Local
                 Area Network",
  crossref =     "Alnuweiri:1995:PHF",
  pages =        "37--46",
  year =         "1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Chang:1995:EPCb,
  author =       "Sheue-Ling Chang and David Hung-Chang Du and Jenwei
                 Hsieh and Rose P. Tsang and Mengjou Lin",
  title =        "Enhanced {PVM} Communications over a {High-Speed
                 LAN}",
  journal =      j-IEEE-PAR-DIST-TECH,
  volume =       "3",
  number =       "3",
  pages =        "20--32",
  month =        "Fall",
  year =         "1995",
  CODEN =        "IPDTEX",
  DOI =          "https://doi.org/10.1109/M-PDT.1995.414841",
  ISSN =         "1063-6552 (print), 1558-1861 (electronic)",
  ISSN-L =       "1063-6552",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Performance results of PVM over a local ATM network
                 show the availability of much greater communication
                 bandwidth over traditional LANs such as Ethernet.
                 Realizing the full potential of high-speed networks,
                 therefore, will require further improvements in both
                 hardware and software components of network I/O
                 subsystems.",
  abstract2 =    "Enhanced Parallel Virtual Machine (PVM) communications
                 over a high speed local area network is described.
                 Performance results of PVM over a local asynchronous
                 transfer mode (ATM) show the availability of much
                 greater communication bandwidth over traditional LANs.
                 Application-level performance, however, still lags far
                 behind the capabilities of the physical medium.
                 Realizing the full potential of high-speed networks,
                 therefore, will require further improvements in both
                 hardware and software components of network input\slash
                 output subsystems.",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Univ. of Minnesota",
  affiliationaddress = "Minneapolis, MN, USA",
  classification = "716; 722.2; 722.3; 722.4; 723; 731; B6210L (Computer
                 communications); B6230 (Switching centres and
                 equipment); C5620L (Local area networks); C5670
                 (Network performance)",
  corpsource =   "Dept. of Comput. Sci., Minnesota Univ., Minneapolis,
                 MN, USA",
  fjournal =     "IEEE parallel and distributed technology: systems and
                 applications",
  journalabr =   "IEEE Parallel Distrib Technol",
  keywords =     "application-level performance; Application-level
                 performance; Asynchronous transfer mode; asynchronous
                 transfer mode; Communication bandwidth; communication
                 bandwidth; Computer architecture; Data communication
                 systems; evaluation; Fiber distributed data interface;
                 high-; high-speed LAN; High-speed LAN; High-speed
                 networks; Interfaces (computer); Local area networks;
                 local area networks; local ATM network; Local ATM
                 network; Multicasting measurements; Network I/O
                 subsystems; network I/O subsystems; Parallel processing
                 systems; Parallel virtual machine (PVM); Parallel
                 virtual machine (pvm); Performance; performance; PVM
                 communications; speed networks; Systems analysis;
                 systems analysis",
  thesaurus =    "Asynchronous transfer mode; Local area networks;
                 Performance evaluation",
  treatment =    "A Application; P Practical",
}

@InProceedings{Chapple:1995:PUL,
  author =       "S. R. Chapple and L. J. Clarke",
  title =        "The {Parallel Utilities Library}",
  crossref =     "IEEE:1995:PSP",
  pages =        "21--30",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Parallel Comput. Center, Edinburgh Univ., UK",
  classification = "C5440 (Multiprocessing systems); C6110B (Software
                 engineering techniques); C6110P (Parallel programming);
                 C6150N (Distributed systems software); C7480
                 (Production engineering computing)",
  keywords =     "AEA Technology; Domain decomposition; Industrial
                 applications; Library modules; Message passing;
                 Message-Passing Interface; MPI; Parallel scalable I/O;
                 Parallel systems; Parallel Utilities Library; PUL;
                 Rolls-Royce; Shell UK; Task parallelism; Unstructured
                 mesh applications",
  thesaurus =    "Industries; Message passing; Parallel programming;
                 Software libraries; Software portability; Software
                 reusability; Subroutines",
}

@InProceedings{Clematis:1995:PPH,
  author =       "A. Clematis and B. Falcidieno and D. F. Prieto and M.
                 Spagnuolo",
  title =        "Parallel processing on heterogeneous networks for
                 {GIS} applications",
  crossref =     "Hertzberger:1995:HPM",
  pages =        "67--72",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "IMA-CNR, Genoa, Italy",
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C6150N (Distributed systems
                 software); C7840 (Geography and cartography
                 computing)",
  keywords =     "Applications parallelization; Geographic information
                 systems; Heterogeneous networks; Linda; Network-based
                 parallel computing; Parallel program development;
                 Performance; Portable communication libraries; PVM;
                 Software portability; Software reusability",
  thesaurus =    "Geographic information systems; Parallel processing;
                 Software libraries; Software portability; Software
                 reusability",
}

@InProceedings{Clemencon:1995:AEP,
  author =       "C. Cl{\'e}men{\c{c}}on and A. Endo and J. Fritscher and
                 A. M{\"u}ller and R. R{\"u}hl and B. J. N. Wylie",
  title =        "The `{Annai}' environment for portable distributed
                 parallel programming",
  crossref =     "El-Rewini:1995:PTE",
  pages =        "242--251 (vol. 2)",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Swiss Federal Inst. of Technol., Zurich, Switzerland",
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C6115 (Programming support);
                 C6150C (Compilers, interpreters and other processors);
                 C6150G (Diagnostic, testing, debugging and evaluating
                 systems); C6150N (Distributed systems software); C6180
                 (User interfaces)",
  keywords =     "Annai environment; Application developers; Common user
                 interface; Distributed memory parallel processors;
                 Dynamic data distributions; Feedback; Functionality
                 enhancements; High Performance Fortran compiler;
                 High-level data-parallel programming; Interactive
                 performance monitor; Language extensions; Low-level
                 machine interface; Low-level message-passing
                 programming; Message Passing Interface; Performance
                 analyzer; Performance results; Portability; Portable
                 distributed parallel programming environment;
                 Source-level debugger; Target hardware architecture;
                 Tool prototypes; Unstructured problem parallelization",
  thesaurus =    "Distributed memory systems; FORTRAN; Message passing;
                 Parallel programming; Program compilers; Program
                 debugging; Program diagnostics; Programming
                 environments; Software performance evaluation; Software
                 portability; Software tools; User interfaces",
}

@InProceedings{Clemencon:1995:IRD,
  author =       "C. Cl{\'e}men{\c{c}}on and J. Fritscher and M. J. Meehan
                 and R. R{\"u}hl",
  title =        "An Implementation of Race Detection and Deterministic
                 Replay with {MPI}",
  crossref =     "Haridi:1995:EPP",
  pages =        "155--166",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Centro Svizzero de Calcolo Sci., Eidgenossische Tech.
                 Hochschule, Manno, Switzerland",
  classification = "C6110P (Parallel programming); C6110S (Software
                 metrics); C6115 (Programming support); C6140D (High
                 level languages); C6150C (Compilers, interpreters and
                 other processors)",
  conftitle =    "EURO-PAR '95. Parallel Processing. First International
                 EURO-PAR Conference. Proceedings",
  corpsource =   "Centro Svizzero de Calcolo Sci., Eidgenossische Tech.
                 Hochschule, Manno, Switzerland",
  keywords =     "Annai programming environment; Computational
                 efficiency; computational efficiency; Data-parallel
                 program; data-parallel program; Deterministic replay;
                 deterministic replay; FORTRAN; High Performance
                 Fortran; HPF; Integrated environment; integrated
                 environment; Joint CSCS-ETH/NEC Collaboration; message
                 passing; Message-passing program; message-passing
                 program; MPI; Parallel Debugging Tool; Parallel
                 language; parallel language; parallel programming;
                 parallelising compilers; Parallelized MPI program;
                 parallelized MPI program; PDT; Program debugging;
                 program debugging; Programming environment; programming
                 environment; programming environments; programming
                 languages; Race detection; race detection; Replaying
                 mechanism; replaying mechanism; software metrics;
                 Software performance; software performance; software
                 performance evaluation; Software tool; software tool;
                 Tracing; tracing",
  thesaurus =    "FORTRAN; Message passing; Parallel programming;
                 Parallelising compilers; Program debugging; Programming
                 environments; Programming languages; Software metrics;
                 Software performance evaluation",
  treatment =    "P Practical",
}

@InProceedings{Cooperman:1995:SBP,
  author =       "Gene Cooperman",
  title =        "{STAR\slash MPI}: binding a parallel library to
                 interactive symbolic algebra systems",
  crossref =     "Levelt:1995:IIS",
  pages =        "126--132",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Coll. of Comput. Sci., Northeastern Univ., Boston, MA,
                 USA",
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C6115 (Programming support);
                 C7310 (Mathematics computing)",
  keywords =     "GCL; GNU Common LISP; Interactive symbolic algebra
                 systems; Mathematical group theory; Parallel library;
                 STAR/MPI; Symbolic algebra",
  thesaurus =    "Parallel programming; Software libraries; Symbol
                 manipulation",
}

@InProceedings{Cooperman:1995:SMB,
  author =       "Gene Cooperman",
  title =        "{STAR\slash MPI}: Binding a Parallel Library to
                 Interactive Symbolic Algebra Systems",
  crossref =     "Levelt:1995:IIS",
  pages =        "126--132",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "This work is aimed at making parallel programming more
                 accessible to users of symbolic algebra systems and to
                 users of interactive languages in general. This is done
                 by integrating MPI (Message Passing Interface), a
                 portable, parallel message-passing library, with two
                 interactive languages: GCL (GNU Common LISP), and GAP.
                 The GAP system includes a general purpose language for
                 mathematical group theory, and LISP is the basis for
                 several general-purpose symbolic algebra systems. In
                 addition, a simple master-slave abstraction is written,
                 so that end-users need not learn any of the details of
                 the MPI function calls. This work is distinct from past
                 studies in that it provides the ability to
                 interactively create, test and modify a distributed
                 environment using the original interactive language and
                 a portable parallel library.",
  acknowledgement = ack-nhfb,
  affiliation =  "Northeastern Univ",
  affiliationaddress = "Boston, MA, USA",
  classification = "721.1; 722.2; 722.4; 723.1; 723.5; 921.1; C6110B
                 (Software engineering techniques); C6110P (Parallel
                 programming); C6115 (Programming support); C7310
                 (Mathematics computing)",
  conference =   "Proceedings of the 1995 International Symposium on
                 Symbolic and Algebraic Computation",
  conftitle =    "Proceedings of International Symposium on Symbolic and
                 Algebraic Computation. ISSAC '95",
  corpsource =   "Coll. of Comput. Sci., Northeastern Univ., Boston, MA,
                 USA",
  journalabr =   "Int Symp Symbol Algebraic Comput ISSAC Proc",
  keywords =     "Algebra; Computational methods; Computer programming;
                 Computer programming languages; Computer simulation;
                 Computer software; GCL; GNU Common LISP; Interactive
                 computer systems; Interactive languages; Interactive
                 symbolic algebra systems; interactive symbolic algebra
                 systems; Interfaces (computer); mathematical group
                 theory; Mathematical techniques; Message passing
                 interface; Parallel library; parallel library; Parallel
                 processing systems; parallel programming; software
                 libraries; STAR/MPI; symbol manipulation; symbolic
                 algebra; User interfaces",
  meetingaddress = "Montreal, Can",
  meetingdate =  "Jul 10--12 1995",
  meetingdate2 = "07/10--12/95",
  sponsororg =   "ACM",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@InProceedings{Corno:1995:PTA,
  author =       "F. Corno and P. Prinetto and M. Rebaudengo and M.
                 {Sonza Reorda} and E. Veiluva",
  title =        "A {PVM} tool for automatic test generation on parallel
                 and distributed systems",
  crossref =     "Hertzberger:1995:HPM",
  pages =        "39--44",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dipartimento di Autom. e Inf., Politecnico di Torino,
                 Italy",
  classification = "B1130B (Computer-aided circuit analysis and design);
                 B2210B (Printed circuit layout and design); C5210B
                 (Computer-aided logic design); C6110P (Parallel
                 programming); C6150N (Distributed systems software);
                 C7410D (Electronic engineering computing)",
  corpsource =   "Dipartimento di Autom. e Inf., Politecnico di Torino,
                 Italy",
  keywords =     "algorithm; ATPG; automatic test generation; Automatic
                 test generation; automatic test pattern; Automatic test
                 pattern generation; automatic test software; circuit
                 CAD; CM-5; DEC Alpha AXP farm; distributed programming;
                 Distributed programming; distributed system;
                 Distributed system; efficient algorithm; Efficient
                 algorithm; electric circuit; Electric circuit;
                 electrical circuit; Electrical circuit; electronic CAD;
                 Electronic CAD; electronic circuit; Electronic circuit;
                 GATTO*; generation; genetic; Genetic algorithm;
                 integrated circuit; large sequential circuits; Large
                 sequential circuits; logic CAD; logic testing;
                 parallel; parallel architectures; parallel programming;
                 Parallel programming; portability; Portability;
                 portable message-passing libraries; Portable
                 message-passing libraries; programming; PVM tool;
                 software libraries; testing; VLSI; VLSI technology",
  pubcountry =   "Germany",
  thesaurus =    "Automatic test software; Circuit CAD; Integrated
                 circuit testing; Logic CAD; Logic testing; Parallel
                 architectures; Parallel programming; Software
                 libraries; VLSI",
  treatment =    "P Practical",
  xxauthor =     "F. Corno and P. Prinetto and M. Rebaudeng and M.
                 {Sonza Reorda} and E. Veiluva",
}

@Article{DAmbra:1995:CBC,
  author =       "P. D'Ambra and G. Giunta",
  title =        "Concurrent banded {Cholesky} factorization on
                 workstation networks using {PVM}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "21",
  number =       "3",
  pages =        "487--494",
  day =          "10",
  month =        mar,
  year =         "1995",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dipartimento di Matematica e Applicazioni, Naples
                 Univ., Italy",
  classification = "C4140 (Linear algebra); C4240P (Parallel programming
                 and algorithm theory); C6110P (Parallel programming);
                 C7310 (Mathematics computing)",
  corpsource =   "Dipartimento di Matematica e Applicazioni, Naples
                 Univ., Italy",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  keywords =     "application programs; Application programs; banded
                 symmetric positive-definite matrix; Banded symmetric
                 positive-definite matrix; cluster; Cluster of
                 workstations; concurrent banded Cholesky factorization;
                 Concurrent banded Cholesky factorization; heterogeneous
                 processors; Heterogeneous processors; linear algebra;
                 mathematics computing; networks; of workstations;
                 optical fiber links; Optical fiber links; parallel
                 programming; Parallel Virtual Machine; software system;
                 Software system; virtual machines; workstation;
                 Workstation networks; workstations",
  pubcountry =   "Netherlands",
  thesaurus =    "Linear algebra; Mathematics computing; Parallel
                 programming; Virtual machines; Workstations",
  treatment =    "A Application; P Practical",
}

@InProceedings{Davies:1995:NPE,
  author =       "Gregory Davies and Norman Matloff",
  title =        "Network-Specific Performance Enhancements for {PVM}",
  crossref =     "IEEE:1995:PFI",
  pages =        "205--210",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "PVM, a message-passing software system for parallel
                 processing, is used on a wide variety of processor
                 platforms, but this portability restricts execution
                 speed. The work here will address this problem mainly
                 in the context of Ethernet-based systems, proposing two
                 PVM enhancements for such systems. The first
                 enhancement exploits the fact that an Ethernet has
                 broadcast capability. Since unenhanced PVM must, to
                 keep portability, avoid using broadcast, execution
                 speed is sacrificed. In addition, the larger the
                 system, the larger the sacrifice in speed. A solution
                 to this problem is presented. The second enhancement is
                 intended for use in applications in which many
                 concurrent tasks finish at the same time, and thus
                 simultaneously try to transmit to a master process. On
                 an Ethernet, this produces excessively long random
                 backoffs, reducing program speed. An enhancement,
                 termed `programmed backoff,' is proposed.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tandem Computers",
  affiliationaddress = "Cupertino, CA, USA",
  classification = "716.1; 722; 722.3; 722.4; 723; 922.2; C5440
                 (Multiprocessing systems); C5620L (Local area
                 networks); C6150N (Distributed systems software)",
  conference =   "Proceedings of the 4th IEEE International Symposium on
                 High Performance Distributed Computing",
  journalabr =   "IEEE Int Symp High Perform Distrib Comput Proc",
  keywords =     "Algorithms; Broadcast capability; Broadcasting;
                 Communication channels (information theory); Computer
                 hardware; Computer networks; Computer software
                 portability; Concurrent tasks; Data communication
                 systems; Ethernet-based systems; Hypercube systems;
                 Network-specific performance enhancements;
                 Message-passing software system; Parallel processing;
                 Parallel processing systems; Program speed; Programmed
                 backoff; PVM; Statistical methods",
  meetingaddress = "Washington, DC, USA",
  meetingdate =  "Aug 2--4 1995",
  meetingdate2 = "08/02--04/95",
  sponsor =      "IEEE",
  thesaurus =    "Local area networks; Message passing; Parallel
                 processing",
}

@InProceedings{Davies:1995:NSP,
  author =       "Gregory Davies and Norman Matloff",
  title =        "Network-specific performance enhancements for {PVM}",
  crossref =     "IEEE:1995:PFI",
  pages =        "205--210",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C5620L (Local area
                 networks); C6150N (Distributed systems software)",
  corpsource =   "Tandem Comput. Inc., Cupertino, CA, USA",
  keywords =     "broadcast capability; concurrent tasks;
                 Ethernet-based; local area networks; message passing;
                 message-passing; network-specific performance
                 enhancements; parallel processing; program; programmed
                 backoff; PVM; software system; speed; systems",
  sponsororg =   "IEEE Tech. Committee on Distrib. Process.; Northeast
                 Parallel Architectures Centre (NPAC) at Syracuse Univ.;
                 ACM SIGCOMM; Rome Lab",
  treatment =    "A Application; P Practical",
}

@InProceedings{Decker:1995:TDU,
  author =       "T. Decker and R. Diekmann and R. L{\"u}ling and B.
                 Monien",
  title =        "Towards developing universal dynamic mapping
                 algorithms",
  crossref =     "IEEE:1995:PSI",
  pages =        "456--459",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Math. and Comput. Sci., Paderborn Univ.,
                 Germany",
  classification = "C5220P (Parallel architecture); C5620 (Computer
                 networks and techniques); C6150J (Operating systems)",
  keywords =     "Bidding-algorithms; Distributed runtime systems;
                 Dynamically generated tasks; Execution-times;
                 MIMD-system; MPI; Optimal K-values; PVM; Randomly
                 selected processors; Universal dynamic mapping
                 algorithms; Universally applicable strategy",
  thesaurus =    "Distributed processing; Resource allocation",
}

@techreport{Dongarra:1995:IMS,
  author = {Jack Dongarra and Steve W. Otto and Marc Snir and
    David Walker},
  title = {An Introduction to the {MPI Standard}},
  type = {Technical report},
  number = {CS-95-274},
  institution = inst-UTK,
  address = inst-UTK:adr,
  month = jan,
  year = {1995},
  bibdate = {Tue Feb 26 10:10:44 2002},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note = {Appears in CACM \cite{Dongarra:1996:MPS}.},
  url = {http://www.netlib.org/tennessee/ut-cs-95-274.ps;
    http://www.netlib.org/utk/papers/intro-mpi/intro-mpi.html;
    http://www.netlib.org/utk/people/JackDongarra/pdf/ut-cs-95-274.pdf},
  acknowledgement = ack-nhfb,
}

@Article{Dongarra:1995:PBC,
  author =       "J. J. Dongarra and T. Hey",
  title =        "The {ParkBench} benchmark collection",
  journal =      j-SUPERCOMPUTER,
  volume =       "11",
  number =       "2--3",
  pages =        "94--114",
  month =        jun,
  year =         "1995",
  CODEN =        "SPCOEL",
  ISSN =         "0168-7875",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Tennessee Univ., Knoxville, TN,
                 USA",
  classification = "C5440 (Multiprocessing systems); C6150C (Compilers,
                 interpreters and other processors)",
  fjournal =     "Supercomputer",
  keywords =     "Application kernels; Compact research applications;
                 Hierarchical structure; Low-level benchmarks; ParkBench
                 benchmark collection; Performance characteristics;
                 Synthetic compiler benchmark suite",
  pubcountry =   "Netherlands",
  thesaurus =    "Parallel processing; Program compilers",
}

@inproceedings{Dowaji:1995:LBS,
  author = {S. Dowaji and C. Roucairol},
  title = {Load balancing strategy and priority of tasks in
    distributed environments},
  crossref = {IEEE:1995:CPI},
  pages = {15--22},
  year = {1995},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Lab. PRiSM, Univ. de Versailles-St-Quentin, France},
  classification = {C1160 (Combinatorial mathematics); C1180
    (Optimisation techniques); C4240P (Parallel programming
    and algorithm theory); C6150N (Distributed systems
    software)},
  keywords = {Branch and bound algorithms; Distributed environments;
    Graph theory; Load balancing; Lower bound; VCP},
  thesaurus = {Combinatorial mathematics; Distributed algorithms;
    Optimisation; Resource allocation},
}

@article{Dragovitsch:1995:PPS,
  author = {P. Dragovitsch and X. Zhao and L. C. Dennis and G. A.
    Riccardi},
  title = {{PVMGeant} --- a Parallel Simulation Code for the
    {CLAS} Detector at {CEBAF}},
  journal = j-IJSAHPC,
  volume = {9},
  number = {2},
  pages = {128--137},
  month = {Summer},
  year = {1995},
  coden = {IJSCFG},
  issn = {1078-3482},
  bibdate = {Tue Feb 18 09:07:32 MST 1997},
  bibsource = {Compendex database;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib; UnCover
    library database},
  abstract = {Due to the need for extensive and detailed simulations
    of the CEBAF Large Acceptance Spectrometer (CLAS), the
    Monte-Carlo code CLASGeant was transferred to a
    heterogeneous computing cluster and has been linked to
    the Parallel Virtual Machine (PVM) message-passing
    library. The resulting simulation package, PvmGeant,
    achieves an almost linear speedup in physics-event
    simulation. This article describes modifications to the
    original GEANT code, its integration with PVM, and
    performance tests that were conducted at the computing
    cluster at The Supercomputing Computations Research
    Institute at Florida State University. Particular
    attention has been given to measuring the effect of
    different data structures on the cost of network
    communication between nodes.},
  acknowledgement = ack-nhfb,
  affiliation = {Supercomput. Comput. Res. Inst., Florida State Univ.},
  affiliationaddress = {Tallahassee, FL, USA},
  classification = {722.4; 723.1; 723.2; 723.5; 922.2; 941.3},
  fjournal = {International Journal of Supercomputer Applications
    and High Performance Computing},
  journalabr = {Int J Supercomput Appl High Perform Comput},
  keywords = {CEBAF large acceptance spectrometer (CLAS);
    Computational complexity; Computer networks; Computer
    simulation; Computer software; Computing cluster;
    Continuous electron beam accelerator facility (CEBAF);
    Data communication systems; Data structures; Message
    passing library; Monte Carlo methods; Parallel
    processing systems; Parallel virtual machine; Software
    package CLASGeant; Software package PvmGeant;
    Spectrometers},
}

@inproceedings{Edjlali:1995:DPP,
  author = {G. Edjlali and G. Agrawal and A. Sussman and J.
    Saltz},
  title = {Data parallel programming in an adaptive environment},
  crossref = {IEEE:1995:PIP},
  pages = {827--832},
  year = {1995},
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Dept. of Comput. Sci., Maryland Univ., College Park,
    MD, USA},
  classification = {C6110P (Parallel programming); C6115 (Programming
    support)},
  keywords = {Adaptive environment; Communication patterns; Data
    parallel programming; Data redistribution; Loop bounds;
    Message passing; Multiblock Navier--Stokes solver;
    Network of workstations; Performance results; PVM;
    Runtime library; Runtime support},
  thesaurus = {Message passing; Parallel programming; Programming
    environments},
}

@article{Fan:1995:DMP,
  author = {W. C. Fan and J. A. {Halbleib, Sr.}},
  title = {Distributed multitasking {ITS} with {PVM}},
  journal = j-TRANS-AM-NUCL-SOC,
  volume = {72},
  number = {????},
  pages = {146--147},
  month = {????},
  year = {1995},
  coden = {TANSAO},
  issn = {0003-018X},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Sandia Nat. Labs., Albuquerque, NM, USA},
  classification = {A0250 (Probability theory, stochastic processes, and
    statistics); A0540 (Fluctuation phenomena, random
    processes, and Brownian motion); A2820H (Neutron
    diffusion); C1140G (Monte Carlo methods); C7470
    (Nuclear engineering computing)},
  conflocation = {Philadelphia, PA, USA; 25-29 June 1995},
  conftitle = {1995 Annual Meeting of American Nuclear Society
    (papers in summary form only received)},
  corpsource = {Sandia Nat. Labs., Albuquerque, NM, USA},
  fjournal = {Transactions of the American Nuclear Society},
  keywords = {distributed multitasking ITS; Distributed multitasking
    ITS; engineering computing; ITS Version 3.0; Monte
    Carlo methods; neutron transport theory; nuclear; PVM
    communication software; transport codes; Transport
    codes},
  thesaurus = {Monte Carlo methods; Neutron transport theory; Nuclear
    engineering computing},
  treatment = {P Practical; T Theoretical or Mathematical},
}

@InProceedings{Fang:1995:PMS,
  author =       "Niandong Fang and H. Burkhart",
  title =        "{PEMPI} --- from {MPI} standard to programming
                 environment",
  crossref =     "IEEE:1995:PSP",
  pages =        "31--38",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Inf., Basel Univ., Switzerland",
  classification = "C5440 (Multiprocessing systems); C6110P (Parallel
                 programming); C6115 (Programming support); C6150N
                 (Distributed systems software)",
  conftitle =    "Proceedings Scalable Parallel Libraries Conference",
  corpsource =   "Dept. of Inf., Basel Univ., Switzerland",
  keywords =     "Basel Algorithm Classification Scheme; higher
                 abstractions; Higher abstractions; integrated
                 environment; Integrated environment; large scale
                 message passing applications; Large scale message
                 passing applications; machine best-fit implementation;
                 Machine best-fit implementation; message passing;
                 Message Passing Interface; message passing programs;
                 Message passing programs; message passing systems;
                 Message passing systems; MPI standard; parallel
                 programming; parallel programs; Parallel programs;
                 PEMPI; portability; Portability; programmability;
                 Programmability; programmer oriented abstractions;
                 Programmer oriented abstractions; programming
                 environment; Programming environment; programming
                 environments; software standards; software tools;
                 system-oriented level; System-oriented level; widely
                 used standard; Widely used standard",
  sponsororg =   "Mississippi State Univ.; NSF",
  thesaurus =    "Message passing; Parallel programming; Programming
                 environments; Software standards; Software tools",
  treatment =    "P Practical",
}

@inproceedings{Ferrari:1995:TDC,
  author = {A. J. Ferrari and V. S. Sunderam},
  title = {{TPVM}: distributed concurrent computing with
    lightweight processes},
  crossref = {IEEE:1995:PFI},
  pages = {211--218},
  year = {1995},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Dept. of Comput. Sci., Virginia Univ.,
    Charlottesville, VA, USA},
  classification = {C5440 (Multiprocessing systems); C6150N (Distributed
    systems software)},
  keywords = {Data dependencies; Data-driven scheduling model;
    Distributed concurrent computing; Experimental
    auxiliary subsystem; Explicit message passing model;
    Library interface; Lightweight processes; Load balance;
    Parallelism; Processor utilization; Scheduling;
    SPMD-style algorithms; Threads-oriented PVM; TPVM},
  thesaurus = {Message passing; Parallel processing; Scheduling},
}

@Article{Fineberg:1995:IMM,
  author =       "Samuel A. Fineberg",
  title =        "Implementing multidisciplinary and multi-zonal
                 applications using {MPI}",
  journal =      j-FRONTIERS-MASS-PAR-COMP-CONF-PROC,
  pages =        "496--503",
  month =        "????",
  year =         "1995",
  bibdate =      "Fri May 24 09:57:40 MDT 1996",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 95TH8024.",
  abstract =     "Multidisciplinary and multi-zonal applications are
                 codes where two or more distinct parallel programs or
                 copies of a single program are utilized to model a
                 single problem. To support such applications, a program
                 can be divided into several single program multiple
                 data stream (SPMD) applications, each of which solves
                 the equations for a single physical discipline or grid
                 zone. These applications are bound together to form a
                 single multidisciplinary or multi-zonal program in
                 which the constituent parts communicate via
                 point-to-point message passing routines. In this report
                 it is shown that the new Message Passing Interface
                 (MPI) standard is a viable portable library for
                 implementing the message passing portion of
                 multidisciplinary applications. Further, with the
                 extension of a portable loader, fully portable
                 multidisciplinary application programs can be
                 developed. Finally, the performance of MPI is compared
                 to that of some native message passing libraries. This
                 comparison shows that MPI can be implemented to deliver
                 performance commensurate with native message passing
                 libraries.",
  acknowledgement = ack-nhfb,
  affiliation =  "NASA Ames Research Cent",
  affiliationaddress = "Moffett Field, CA, USA",
  classification = "722.2; 722.3; 722.4; 723.1; 723.2; 921.6",
  conference =   "Proceedings of the 5th Symposium on the Frontiers of
                 Massively Parallel Computation",
  fjournal =     "Frontiers of Massively Parallel Computation ---
                 Conference Proceedings",
  journalabr =   "Front Massively Parallel Comput Conf Proc",
  keywords =     "Codes (symbols); Computational methods; Computer
                 software; Computer software portability; Data
                 communication systems; Data handling; Interfaces
                 (computer); Mathematical models; Message passing;
                 Multidisciplinary program; Multiprogramming; Multizonal
                 program; Parallel processing systems; Resource
                 allocation; Single program multiple data stream;
                 Storage allocation (computer); Supervisory and
                 executive programs",
  meetingaddress = "McLean, VA, USA",
  meetingdate =  "Feb 6--9 1995",
  meetingdate2 = "02/06--09/95",
  sponsor =      "IEEE Computer Society",
}

@inproceedings{Ford:1995:NNN,
  author = {Brian Ford},
  title = {The New {NAG} Numerical {PVM} Library (or {A} New
    Parallel Numerical Library Based on {PVM})},
  crossref = {IFIP:1995:KWC},
  pages = {??--??},
  year = {1995},
  bibdate = {Wed Jan 24 07:11:31 2001},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.nsc.liu.se/~boein/ifip/kyoto/workshop-info/proceedings/ford/ford1.html},
  acknowledgement = ack-nhfb,
}

@inproceedings{Franke:1995:AAV,
  author = {E. A. Franke and S. D. Huffman and W. M. Carter and J.
    P. Baumgartner and D. J. Wenzel},
  title = {{AVTP} --- an architecture for visualization using
    remote parallel\slash distributed computing},
  crossref = {Grinstein:1995:VDE},
  journal = j-PROC-SPIE,
  volume = {2410},
  pages = {230--237},
  year = {1995},
  coden = {PSISDG},
  issn = {0277-786X (print), 1996-756X (electronic)},
  issn-l = {0277-786X},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Southwest Res. Inst., San Antonio, TX, USA},
  classification = {C6130B (Graphics techniques); C6150N (Distributed
    systems software)},
  fjournal = {Proceedings of the SPIE --- The International Society
    for Optical Engineering},
  keywords = {Advanced Visualization Technology Project; AVTP; Data
    cache server; High speed data networks; Image
    generation library; Image specification toolset;
    Message passing; Parallel processor machines; PVM;
    Remote computer resources; Remote distributed
    computing; Remote parallel computing; Research and
    development; Scalable computing; Shared memory;
    Streamlines; Surfaces; System architectures; Vector
    fields; Visualization architecture; Visualization
    tools},
  thesaurus = {Cache storage; Data visualisation; File servers;
    Message passing; Multiprocessing programs},
}

@inproceedings{Franke:1995:MIS,
  author = {H. Franke and P. Hochschild and P. Pattnaik and J.-P.
    Prost and M. Snir},
  title = {{MPI} on {IBM SP1\slash SP2}: current status and
    future directions},
  crossref = {IEEE:1995:PSP},
  pages = {39--48},
  year = {1995},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {IBM Thomas J. Watson Res. Center, Yorktown Heights,
    NY, USA},
  classification = {C5440 (Multiprocessing systems); C6110B (Software
    engineering techniques); C6110P (Parallel programming);
    C6150N (Distributed systems software)},
  conftitle = {Proceedings Scalable Parallel Libraries Conference},
  corpsource = {IBM Thomas J. Watson Res. Center, Yorktown Heights,
    NY, USA},
  keywords = {distributed memory systems; future directions; Future
    directions; IBM computers; IBM Scalable Power PARALLEL
    1; IBM Scalable Power PARALLEL 2; IBM SP1/SP2; initial
    performance measurements; Initial performance
    measurements; message passing; MPI; native EUI library;
    Native EUI library; parallel programming; prototype
    implementation; Prototype implementation; software
    libraries; software standards},
  sponsororg = {Mississippi State Univ.; NSF},
  thesaurus = {Distributed memory systems; IBM computers; Message
    passing; Parallel programming; Software libraries;
    Software standards},
  treatment = {P Practical},
}

@techreport{Franke:1995:MPEa,
  author = {Hubertus Franke},
  title = {{MPI} programming environment for {IBM SP1\slash
    SP2}},
  type = {Research report},
  number = {RC 19991 (88480)},
  institution = inst-IBM-WATSON,
  address = inst-IBM-WATSON:adr,
  pages = {9},
  year = {1995},
  bibdate = {Mon Jan 15 15:32:53 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract = {In this paper we discuss an implementation of the
    Message Passing Interface standard (MPI) for the IBM
    Scalable Power PARALLEL 1 and 2 (SP1, SP2). Key to a
    reliable and efficient implementation of a message
    passing library on these machines is the careful design
    of a UNIX-Socket like layer in the user space with
    controlled access to the communication adapters and
    with adequate recovery and flow control. The
    performance of this implementation is at the same level
    as the IBM-proprietary message passing library (MPL).
    We also show that in the IBM SP1 and SP2 we achieve
    integrated tracing ability, where both system events,
    such as context switches and page fault etc., and MPI
    related activities are traced, with minimal overhead to
    the application program, thus presenting application
    programmers the trace of all the events that ultimately
    affect efficiency of a parallel program.},
  acknowledgement = ack-nhfb,
  keywords = {Parallel programming (Computer science)},
}

@InProceedings{Franke:1995:MPEb,
  author =       "Hubertus Franke and C. Eric Wu and Michel Riviere and
                 Pratap Pattnaik and Marc Snir",
  title =        "{MPI} Programming Environment for {IBM SP1\slash
                 SP2}",
  crossref =     "IEEE:1995:PIC",
  pages =        "127--135",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 95CH35784.",
  abstract =     "In this paper we discuss an implementation of the
                 Message Passing Interface standard (MPI) for the IBM
                 Scalable Power PARALLEL 1 and 2 (SP1, SP2). Key to a
                 reliable and efficient implementation of a message
                 passing library on these machines is the careful design
                 of a UNIX-Socket like layer in the user space with
                 controlled access to the communication adapters and
                 with adequate recovery and flow control. The
                 performance of this implementation is at the same level
                 as the IBM-proprietary message passing library (MPL).
                 We also show that in the IBM SP1 and SP2 we achieve
                 integrated tracing ability, where both system events,
                 such as context switches and page fault etc., and MPI
                 related activities are traced, with minimal overhead to
                 the application program, thus presenting application
                 programmers the trace of all the events that ultimately
                 affect efficiency of a parallel program.",
  acknowledgement = ack-nhfb,
  affiliation =  "IBM T. J. Watson Research Cent",
  affiliationaddress = "Yorktown Heights, NY, USA",
  classification = "722.2; 722.4; 723; 723.1; 723.1.1; C5440
                 (Multiprocessing systems); C6110P (Parallel
                 programming); C6115 (Programming support); C6150N
                 (Distributed systems software)",
  conference =   "Proceedings of the 15th International Conference on
                 Distributed Computing Systems",
  conftitle =    "Proceedings of 15th International Conference on
                 Distributed Computing Systems",
  corpsource =   "IBM Thomas J. Watson Res. Center, Yorktown Heights,
                 NY, USA",
  journalabr =   "Proc Int Conf Distrib Comput Syst",
  keywords =     "adequate recovery; Adequate recovery; application
                 program interfaces; application programmers;
                 Application programmers; Application programming
                 interfaces; communication adapters; Communication
                 adapters; Computer architecture; Computer programming;
                 Computer software; Computer system recovery; Fault
                 tolerant computer systems; flow control; Flow control;
                 IBM Scalable Power PARALLEL 1; IBM SP1/SP2; integrated
                 tracing ability; Integrated tracing ability; Interfaces
                 (computer); message passing; Message passing interface
                 standard; message passing interface standard; Message
                 passing library; MPI programming environment; page
                 fault; Page fault; Parallel processing systems;
                 parallel program; Parallel program; parallel
                 programming; Performance; Power parallel system;
                 programming environments; Software engineering; Systems
                 analysis; UNIX; UNIX-Socket like layer",
  meetingaddress = "Vancouver, Can",
  meetingdate =  "May 30--Jun 2 1995",
  meetingdate2 = "05/30--06/02/95",
  sponsor =      "IEEE Computer Society",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  thesaurus =    "Application program interfaces; Message passing;
                 Parallel programming; Programming environments",
  treatment =    "A Application; P Practical",
}

@inproceedings{Ge:1995:DHA,
  author = {Yuzhen Ge and L. T. Watson and E. G. {Collins, Jr.}},
  title = {Distributed homotopy algorithms for {$ H^2 / H^\infty
    $} controller synthesis},
  crossref = {Bailey:1995:PSS},
  pages = {84--89},
  year = {1995},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Dept. of Math. and Comput. Sci., Butler Univ.,
    Indianapolis, IN, USA},
  classification = {C1310 (Control system analysis and synthesis
    methods); C3220 (Controllers); C4240P (Parallel
    programming and algorithm theory)},
  keywords = {Distributed homotopy algorithms; H/sup 2//H/sup
    infinity / controller synthesis; High performance
    computation; Industrial design environment; Jacobian
    matrix computation; Mixed-norm controller synthesis
    problem; Parallel Virtual Machine; UNIX workstations},
  thesaurus = {Control system synthesis; Controllers; Distributed
    algorithms},
}

@inproceedings{Gentzsch:1995:STP,
  author = {W. Gentzsch and U. Block and F. Ferstl},
  title = {Software tools for parallel computers and workstation
    clusters},
  crossref = {Ferenczi:1995:PAH},
  pages = {23--42},
  year = {1995},
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {GENIAS Software GmbH, Neutraubling, Germany},
  classification = {C5220P (Parallel architecture); C5430
    (Microcomputers); C5440 (Multiprocessing systems);
    C5540 (Terminals and graphic displays); C6115
    (Programming support)},
  keywords = {Benchmark results; EXPRESS; FORGE 90; GENIAS; Intel
    iPSC/860; NCUBE/2; Parallel codes; Parallel computers;
    Parsytec Multicluster; PVM/MPI; Software tools;
    Workstation clusters; XHPF},
  thesaurus = {Parallel processing; Software tools; Workstations},
}

@InProceedings{Gianuzzi:1995:UPI,
  author =       "V. Gianuzzi and F. Merani",
  title =        "Using {PVM} to Implement a Distributed Dependable
                 Simulation System",
  crossref =     "IEEE:1995:PEW",
  pages =        "529--535",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dipartimento di Inf. e Sci. dell'Inf., Genoa Univ.,
                 Italy",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5470 (Performance evaluation and testing); C6150N
                 (Distributed systems software)",
  corpsource =   "Dipartimento di Inf. e Sci. dell'Inf., Genoa Univ.,
                 Italy",
  keywords =     "algorithms; checkpoint-restart mechanism;
                 Checkpoint-restart mechanism; distributed; distributed
                 algorithms; Distributed algorithms; distributed
                 dependable simulation system; Distributed dependable
                 simulation system; fault tolerant; fault tolerant
                 computing; Fault tolerant mechanisms; high speed; High
                 speed interconnection; interconnection; mechanisms;
                 message; passing; PVM routines; simulations modelling;
                 Simulations modelling; synchronisation; Virtual Time",
  sponsororg =   "Euromicro; Assoc. Italiana per Inf. Calcolo Autom",
  thesaurus =    "Distributed algorithms; Fault tolerant computing;
                 Message passing; Synchronisation",
  treatment =    "P Practical",
}

@inproceedings{Gillich:1995:FPP,
  author = {S. Gillich and B. Ries},
  title = {Flexible, portable performance analysis for {PARMACS}
    and {MPI}},
  crossref = {Hertzberger:1995:HPM},
  pages = {937--??},
  year = {1995},
  bibdate = {Thu Feb 29 17:59:11 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Greenfield:1995:OPS,
  author = {J. Greenfield},
  title = {An Overview of the {PVM} Software System},
  crossref = {IEEE:1995:ISE},
  pages = {17--23},
  year = {1995},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5220P (Parallel architecture); C5440
    (Multiprocessing systems); C6150N (Distributed systems
    software)},
  corpsource = {Dept. of Electr. and Comput. Eng., New Mexico Univ.,
    Albuquerque, NM, USA},
  keywords = {analysis; debugging; machine; message passing;
    parallel processing; Parallel Virtual Machine;
    performance; PVM; software system; virtual; virtual
    machines; visualization tools},
  treatment = {P Practical},
}

@InProceedings{Gropp:1995:DPM,
  author =       "W. Gropp and E. Lusk",
  title =        "Dynamic process management in an {MPI} setting",
  crossref =     "IEEE:1995:PSI",
  pages =        "530--533",
  year =         "1995",
  CODEN =        "PSPDF8",
  ISSN =         "1063-6374",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 95TB8131.",
  abstract =     "We describe an architecture for the runtime
                 environment for parallel applications as prelude to
                 describing how parallel application might interface to
                 their environment in a portable way. We propose
                 extensions to the Message-Passing Interface (MPI)
                 Standard that provide for dynamic process management,
                 including spawning of new processes by a running
                 application and connection to existing processes to
                 support client\slash server applications. Such
                 extensions are needed if more of the runtime
                 environment for parallel programs is to be accessible
                 to MPI programs or to be themselves written using MPI.
                 The extensions proposed here are motivated by real
                 applications and fit cleanly with existing concepts of
                 MPI. No changes to the existing MPI Standard are
                 proposed, thus all present MPI programs will run
                 unchanged.",
  acknowledgement = ack-nhfb,
  affiliation =  "Div. of Math. and Comput. Sci., Argonne Nat. Lab.",
  affiliationaddress = "Argonne, IL, USA",
  classification = "722.2; 722.3; 722.4; 723.1; 902.2; C5220P (Parallel
                 architecture); C5440 (Multiprocessing systems); C6150N
                 (Distributed systems software)",
  conference =   "Proceedings of the 1995 7th IEEE Symposium on Parallel
                 and Distributed Processing",
  conftitle =    "Proceedings of Seventh IEEE Symposium on Parallel and
                 Distributed Processing",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  journalabr =   "IEEE Symp Parallel Distrib Process Proc",
  keywords =     "Client/server applications; Computer architecture;
                 Computer networks; Computer software; Computer systems
                 programming; Computer workstations; Data communication
                 systems; dynamic process management; Dynamic process
                 management; Interfaces (computer); message passing;
                 Message-passing interface; MPI setting; parallel
                 applications; Parallel applications; parallel
                 processing; Parallel processing systems; Parallel
                 programs; Process control; process management; Process
                 management; Real time systems; Resource allocation;
                 runtime environment; Runtime environment; Runtime
                 environments; Scheduling; Standards",
  meetingaddress = "San Antonio, TX, USA",
  meetingdate =  "Oct 25--28 1995",
  meetingdate2 = "10/25--28/95",
  sponsor =      "IEEE",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Comput.
                 Architecture; IEEE Comput. Soc. Tech. Committee on
                 Distributed Process.; IEEE Comput. Soc. Dallas
                 Chapter",
  thesaurus =    "Message passing; Parallel processing",
  treatment =    "P Practical",
}

%%% Journal article by Gropp and Lusk on early experience with the IBM
%%% SP1 at Argonne.  The journal field is the string abbreviation
%%% j-IBM-SYS-J, which must be defined elsewhere in this file; fjournal
%%% records the expanded journal name.
%%% NOTE(review): the `*' after SP1 and SP2 in the abstract looks like
%%% a leftover trademark/footnote marker from the source --- confirm
%%% before removing.
@Article{Gropp:1995:EIS,
  author =       "W. D. Gropp and E. Lusk",
  title =        "Experiences with the {IBM SP1}",
  journal =      j-IBM-SYS-J,
  volume =       "34",
  number =       "2",
  pages =        "249--262",
  year =         "1995",
  CODEN =        "IBMSA7",
  ISSN =         "0018-8670",
  bibdate =      "Tue Mar 19 17:38:46 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.research.ibm.com/journal/sj34-2.html#seven",
  abstract =     "One of the first IBM parallel processing
                 computers---the SP1*---and the largest, with 128 nodes,
                 was installed in 1993 at Argonne National Laboratory.
                 It took only days, not months, to prepare for and
                 migrate applications to this parallel supercomputer,
                 demonstrating that high performance, parallelism, and
                 portability can coexist. This paper describes the early
                 experiences with the SP1 at Argonne, which provide
                 lessons for supercomputer system designers and users
                 alike. We explore what features of software technology
                 and system architecture enabled immediate and
                 successful use of the SP1. The paper concludes with a
                 brief indication of why the move to the SP2* software
                 environment using the SP2 communication adapters, the
                 use of the emerging Message-Passing Interface standard,
                 and the continued use of the SP1 processors have been
                 successful.",
  acknowledgement = ack-nhfb,
  affiliation =  "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C6110P (Parallel
                 programming); C6150N (Distributed systems software)",
  fjournal =     "IBM Systems Journal",
  keywords =     "High performance; IBM parallel processing computers;
                 IBM SP1; Message passing interface standard; Parallel
                 supercomputer; Parallelism; Portability; Software
                 technology; SP1 processors; SP2 communication adapters;
                 SP2 software environment; Supercomputer system
                 designers; System architecture",
  language =     "English",
  pubcountry =   "USA",
  thesaurus =    "IBM computers; Message passing; Parallel
                 architectures; Parallel machines; Parallel
                 programming",
}

%%% Conference paper by Gropp and Lusk on the 1994 MPI Implementors'
%%% Workshop.  Fields not given here are inherited through crossref
%%% from the parent entry IEEE:1995:PSP, which classic BibTeX requires
%%% to appear after this entry in the file.
@InProceedings{Gropp:1995:IMM,
  author =       "W. Gropp and E. Lusk",
  title =        "Implementing {MPI}: the 1994 {MPI Implementors'
                 Workshop}",
  crossref =     "IEEE:1995:PSP",
  pages =        "55--59",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C6150N (Distributed systems
                 software)",
  conftitle =    "Proceedings Scalable Parallel Libraries Conference",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  keywords =     "implementation process; Implementation process;
                 message passing; MPI implementation effort; parallel
                 computing; Parallel computing; parallel library;
                 Parallel library; parallel programming; software
                 libraries; software standards; standard message-passing
                 library interface; Standard message-passing library
                 interface; subroutines",
  sponsororg =   "Mississippi State Univ.; NSF",
  thesaurus =    "Message passing; Parallel programming; Software
                 libraries; Software standards; Subroutines",
  treatment =    "P Practical",
}

%%% Conference paper by Gropp, Karrels, and Lusk on the MPE graphics
%%% library.  The title joins its two parts with an em-dash (---); a
%%% single hyphen would wrongly read as the compound word
%%% "graphics-scalable".  Fields not given here are inherited through
%%% crossref from the parent entry IEEE:1995:PSP.
@InProceedings{Gropp:1995:MGX,
  author =       "W. Gropp and E. Karrels and E. Lusk",
  title =        "{MPE} graphics---scalable {X11} graphics in {MPI}",
  crossref =     "IEEE:1995:PSP",
  pages =        "49--54",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  classification = "C6110P (Parallel programming); C6130B (Graphics
                 techniques); C6150N (Distributed systems software)",
  conftitle =    "Proceedings Scalable Parallel Libraries Conference",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  keywords =     "communication patterns; Communication patterns;
                 communication traffic; Communication traffic; computer
                 graphics; library based message passing; Library based
                 message passing; message passing; MPE graphics; MPI;
                 MPI implementation; MPI message passing standard; MPI
                 Standard; parallel graphics library; Parallel graphics
                 library; parallel graphics operations; Parallel
                 graphics operations; parallel graphics routines;
                 Parallel graphics routines; parallel programming;
                 parallel programs; Parallel programs; parallel
                 semantics; Parallel semantics; programming libraries;
                 Programming libraries; scalable X11 graphics; Scalable
                 X11 graphics; semantics; Semantics; software standards;
                 subroutines; user control; User control; X-based
                 parallel graphics library",
  sponsororg =   "Mississippi State Univ.; NSF",
  thesaurus =    "Computer graphics; Message passing; Parallel
                 programming; Software standards; Subroutines",
  treatment =    "P Practical",
}

%%% Conference paper by Gropp and Lusk giving an overview of the MPI
%%% standard.  Only the paper-specific fields are stored here; the
%%% proceedings data are inherited through crossref from the parent
%%% entry Dongarra:1995:HPC.
@InProceedings{Gropp:1995:MMI,
  author =       "W. Gropp and E. Lusk",
  title =        "The {MPI} Message-Passing Interface Standard: Overview
                 and Status",
  crossref =     "Dongarra:1995:HPC",
  pages =        "265--270",
  year =         "1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% Conference paper on a distributed Smith-Waterman sequence
%%% comparison implemented with PVM on a workstation cluster.  Fields
%%% not given here are inherited through crossref from the parent entry
%%% IEEE:1995:PIP.  The DOI field holds the full resolver URL.
%%% NOTE(review): journalabr reads "IEEE Symp Parallel Distrib Process
%%% Proc", but the conference field names the International Parallel
%%% Processing Symposium --- confirm that journalabr (and the CODEN and
%%% ISSN that go with it) belong to this proceedings series.
@InProceedings{Guan:1995:SCC,
  author =       "Xiaojun Guan and Richard J. Mural and Edward C.
                 Uberbacher",
  title =        "Sequence comparison on a cluster of workstations using
                 the {PVM} system",
  crossref =     "IEEE:1995:PIP",
  pages =        "190--195",
  year =         "1995",
  CODEN =        "PSPDF8",
  DOI =          "https://doi.org/10.1109/IPPS.1995.395931",
  ISSN =         "1063-6374",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Sequence comparison is one of the most important tools
                 in molecular biology research. As the amount of DNA
                 data increases rapidly, efficient sequence comparison
                 algorithms are essential in studying newly discovered
                 sequences. We have implemented a distributed sequence
                 comparison algorithm by Smith and Waterman on a cluster
                 of workstations using the PVM paradigm. This
                 implementation has achieved similar performance to the
                 Intel iPSC\slash 860 Hypercube, a massively parallel
                 computer. The distributed Smith-Waterman algorithm
                 serves as a search tool for two Internet servers GRAIL
                 and GENQUEST. This paper describes the implementation
                 and the performance of the algorithm.",
  acknowledgement = ack-nhfb,
  affiliation =  "Oak Ridge Natl Lab",
  affiliationaddress = "Oak Ridge, TN, USA",
  classification = "461.2; 721.1; 722.4; 723.1; C5220P (Parallel
                 architecture); C5440 (Multiprocessing systems); C5620W
                 (Other computer networks); C7330 (Biology and medical
                 computing)",
  conference =   "Proceedings of the IEEE 9th International Parallel
                 Processing Symposium",
  corpsource =   "Div. of Comput. Sci. and Math., Oak Ridge Nat. Lab.,
                 TN, USA",
  journalabr =   "IEEE Symp Parallel Distrib Process Proc",
  keywords =     "Algorithms; cluster of workstations; Cluster of
                 workstations; Computational complexity; Computer
                 software; Computer workstations; DNA; DNA data; DNA
                 sequences; GENQUEST; GRAIL; hypercube; hypercube
                 networks; Intel iPSC/860; Intel iPSC/860 hypercube;
                 Internet; Internet servers; Internet servers GRAIL;
                 massively parallel computer; Massively parallel
                 computer; medical computing; molecular biology
                 research; Molecular biology research; molecular
                 biophysics; Parallel processing systems; Parallel
                 virtual machine; performance; Performance; PVM system;
                 sequence comparison; Sequence comparison; Smith
                 Waterman algorithm",
  meetingaddress = "Santa Barbara, CA, USA",
  meetingdate =  "Apr 25--28 1995",
  meetingdate2 = "04/25--28/95",
  sponsor =      "IEEE",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Parallel
                 Process",
  thesaurus =    "DNA; Hypercube networks; Internet; Medical computing;
                 Molecular biophysics",
  treatment =    "A Application; P Practical",
}

%%% Conference paper by Guarracino and Perla on a parallel block
%%% Lanczos algorithm.  "Lanczos" is braced in the title so that styles
%%% which downcase titles keep the proper noun capitalized.  Fields not
%%% given here are inherited through crossref from IEEE:1995:PEW.
@InProceedings{Guarracino:1995:PMB,
  author =       "M. R. Guarracino and F. Perla",
  title =        "A parallel modified block {Lanczos} algorithm for
                 distributed memory architectures",
  crossref =     "IEEE:1995:PEW",
  pages =        "424--431",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dipartimento di Matematica e Applicazioni, Naples
                 Univ., Italy",
  classification = "C4140 (Linear algebra); C4240P (Parallel programming
                 and algorithm theory); C6110P (Parallel programming)",
  keywords =     "Block column wrap-around matrices; Block Lanczos
                 algorithm; Distributed memory architectures;
                 Eigenproblems; Load-balancing; Parallel block Lanczos
                 algorithm; Parallel software",
  thesaurus =    "Distributed memory systems; Eigenvalues and
                 eigenfunctions; Matrix algebra; Parallel algorithms",
}

%%% Conference paper by Hardwick comparing MPI, Paris, CMMD, and PVM as
%%% targets for porting a vector library.  The four system names are
%%% braced in the title to protect their capitalization.  Fields not
%%% given here are inherited through crossref from IEEE:1995:PSP.
@InProceedings{Hardwick:1995:PVL,
  author =       "J. C. Hardwick",
  title =        "Porting a vector library: a comparison of {MPI},
                 {Paris}, {CMMD} and {PVM}",
  crossref =     "IEEE:1995:PSP",
  pages =        "68--77",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C6115 (Programming support);
                 C6140D (High level languages)",
  conftitle =    "Proceedings Scalable Parallel Libraries Conference",
  corpsource =   "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  keywords =     "CM-2; CM-5; CMMD; compiler target; Compiler target;
                 Cray C90; debugging; Debugging; message passing; MPI;
                 NESL; nested data-parallel languages; Nested
                 data-parallel languages; parallel; parallel languages;
                 parallel programming; parallel vector library CVL;
                 Parallel vector library CVL; Paris; portable MPI
                 implementation; Portable MPI implementation; Proteus;
                 PVM; RISC based MPP architectures; software libraries;
                 subroutines; vector library CVL; vector library
                 porting; Vector library porting; vector processor
                 systems",
  sponsororg =   "Mississippi State Univ.; NSF",
  thesaurus =    "Message passing; Parallel languages; Parallel
                 programming; Software libraries; Subroutines; Vector
                 processor systems",
  treatment =    "P Practical",
}

%%% Conference paper (seven authors, first author Hariri) on a
%%% methodology for evaluating software tools; the keywords list PVM
%%% among the message-passing tools covered.  Fields not given here
%%% are inherited through crossref from IEEE:1995:PIC.
@InProceedings{Hariri:1995:STE,
  author =       "S. Hariri and Sung-Yong Park and R. Reddy and M.
                 Subramanyan and R. Yadav and G. C. Fox and M.
                 Parashar",
  title =        "Software tool evaluation methodology",
  crossref =     "IEEE:1995:PIC",
  pages =        "3--10",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Northeast Parallel Archit. Center, Syracuse Univ., NY,
                 USA",
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6150N (Distributed systems software)",
  keywords =     "Alpha cluster; ATM; Distributed computing software;
                 Distributed systems platforms; Ethernet; Express; FDDI;
                 IBM-SP1; Message passing tools; Multi-level evaluation
                 methodology; P4; Parallel computing software;
                 Programming paradigms; PVM; Software tool evaluation
                 methodology; SUN workstations",
  thesaurus =    "Message passing; Parallel programming; Software
                 performance evaluation; Software tools",
}

%%% Conference paper by Hausner, Burrows, and Thekkath on PVM over the
%%% AN2 ATM network.  Fields not given here are inherited through
%%% crossref from the parent entry Hertzberger:1995:HPM.  The German
%%% institution name in affiliation and corpsource carries its umlauts
%%% as braced TeX accents, so the file itself stays pure ASCII.
@InProceedings{Hausner:1995:EIP,
  author =       "M. Hausner and M. Burrows and C. A. Thekkath",
  title =        "Efficient implementation of {PVM} on the {AN2 ATM}
                 network",
  crossref =     "Hertzberger:1995:HPM",
  pages =        "562--569",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. f{\"u}r Computersyst., Eidgen{\"o}ssische Tech.
                 Hochschule, Zurich, Switzerland",
  classification = "B6150C (Communication switching); B6210L (Computer
                 communications); C5620L (Local area networks); C6115
                 (Programming support); C6150N (Distributed systems
                 software)",
  corpsource =   "Inst. f{\"u}r Computersyst., Eidgen{\"o}ssische Tech.
                 Hochschule, Zurich, Switzerland",
  keywords =     "Alpha workstations; AN2 ATM network; asynchronous
                 transfer mode; ATM link bandwidth; coarse-grained;
                 Coarse-grained multicomputer; end-to-end PVM
                 communication performance; End-to-end PVM communication
                 performance; environments; high-speed ATM network;
                 High-speed ATM network; high-speed network; High-speed
                 network; local area networks; multicomputer;
                 programming; PVM environment; workstation cluster;
                 Workstation cluster; workstations",
  pubcountry =   "Germany",
  thesaurus =    "Asynchronous transfer mode; Local area networks;
                 Programming environments; Workstations",
  treatment =    "P Practical",
}

%%% Conference paper comparing the Parix and PVM programming
%%% environments.  "Van der Linden" is braced in the author field so
%%% that it is parsed as a single surname.  Fields not given here are
%%% inherited through crossref from the parent entry Fritzson:1995:PPA.
%%% The xxauthor and xxcrossref fields are not standard BibTeX fields
%%% and are ignored by standard styles; xxauthor omits Van der Linden
%%% and xxcrossref names a different parent (VanKatwijk:1995:AAC).
%%% NOTE(review): presumably these keep conflicting data from a second
%%% source --- confirm which version is correct.
%%% NOTE(review): several keywords look like fragments of longer
%%% phrases (e.g., "floating", "point to") --- confirm against the
%%% indexing source before repairing.
@InProceedings{Hoekstra:1995:CPP,
  author =       "A. G. Hoekstra and F. {Van der Linden} and P. M. A.
                 Sloot and L. O. Hertzberger",
  title =        "Comparing the {Parix} and {PVM} parallel programming
                 environments",
  crossref =     "Fritzson:1995:PPA",
  pages =        "288--292",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C6110B (Software engineering
                 techniques); C6110P (Parallel programming); C6115
                 (Programming support); C6150N (Distributed systems
                 software)",
  corpsource =   "Parallel Sci. Comput. and Simulation Group, Amsterdam
                 Univ., Netherlands",
  keywords =     "communication capabilities; complexity analysis;
                 computational complexity; development; environments;
                 floating; floating point arithmetic; functionality;
                 generic; global communication times; native parallel
                 programming environments; parallel architectures;
                 parallel programming; parallel programming
                 environments; Parix parallel; Parsytec GCel; Parsytec
                 PowerXplorer; performance; performance penalties; point
                 communication times; point performance; point to;
                 portability; portable parallel program; PowerPC chip;
                 programmability; programming; programming environments;
                 PVM parallel programming environments; software;
                 software performance evaluation; software tools;
                 support; time; tool; transputer systems",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
  xxauthor =     "A. G. Hoekstra and P. M. A. Sloot and L. O.
                 Hertzberger",
  xxcrossref =   "VanKatwijk:1995:AAC",
}

%%% Journal article by Hollerbach on dynamo calculations using
%%% parallel virtual machines.  The journal field is the string
%%% abbreviation j-COMPUT-PHYS, which must be defined elsewhere in
%%% this file; fjournal and ajournal record the full and abbreviated
%%% journal names.  The month field uses the predefined macro jul.
%%% NOTE(review): the ending page is recorded as "??", presumably
%%% meaning it is not yet known --- fill in when available.
@Article{Hollerbach:1995:FDA,
  author =       "Rainer Hollerbach",
  title =        "Fast dynamo action in spherical geometry: Numerical
                 calculations using parallel virtual machines",
  journal =      j-COMPUT-PHYS,
  volume =       "9",
  number =       "4",
  pages =        "460--??",
  month =        jul,
  year =         "1995",
  CODEN =        "CPHYE2",
  DOI =          "https://doi.org/10.1063/1.168547",
  ISSN =         "0894-1866 (print), 1558-4208 (electronic)",
  ISSN-L =       "0894-1866",
  bibdate =      "Wed Apr 10 08:45:55 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computphys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "https://aip.scitation.org/doi/10.1063/1.168547",
  acknowledgement = ack-nhfb,
  ajournal =     "Comput. Phys",
  fjournal =     "Computers in Physics",
  journal-URL =  "https://aip.scitation.org/journal/cip",
}

%%% Conference paper on the VISPAT performance evaluation and
%%% visualization tool; the keywords tie it to MPI and message passing.
%%% Fields not given here are inherited through crossref from the
%%% parent entry Malyshkin:1995:PCT.
@InProceedings{Hondroudakis:1995:PEV,
  author =       "A. Hondroudakis and R. Procter and K. Shanmugam",
  title =        "Performance evaluation and visualization with
                 {VISPAT}",
  crossref =     "Malyshkin:1995:PCT",
  pages =        "180--185",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Edinburgh Univ., UK",
  classification = "C6110P (Parallel programming); C6110V (Visual
                 programming); C6115 (Programming support)",
  keywords =     "Graphical front end; Message passing; MPI; Parallel
                 programs; Performance analysis; Program execution;
                 VISPAT; Visualization",
  thesaurus =    "Data visualisation; Parallel programming; Software
                 performance evaluation; Software tools; Visual
                 programming",
}

%%% Journal article on PVM and network parallel computing; the
%%% language field records that the article is in Chinese.  The
%%% journal field is the string abbreviation j-MINI-MICRO-SYSTEMS,
%%% which must be defined elsewhere in this file.
%%% NOTE(review): the citation key treats "Hong" as the first author's
%%% family name.  If the names are written family-name-first (Lin
%%% Hong, Chen Huaping), the author field should use the comma form
%%% "Lin, Hong and Chen, Huaping" --- confirm against the source.
@Article{Hong:1995:PNP,
  author =       "Lin Hong and Chen Huaping",
  title =        "{PVM} and network parallel computing",
  journal =      j-MINI-MICRO-SYSTEMS,
  volume =       "16",
  number =       "2",
  pages =        "53--58",
  month =        feb,
  year =         "1995",
  CODEN =        "XWJXEH",
  ISSN =         "1000-1220",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci. and Technol., Univ. of Sci. and
                 Technol. of China, Hefei, China",
  classification = "C6150N (Distributed systems software)",
  corpsource =   "Dept. of Comput. Sci. and Technol., Univ. of Sci. and
                 Technol. of China, Hefei, China",
  fjournal =     "Mini-Micro Systems",
  keywords =     "computing model; Computing model; load balancing; Load
                 balancing; message passing; Message passing; network
                 parallel computing; Network parallel computing;
                 parallel granularity; Parallel granularity; parallel
                 processing; programming methodology; Programming
                 methodology; PVM; resource allocation; software
                 environment; Software environment; virtual machines",
  language =     "Chinese",
  pubcountry =   "China",
  thesaurus =    "Message passing; Parallel processing; Resource
                 allocation; Virtual machines",
  treatment =    "P Practical",
}

%%% Conference paper by Hui, Hamdi, and Ahmad on a PVM-based platform
%%% for solving PDEs on a network of workstations.  Fields not given
%%% here are inherited through crossref from the parent entry
%%% IEEE:1995:PNA.  Keywords are separated by semicolons, so "Speedup"
%%% and "Parallel processing systems" are two separate keywords.
@InProceedings{Hui:1995:SPS,
  author =       "Chi-Chung Hui and Mounir Hamdi and Ishfaq Ahmad",
  title =        "Software platform for solving {PDEs} on distributed
                 systems: Implementation issues and performance
                 prediction",
  crossref =     "IEEE:1995:PNA",
  pages =        "383--388",
  year =         "1995",
  CODEN =        "PSICD2",
  ISSN =         "0730-6512",
  bibdate =      "Fri May 24 09:58:00 MDT 1996",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 95CB35838.",
  abstract =     "This paper describes the implementation and
                 performance of a parallel platform for solving partial
                 differential equations (PDEs) on distributed systems.
                 The platform has been implemented using PVM for a
                 network of workstations. It allows the inclusion of a
                 wide range of parameters and programming aids. The PDEs
                 are specified in the form of finite difference
                 equations. With a given set of parameters and a
                 partitioning strategy, the platform provides facilities
                 to record and predict the performance of an application
                 before running it. The performance prediction model
                 helps the user to identify the major bottlenecks of the
                 platform such that by reducing them, the speedup can be
                 improved. We also present analysis of various factors
                 that can have drastic effect on the speedup, which
                 allows the user to tune a number of parameters to
                 maximize the performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hong Kong Univ of Science and Technology",
  affiliationaddress = "Kowloon, Hong Kong",
  classification = "722.2; 722.3; 722.4; 723.1; 921.2; 921.6; C4170
                 (Differential equations); C5620L (Local area networks);
                 C6150N (Distributed systems software)",
  conference =   "Proceedings of the 19th Annual International Computer
                 Software and Applications Conference COMPSAC '95",
  journalabr =   "Proc IEEE Comput Soc Int Comput Software Appl Conf",
  keywords =     "Application; Asynchronous communication library
                 routines; Bottlenecks; Computer software; Computer
                 workstations; Data communication systems; Distributed
                 systems; Finite difference equations; Finite difference
                 method; Mathematical models; Parallel platform;
                 Parallel virtual machine (PVM) system; Partial
                 differential equation solving; Partial differential
                 equations; Partitioning strategy; Performance
                 prediction; Performance recording; Programming aids;
                 PVM; Software platform; Speedup; Parallel processing
                 systems; Workstation network",
  meetingaddress = "Dallas, TX, USA",
  meetingdate =  "Aug 9--11 1995",
  meetingdate2 = "08/09--11/95",
  sponsor =      "IEEE",
  thesaurus =    "Finite difference methods; Local area networks;
                 Parallel processing; Partial differential equations;
                 Software performance evaluation; Workstations",
}

%%% Master's thesis by Humphres on a load-balancing extension to PVM.
%%% The school and address fields are the string abbreviations
%%% inst-UAL-EE and inst-UAL-EE:adr, which must be defined elsewhere in
%%% this file; the type field overrides the default thesis label.
%%% The keywords value carries no trailing period, like the other
%%% keywords fields in this file.
@MastersThesis{Humphres:1995:LBE,
  author =       "Christopher Wade Humphres",
  title =        "A load balancing extension for the {PVM} software
                 system",
  type =         "M.E.E. thesis",
  school =       inst-UAL-EE,
  address =      inst-UAL-EE:adr,
  pages =        "viii + 98",
  year =         "1995",
  bibdate =      "Mon Jan 15 16:50:57 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer networks; Parallel computers",
}

%%% Paper by Hungenahally and Suresh on quadtree construction with PVM
%%% on an SIMD hypercube.  The journal field is the string abbreviation
%%% j-IEEE-INT-CONF-ALG-ARCH-PAR-PROC, which must be defined elsewhere
%%% in this file.
%%% NOTE(review): this is stored with the Article entry type although
%%% the conference and meeting* fields describe a conference
%%% proceedings (Part 2 of 2, matching volume 2) --- confirm whether
%%% InProceedings with a crossref would be more appropriate.
%%% NOTE(review): month is "????", presumably a placeholder for an
%%% unknown value --- fill in when the publication month is known.
@Article{Hungenahally:1995:PIQ,
  author =       "A. Hungenahally and A. Suresh",
  title =        "{PVM} implementation of quadtree building algorithms
                 on {SIMD} hypercube system",
  journal =      j-IEEE-INT-CONF-ALG-ARCH-PAR-PROC,
  volume =       "2",
  pages =        "855--858",
  month =        "????",
  year =         "1995",
  bibdate =      "Fri May 24 09:58:00 MDT 1996",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 95TH0682-5.",
  abstract =     "Representation of Data using hierarchical data
                 structures is commonly used in applications such as
                 Computer graphics, Digital image processing, Computer
                 Vision and techniques are being evolved for efficient
                 representation of these data. Transforming bilevel
                 images to linear quadtrees is a way of representing the
                 high-volume data. In this paper, the preliminary
                 investigation and results thus obtained for
                 transforming binary images to linear quadtrees using
                 Parallel Virtual Machine System Software are presented.
                 Single Instruction Multiple Data hypercube algorithms
                 implemented using PVM software was tested under DOS
                 operating system on IBM compatible PCs. The quadtree
                 algorithm generates locational codes in pre-order and
                 generally runs in O(log n) time and this paper tested
                 the feasibility of achieving this time for an SIMD
                 machine.",
  acknowledgement = ack-nhfb,
  affiliation =  "Griffith Univ",
  affiliationaddress = "Brisbane, Aust",
  classification = "722.4; 723; 723.2",
  conference =   "Proceedings of the IEEE 1st International Conference
                 on Algorithms and Architectures for Parallel
                 Processing. Part 2 (of 2)",
  fjournal =     "IEEE International Conference on Algorithms and
                 Architectures for Parallel Processing",
  journalabr =   "IEEE Int Conf Algorithms Archit Parall Process",
  keywords =     "Codes (symbols); Computer software; Data structures;
                 DOS; Hierarchical data structures; Hypercube; Image
                 processing; Parallel algorithms; Parallel processing
                 systems; Parallel virtual machine; Personal computers;
                 Quadtree; Single instruction multiple data",
  meetingaddress = "Brisbane, Aust",
  meetingdate =  "Apr 19--21 1995",
  meetingdate2 = "04/19--21/95",
  sponsor =      "IEEE",
}

%%% Journal article by Ingle and Mountziaris on a multifrontal solver
%%% run with network-based parallel computing (the keywords name the
%%% Parallel Virtual Machine software).  The journal field is the
%%% string abbreviation j-COMP-CHEM-ENG, which must be defined
%%% elsewhere in this file.  This is a double issue: both the number
%%% range and the month range are written with "--" (en-dash), and the
%%% month range is built by concatenating the jun and jul macros.
@Article{Ingle:1995:MAS,
  author =       "N. K. Ingle and T. J. Mountziaris",
  title =        "A multifrontal algorithm for the solution of large
                 systems of equations using network-based parallel
                 computing",
  journal =      j-COMP-CHEM-ENG,
  volume =       "19",
  number =       "6--7",
  pages =        "671--681",
  month =        jun # "--" # jul,
  year =         "1995",
  CODEN =        "CCENDW",
  ISSN =         "0098-1354",
  ISSN-L =       "0098-1354",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Chem. Eng., State Univ. of New York, Buffalo,
                 NY, USA",
  classification = "C4140 (Linear algebra); C4160 (Numerical integration
                 and differentiation); C4170 (Differential equations);
                 C4185 (Finite element analysis); C4240P (Parallel
                 programming and algorithm theory); C7320 (Physics and
                 chemistry computing)",
  fjournal =     "Computers \& Chemical Engineering",
  keywords =     "Chemical vapor deposition; Distributed computing
                 environment; Finite element analysis; Flow;
                 Granularity; Heat transfer problem; In-core
                 computations; Intrinsic fault tolerance capabilities;
                 Large sparse equation systems; Multifrontal algorithm;
                 Network-based parallel computing; Networked
                 workstations; Out-of-core computations; Parallel
                 Virtual Machine software; Performance; Processors;
                 Reaction processes; Speedups; Thin films; Transport
                 processes",
  pubcountry =   "UK",
  thesaurus =    "Chemical reactions; Chemical vapour deposition;
                 Chemically reactive flow; Chemistry computing;
                 Differential equations; Finite element analysis; Heat
                 transfer; Integration; Parallel algorithms; Physics
                 computing; Software fault tolerance; Sparse matrices;
                 Thin films; Transport processes; Workstations",
}

%%% Technical report by Jann and Franke on analyzing an MPI 2D-FFT
%%% program with the UTE tracing utility on the IBM SP2.  The
%%% institution and address fields are the string abbreviations
%%% inst-IBM-WATSON and inst-IBM-WATSON:adr, which must be defined
%%% elsewhere in this file; the type field overrides the default
%%% report label.
@TechReport{Jann:1995:AMP,
  author =       "Joefon Jann and Hubertus Franke",
  title =        "Analysis of an {MPI} program using {UTE} on the {IBM
                 SP2}",
  type =         "Research report",
  number =       "RC 20085 (88832)",
  institution =  inst-IBM-WATSON,
  address =      inst-IBM-WATSON:adr,
  pages =        "11",
  year =         "1995",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "We describe an implementation of a 2D-FFT
                 (Complex-Complex) program in MPI-F on the SP2 and show
                 its actual performance. The purpose of this paper is to
                 illustrate how we use the new tracing utility UTE/MPI
                 provided in MPI-F to verify the correctness of our
                 algorithm, to provide timing statistics summaries, and
                 to unravel other system activities, often unexpected by
                 the user, that affect the total elapsed time of the
                 program.",
  acknowledgement = ack-nhfb,
  keywords =     "Multiprocessors",
}

@Article{Jeremiassen:1995:RFS,
  author =       "T. E. Jeremiassen and S. J. Eggers",
  title =        "Reducing false sharing on shared memory
                 multiprocessors through compile time data
                 transformations",
  journal =      j-SIGPLAN,
  volume =       "30",
  number =       "8",
  pages =        "179--188",
  month =        aug,
  year =         "1995",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 7 07:51:54 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "We have developed compiler algorithms that analyze
                 explicitly parallel programs and restructure their
                 shared data to reduce the number of false sharing
                 misses. The algorithms analyze per-process shared data
                 accesses, pinpoint the data structures that are
                 susceptible to false sharing and choose an appropriate
                 transformation to reduce it. The transformations either
                 group data that is accessed by the same processor or
                 separate individual data items that are shared. We
                 evaluate that technique. We show through simulation
                 that our analysis successfully identifies the data
                 structures that are responsible for most false sharing
                 misses, and then transforms them without unduly
                 decreasing spatial locality. The reduction in false
                 sharing positively impacts both execution time and
                 program scalability when executed on a KSR2. Both
                 factors combine to increase the maximum achievable
                 speedup for all programs, more than doubling it for
                 several. Despite being able to only approximate actual
                 inter-processor memory accesses, the compiler-directed
                 transformations always outperform programmer efforts to
                 eliminate false sharing.",
  acknowledgement = ack-nhfb,
  affiliation =  "AT and T Bell Labs., Murray Hill, NJ, USA",
  classification = "C6120 (File organisation); C6150C (Compilers,
                 interpreters and other processors); C6150N (Distributed
                 systems software)",
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Compile time data transformations; Compiler
                 algorithms; Data structures; Execution time; False
                 sharing; False sharing misses; Inter-processor memory
                 access; KSR2; Maximum achievable speedup; Parallelizing
                 compilers; Program scalability; Shared data access;
                 Shared memory multiprocessors; Simulation; Spatial
                 locality",
  thesaurus =    "Data structures; Parallel programming; Program
                 compilers; Shared memory systems; Virtual machines",
}

@Article{Jin:1995:LTP,
  author = {Lan Jin and Lan Yang},
  title = {A laboratory for teaching parallel computing on
    parallel structures},
  journal = j-SIGCSE,
  fjournal = {SIGCSE Bulletin (ACM Special Interest Group on
    Computer Science Education)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J688},
  volume = {27},
  number = {1},
  pages = {71--75},
  month = mar,
  year = {1995},
  CODEN = {SIGSD3},
  ISSN = {0097-8418 (print), 2331-3927 (electronic)},
  affiliation = {Dept. of Comput. Sci., California State Univ., Fresno,
    CA, USA},
  classification = {C0220 (Computing education and training); C5220P
    (Parallel architecture); C6110P (Parallel programming);
    C6150N (Distributed systems software); C7430 (Computer
    engineering)},
  keywords = {Hardware level; Message-passing programming teaching;
    Multi-computer; Parallel computing teaching laboratory;
    Parallel processing; Parallel structure; Parallel
    systems; Parallel Virtual Machine; PVM;
    Reconfiguration; Software level; Structural
    implementation},
  thesaurus = {Computer science education; Laboratories; Message
    passing; Parallel machines; Parallel programming;
    Reconfigurable architectures; Teaching},
  acknowledgement = ack-nhfb,
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Juric:1995:UPV,
  author =       "M. Juric and W. D. Potter and M. Plaksin",
  title =        "Using the {Parallel Virtual Machine} for hunting
                 snake-in-the-box codes",
  crossref =     "Arabnia:1995:TRA",
  pages =        "97--102",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci. and Inf. Syst., DePaul Univ.,
                 Chicago, IL, USA",
  classification = "C1180 (Optimisation techniques); C4230M
                 (Multiprocessor interconnection); C6150E (General
                 utility programs); C6150G (Diagnostic, testing,
                 debugging and evaluating systems); C6150N (Distributed
                 systems software)",
  corpsource =   "Dept. of Comput. Sci. and Inf. Syst., DePaul Univ.,
                 Chicago, IL, USA",
  keywords =     "Adapted code; Combinatorial explosion; Genetic
                 algorithm; genetic algorithms; hypercube networks;
                 Maximum length snake; multiprocessing programs;
                 parallel machines; Parallel single processor machine
                 cluster; Parallel Virtual Machine; PVM software
                 package; Snake-in-the-box code hunting; system
                 monitoring; transputer systems; utility programs;
                 virtual machines",
  pubcountry =   "Netherlands",
  thesaurus =    "Genetic algorithms; Hypercube networks;
                 Multiprocessing programs; Parallel machines; System
                 monitoring; Transputer systems; Utility programs;
                 Virtual machines",
  treatment =    "P Practical",
}

@InProceedings{Kalns:1995:DPD,
  author = {E. T. Kalns and L. M. Ni},
  title = {{DaReL}: a portable data redistribution library for
    distributed-memory machines},
  crossref = {IEEE:1995:PSP},
  pages = {78--87},
  year = {1995},
  affiliation = {Dept. of Comput. Sci., Michigan State Univ., East
    Lansing, MI, USA},
  classification = {C5440 (Multiprocessing systems); C6110B (Software
    engineering techniques); C6110P (Parallel programming);
    C6140D (High level languages); C6150N (Distributed
    systems software)},
  keywords = {Algorithm computation; Algorithm phases; DaReL; Data
    decomposition; Data exchange; Data parallel Fortran
    languages; Distributed memory platforms;
    Distributed-memory machines; High Performance Fortran;
    HPF; IBM SP-1; Message passing standard; MPI
    primitives; Multi-dimensional data redistribution;
    Portable data redistribution library; Processor
    memories; Program overhead; Regular distribution
    patterns; Run-time data redistribution; Run-time data
    redistribution primitives},
  thesaurus = {Distributed memory systems; FORTRAN; Message passing;
    Parallel languages; Parallel programming; Software
    libraries; Software portability; Software standards;
    Subroutines},
  acknowledgement = ack-nhfb,
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Katkere:1995:VBW,
  author = {A. Katkere and J. Schlenzig and R. Jain},
  title = {{VRML-Based WWW} Interface to {MPI} Video},
  crossref = {Nadeau:1995:SVR},
  pages = {25--31, 137},
  xxpages = {25--32},
  month = {????},
  year = {1995},
  affiliation = {Visual Comput. Lab., California Univ., San Diego, La
    Jolla, CA, USA},
  keywords = {SGML; Virtual Reality Modeling Language; VRML},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 28 05:45:25 2002},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Kauranne:1995:OHM,
  author = {T. Kauranne and J. Oinonen and S. Saarinen and O.
    Serimaa and J. Hietaniemi},
  title = {The operational {HIRLAM} 2 model on parallel computers
    (weather forecasting)},
  crossref = {Hoffmann:1995:CAP},
  pages = {63--74},
  year = {1995},
  affiliation = {Joensuu Univ., Finland},
  classification = {A9260X (Weather analysis and prediction); C4185
    (Finite element analysis); C6110P (Parallel
    programming); C7340 (Geophysics computing)},
  keywords = {Atmosphere; Binary GRIB files; Distributed memory
    computers; Helmholtz equation solver; Maintainability;
    Message passing interface; Meteorology; Numerical
    model; Operational HIRLAM 2 model; Parallel
    programming; Portability; Reproducibility;
    Semi-implicit Eulerian finite difference method; Serial
    code noninterference; Transposition strategy; Weather
    forecasting},
  thesaurus = {Digital simulation; Distributed processing; Finite
    difference methods; Finite element analysis; Geophysics
    computing; Message passing; Numerical analysis;
    Parallel processing; Parallel programming; Software
    maintenance; Software portability; Weather
    forecasting},
  acknowledgement = ack-nhfb,
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Klingebiel:1995:COD,
  author =       "P. Klingebiel and R. Diekmann and U. Lefarth and M.
                 Fischer and J. Seuss",
  title =        "{CAMeL\slash PVM}: an open, distributed {CAE}
                 environment for modelling and simulating mechatronic
                 systems",
  crossref =     "Breitenecker:1995:ESC",
  pages =        "645--650",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Mechatronics Lab., Paderborn Univ., Germany",
  classification = "C6150N (Distributed systems software); C7440 (Civil
                 and mechanical engineering computing)",
  corpsource =   "Mechatronics Lab., Paderborn Univ., Germany",
  keywords =     "Ada tasking; Automatic load balancing procedures;
                 CAMeL/PVM; Channels; Communication management;
                 Computer-aided engineering design environment;
                 Computer-Aided Mechatronic Laboratory; Computer-aided
                 modelling; Heterogeneous workstation clusters;
                 Mechatronic systems simulation; Message-passing
                 environment; Open distributed CAE environment; Parallel
                 Virtual Machine; Process management; Program modules;
                 Unix-based extension",
  pubcountry =   "Netherlands",
  remark =       "Duplicate of entry Klingebiel:1995:CPO (same authors,
                 title, crossref, and pages). Both citation keys are
                 retained so that existing citations continue to
                 resolve.",
  thesaurus =    "Computer aided engineering; Digital simulation;
                 Mechanical engineering computing; Mechatronics; Message
                 passing; Open systems; Parallel processing; Resource
                 allocation; Unix; Virtual machines",
  treatment =    "P Practical",
}

@InProceedings{Klingebiel:1995:CPO,
  author =       "P. Klingebiel and R. Diekmann and U. Lefarth and M.
                 Fischer and J. Seuss",
  title =        "{CAMeL\slash PVM}: An Open, Distributed {CAE}
                 Environment for Modelling and Simulating Mechatronic
                 Systems",
  crossref =     "Breitenecker:1995:ESC",
  pages =        "645--650",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Mechatronics Lab., Paderborn Univ., Germany",
  classification = "C6150N (Distributed systems software); C7440 (Civil
                 and mechanical engineering computing)",
  corpsource =   "Mechatronics Lab., Paderborn Univ., Germany",
  keywords =     "Ada tasking; automatic load balancing procedures;
                 CAMeL/PVM; channels; communication management;
                 computer aided engineering; computer-aided engineering
                 design environment; Computer-Aided Mechatronic
                 Laboratory; computer-aided modelling; digital
                 simulation; heterogeneous workstation clusters;
                 mechanical engineering computing; mechatronic systems
                 simulation; mechatronics; message passing;
                 message-passing environment; open distributed CAE
                 environment; open systems; Parallel Virtual Machine;
                 parallel processing; process management; program
                 modules; resource allocation; Unix; Unix-based
                 extension; virtual machines",
  pubcountry =   "Netherlands",
  remark =       "Duplicate of entry Klingebiel:1995:COD (same authors,
                 title, crossref, and pages). Both citation keys are
                 retained so that existing citations continue to
                 resolve.",
  thesaurus =    "Computer aided engineering; Digital simulation;
                 Mechanical engineering computing; Mechatronics; Message
                 passing; Open systems; Parallel processing; Resource
                 allocation; Unix; Virtual machines",
  treatment =    "P Practical",
}

@InProceedings{Kofakis:1995:DPI,
  author =       "P. Kofakis and J. Louis",
  title =        "Distributed parallel implementation of seismic
                 algorithms",
  crossref =     "Hassanzadeh:1995:MMG",
  journal =      j-PROC-SPIE,
  volume =       "2571",
  pages =        "229--238",
  year =         "1995",
  CODEN =        "PSISDG",
  ISSN =         "0277-786X (print), 1996-756X (electronic)",
  ISSN-L =       "0277-786X",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "METHOD Ltd., Holargos, Greece",
  classification = "A9130F (Seismic waves); A9365 (Data and information;
                 acquisition, processing, storage and dissemination in
                 geophysics); A9385 (Instrumentation and techniques for
                 geophysical, hydrospheric and lower atmosphere
                 research); C1180 (Optimisation techniques); C4130
                 (Interpolation and function approximation); C4170
                 (Differential equations); C5260 (Digital signal
                 processing); C6110P (Parallel programming); C6150N
                 (Distributed systems software); C7340 (Geophysics
                 computing)",
  fjournal =     "Proceedings of the SPIE --- The International Society
                 for Optical Engineering",
  keywords =     "Distributed parallel implementation; Eikonical
                 equations; Fermat's principle; Finite difference
                 extrapolation; First arrival; Heterogeneous
                 workstations; Irregular grids; Minimum time ray-tracer;
                 Parallel virtual machine; Seismic algorithms; Seismic
                 waves; Travel times",
  thesaurus =    "Distributed memory systems; Extrapolation; Finite
                 difference methods; Geophysical signal processing;
                 Local area networks; Minimisation; Parallel algorithms;
                 Ray tracing; Seismic waves",
}

@Article{Koski:1995:STL,
  author = {Kimmo Koski},
  title = {A step towards large scale parallelism: {Building} a
    parallel computing environment from heterogeneous
    resources},
  journal = j-FUT-GEN-COMP-SYS,
  fjournal = {Future Generation Computer Systems},
  journal-URL = {http://www.sciencedirect.com/science/journal/0167739X},
  volume = {11},
  number = {4--5},
  pages = {491--498},
  month = aug,
  year = {1995},
  CODEN = {FGSEVI},
  ISSN = {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L = {0167-739X},
  affiliation = {Centre for Sci. Comput., Espoo, Finland},
  classification = {C0200 (General computer topics); C5220P (Parallel
    architecture); C5440 (Multiprocessing systems); C6110P
    (Parallel programming); C6115 (Programming support);
    C6150C (Compilers, interpreters and other processors);
    C6150N (Distributed systems software)},
  keywords = {Center for Scientific Computing; Competition; Computer
    industry; Cray Future Generation MPP system; Efficient
    resource use; Heterogeneous resources; IBM SP2
    distributed memory system; Large-scale parallelism;
    Load balancing; Massively parallel processing;
    Metacomputing; Parallel compiler technology; Parallel
    computing environment; Parallel programming; Parallel
    shared memory systems; Parallel tools selection; PVM
    clusters; RISC processors; Risks; Supported software
    tools; User base training; Vector system vendors},
  pubcountry = {Netherlands},
  thesaurus = {Cray computers; DP industry; Parallel processing;
    Parallelising compilers; Reduced instruction set
    computing; Resource allocation; Software tools;
    Training},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Jul 15 09:06:06 MDT 2005},
  bibsource = {ftp://ftp.ira.uka.de/bibliography/Parallel/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.sciencedirect.com/science/journal/0167739X},
}

@Article{Kumar:1995:MWD,
  author =       "S. Kumar and H. Adeli",
  title =        "Minimum weight design of large structures on a network
                 of workstations",
  journal =      j-MICROCOMP-CIVIL-ENG,
  volume =       "10",
  number =       "6",
  pages =        "423--432",
  month =        nov,
  year =         "1995",
  CODEN =        "MCENE7",
  ISSN =         "0885-9507",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Civil Eng., Ohio State Univ., Columbus, OH,
                 USA",
  classification = "C1180 (Optimisation techniques); C4240P (Parallel
                 programming and algorithm theory); C5620L (Local area
                 networks); C6150N (Distributed systems software); C7440
                 (Civil and mechanical engineering computing)",
  fjournal =     "Microcomputers in Civil Engineering",
  keywords =     "Coarse-grained applications; Computational capability;
                 Distributed algorithm; Genetic algorithms; Granularity;
                 Large structures; Local area networks; Low cost;
                 Message passing; Microprocessors; Minimum weight
                 design; Optimization; Parallel Virtual Machine;
                 Performance estimates; Software library; Structural
                 optimization; Workstation network",
  thesaurus =    "Distributed algorithms; Genetic algorithms; Local area
                 networks; Message passing; Software libraries;
                 Structural engineering computing",
}

@InProceedings{Leung:1995:EPE,
  author = {K.-C. Leung and M. Hamdi},
  title = {Evaluating {PVM} and {Express} on Various Network
    Clusters},
  crossref = {Alnuweiri:1995:PHF},
  pages = {57--66},
  year = {1995},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Feb 29 17:59:11 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@Article{Li:1995:CPP,
  author =       "Liwei Li and Paul S. Wang",
  title =        "The {CL-PVM} Package",
  journal =      j-SIGSAM,
  volume =       "29",
  number =       "3--4",
  pages =        "2--8",
  month =        dec,
  year =         "1995",
  CODEN =        "SIGSBZ",
  ISSN =         "0163-5824 (print), 1557-9492 (electronic)",
  ISSN-L =       "0163-5824",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6150N (Distributed
                 systems software)",
  corpsource =   "Dept. of Math. and Comput. Sci., Kent State Univ., OH,
                 USA",
  fjournal =     "SIGSAM Bulletin",
  keywords =     "artificial intelligence systems; C programs; CL-PVM
                 package; Common Lisp interface; console program; expert
                 systems; Fortran 77 interface; hostfile;
                 knowledge-based systems; library functions; LISP; Lisp
                 top level; Lisp-based programs; open systems; operating
                 systems (computers); parallel programming; Parallel
                 Virtual Machine; parallel/concurrent computing
                 facility; PVM Library routines; run-time server;
                 software libraries; software package; software
                 packages; symbolic computation systems; virtual
                 machines",
  treatment =    "P Practical",
}

@Article{Lin:1995:DNC,
  author = {Mengjou Lin and J. Hsieh and D. H. C. Du and J. P.
    Thomas and J. A. MacDonald},
  title = {Distributed network computing over local {ATM}
    networks},
  journal = j-IEEE-J-SEL-AREAS-COMMUN,
  fjournal = {IEEE Journal on Selected Areas in Communications},
  volume = {13},
  number = {4},
  pages = {733--748},
  month = may,
  year = {1995},
  CODEN = {ISACEM},
  DOI = {https://doi.org/10.1109/49.382163},
  ISSN = {0733-8716 (print), 1558-0008 (electronic)},
  ISSN-L = {0733-8716},
  affiliation = {Adv. Technol. Group, Apple Comput. Inc., Cupertino,
    CA, USA},
  classification = {B0290H (Linear algebra); B0290P (Differential
    equations); B6150M (Protocols); B6210L (Computer
    communications); B6230 (Switching centres and
    equipment); C4140 (Linear algebra); C4170 (Differential
    equations); C5220P (Parallel architecture); C5620L
    (Local area networks); C5640 (Protocols); C5670
    (Network performance); C6150J (Operating systems)},
  keywords = {Application programming interfaces; ASX-100 ATM
    switch; Asynchronous transfer mode; ATM API; BSD socket
    programming interface; Communication performance;
    Communication protocol layer; Distributed network
    computing; Distributed programming; End-to-end
    communication; Fore Systems; High-speed local area
    networks; High-speed network standards; Local ATM
    network; Message passing library; Parallel matrix
    multiplication; Parallel virtual machine; Performance
    characteristics; Processors; Remote procedure call;
    Workstations},
  thesaurus = {Application program interfaces; Asynchronous transfer
    mode; Local area networks; Matrix multiplication;
    Partial differential equations; Performance evaluation;
    Pipeline processing; Protocols; Remote procedure
    calls},
  acknowledgement = ack-nhfb,
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@Article{Liu:1995:WCD,
  author = {Xiaomao Liu},
  title = {Workstations cluster for distributed supercomputing},
  journal = j-MINI-MICRO-SYSTEMS,
  fjournal = {Mini-Micro Systems},
  volume = {16},
  number = {2},
  pages = {45--52},
  month = feb,
  year = {1995},
  CODEN = {XWJXEH},
  ISSN = {1000-1220},
  language = {Chinese},
  pubcountry = {China},
  affiliation = {North China Inst. of Comput. Technol., Beijing,
    China},
  classification = {C5620L (Local area networks); C6150N (Distributed
    systems software)},
  keywords = {Active message communication; Distributed
    supercomputing; Global UNIX; MPI; Workstations
    cluster},
  thesaurus = {Distributed processing; Local area networks},
  acknowledgement = ack-nhfb,
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Lou:1995:PIN,
  author = {J. Z. Lou},
  title = {A parallel incompressible {Navier--Stokes} solver with
    multigrid iterations},
  crossref = {Bailey:1995:PSS},
  pages = {167--168},
  year = {1995},
  affiliation = {Jet Propulsion Lab., California Inst. of Technol.,
    Pasadena, CA, USA},
  classification = {A0230 (Function theory, analysis); A0260 (Numerical
    approximation and analysis); A0270 (Computational
    techniques); A4710 (General fluid dynamics theory,
    simulation and other computational methods); C4170
    (Differential equations); C4240P (Parallel programming
    and algorithm theory); C6110P (Parallel programming);
    C7320 (Physics and chemistry computing)},
  keywords = {Domain-decomposition strategy; Efficient
    finite-difference incompressible Navier--Stokes fluid;
    Intel Delta; Intel Paragon; Message-passing; Multigrid
    iterations; Multigrid scheme; Parallel incompressible
    Navier--Stokes solver; Rectangular processor meshes;
    Second-order projection method; Staggered grid;
    Template code},
  thesaurus = {Finite difference methods; Message passing;
    Navier--Stokes equations; Parallel algorithms; Physics
    computing},
  acknowledgement = ack-nhfb,
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Ludwig:1995:PPF,
  author =       "T. Ludwig and S. Lamberts",
  title =        "{PFSLib} --- a parallel file system for workstation
                 clusters",
  crossref =     "Malyshkin:1995:PCT",
  pages =        "246--251",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  classification = "C6110P (Parallel programming); C6120 (File
                 organisation); C6150N (Distributed systems software)",
  keywords =     "Parallel file system; Parallel programming
                 environments; Performance results; PFS; PFSLib library;
                 Workstation clusters",
  thesaurus =    "File organisation; Parallel programming; Software
                 performance evaluation",
}

@InProceedings{Lumsdaine:1995:WIM,
  author = {A. Lumsdaine and J. M. Squyres and M. W. Reichelt},
  title = {Waveform iterative methods for parallel solution of
    initial value problems},
  crossref = {IEEE:1995:PSP},
  pages = {88--97},
  year = {1995},
  affiliation = {Dept. of Comput. Sci. and Eng., Notre Dame Univ., IN,
    USA},
  classification = {C4130 (Interpolation and function approximation);
    C4170 (Differential equations); C4240P (Parallel
    programming and algorithm theory); C5440
    (Multiprocessing systems)},
  keywords = {Communication latency; Differential equations;
    Differential-algebraic equations; Initial value
    problems; Linear system; Message-passing; MOSFET
    simulation; MPI-based implementation; Parallel
    solution; Semiconductor device simulation program;
    Synchronization; Time dependent semiconductor
    drift-diffusion equations; Waveform iterative methods;
    Waveform relaxation; Workstations},
  thesaurus = {Circuit analysis computing; Differential equations;
    Digital simulation; Initial value problems; Iterative
    methods; Message passing; Parallel algorithms},
  acknowledgement = ack-nhfb,
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@InProceedings{Manke:1995:MPP,
  author =       "J. W. Manke and J. C. Patterson",
  title =        "Message passing performance of {Intel Paragon}, {IBM
                 SP1} and {CRAY T3D} using {PVM}",
  crossref =     "Bailey:1995:PSS",
  pages =        "768--769",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Boeing Comput. Services, Seattle, WA, USA",
  classification = "C5440 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C6150N (Distributed systems
                 software)",
  corpsource =   "Boeing Comput. Services, Seattle, WA, USA",
  keywords =     "All-to-all communication; Cray computers; CRAY T3D;
                 Distributed applications; IBM computers; IBM SP1; Intel
                 Paragon; message passing; Message passing performance;
                 Message passing time model; MPP machines;
                 multiprocessing systems; Nodes; performance evaluation;
                 PVM; Recursive doubling; Scalability; Speed; timing",
  thesaurus =    "Cray computers; IBM computers; Message passing;
                 Multiprocessing systems; Performance evaluation;
                 Timing",
  treatment =    "T Theoretical or Mathematical",
}

@InProceedings{Mantovani:1995:HPS,
  author = {M. L. Mantovani and M. Malagoli},
  title = {Highly parallel {SCF} calculation: the {SYSMO}
    Program},
  crossref = {IEEE:1995:PEW},
  pages = {502--507},
  year = {1995},
  affiliation = {CICAIA, Modena Univ., Italy},
  classification = {C4240P (Parallel programming and algorithm theory);
    C7320 (Physics and chemistry computing)},
  keywords = {Highly parallel SCF calculation; Linear Combination of
    Atomic Orbitals Self Consistent Field algorithm;
    Parallel implementation; Parallel Virtual Machine;
    Scalability; Single program multiple data level; SYSMO
    program; System Modena},
  thesaurus = {Chemistry; Chemistry computing; LCAO calculations;
    Parallel algorithms; SCF calculations},
  acknowledgement = ack-nhfb,
  bibdate = {Sun Dec 22 10:20:45 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Conference paper; proceedings-level fields are inherited from the
%%% crossref'ed entry IEEE:1995:PEW.  The keywords field keeps both the
%%% lowercase and the capitalized variant of a term, but each exact
%%% spelling is listed only once.
@InProceedings{Martin:1995:DPC,
  author =       "I. Martin and J. C. Fabero and F. Tirado and A.
                 Bautista",
  title =        "Distributed Parallel Computers versus {PVM} on a
                 Workstation Cluster in the Simulation of Time Dependent
                 Partial Differential Equations",
  crossref =     "IEEE:1995:PEW",
  pages =        "20--26",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. de Inf. y Autom., Univ. Complutense de Madrid,
                 Spain",
  classification = "A0270 (Computational techniques); A0340K (Waves and
                 wave propagation: general mathematical aspects); C4170
                 (Differential equations); C6110P (Parallel
                 programming)",
  corpsource =   "Dept. de Inf. y Autom., Univ. Complutense de Madrid,
                 Spain",
  keywords =     "distributed parallel computers; equation; finite
                 difference method; large-scale problems; message
                 passing; numerical simulation; parallel; parallel
                 algorithms; parallel computing; parallel machine;
                 partial differential equations; performance evaluation;
                 PVM; Schr{\"o}dinger; Schr{\"o}dinger equation;
                 Sparc-stations; time dependent; wave equations;
                 workstation cluster; Workstation cluster; Distributed
                 parallel computers; Time dependent; Partial
                 differential equations; Parallel numerical simulation;
                 Message passing parallel machine; Finite difference
                 method; Parallel computing; Large-scale problems",
  sponsororg =   "Euromicro; Assoc. Italiana per Inf. Calcolo Autom",
  thesaurus =    "Parallel algorithms; Performance evaluation;
                 Schr{\"o}dinger equation; Wave equations",
  treatment =    "T Theoretical or Mathematical",
}

%%% NOTE(review): the ending page and the issue months replace the
%%% former ``??''/``????'' placeholders and were taken from the PubMed
%%% citation (Hum Hered 1995 Mar-Apr; 45(2):103-16); confirm against
%%% the journal issue.
@Article{Matise:1995:PCG,
  author =       "T. C. Matise and M. D. Schroeder and D. M. Chiarulli
                 and D. E. Weeks",
  title =        "Parallel Computation of Genetic Likelihoods Using
                 {CRI-MAP}, {PVM}, and a Network of Distributed
                 Workstations",
  journal =      j-HUMAN-HEREDITY,
  volume =       "45",
  number =       "2",
  pages =        "103--116",
  month =        mar # "\slash " # apr,
  year =         "1995",
  CODEN =        "HUHEAS",
  ISSN =         "0001-5652",
  ISSN-L =       "0001-5652",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Human Heredity",
}

%%% Journal article comparing the p4, PVM, Linda, and TCGMSG
%%% programming environments.  The quarterly issue is recorded as
%%% month = "Summer".  The abstract typesets pi in math mode.
@Article{Mattson:1995:PEP,
  author =       "Timothy G. Mattson",
  title =        "Programming Environments for Parallel and Distributed
                 Computing: a Comparison of {P4}, {PVM}, {Linda}, and
                 {TCGMSG}",
  journal =      j-IJSAHPC,
  volume =       "9",
  number =       "2",
  pages =        "138--161",
  month =        "Summer",
  year =         "1995",
  CODEN =        "IJSCFG",
  ISSN =         "1078-3482",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib; UnCover
                 library database",
  abstract =     "Parallel programmers must choose from a confusing
                 array of parallel programming environments. When
                 success has to be measured by the success of
                 application-software development rather than
                 theoretical research, the choice must be made quickly
                 without the luxury of experimentation. In this paper,
                 we help the programmer make this choice by looking
                 closely at four of the most heavily used portable
                 programming environments --- p4, PVM, TCGMSG, and
                 Linda. For each of these programming environments, we
                 look at three different programs: one that computes
                 $\pi$ by numerical integration and two that benchmark
                 communication performance. The four programming
                 environments are analyzed in terms of performance,
                 support, ease of coding, and ease of debugging.",
  acknowledgement = ack-nhfb,
  affiliation =  "Intel Corp",
  affiliationaddress = "Beaverton, OR, USA",
  classification = "722.3; 722.4; 723.1; 723.5; 921.6; C0310F (Software
                 development management); C6110P (Parallel programming);
                 C6115 (Programming support); C6150N (Distributed
                 systems software)",
  corpsource =   "Div. of Supercomputer Syst., INTEL Corp., Beaverton,
                 OR, USA",
  fjournal =     "International Journal of Supercomputer Applications
                 and High Performance Computing",
  journalabr =   "Int J Supercomput Appl High Perform Comput",
  keywords =     "Application software development; application software
                 development; communication; Communication benchmarks;
                 Communication performance; Computer software; Data
                 communication systems; Distributed computer systems;
                 Distributed computing; distributed computing;
                 Integration; Linda; Numerical integration; numerical
                 integration; P4; p4; parallel; Parallel processing
                 systems; Parallel programmers; parallel programmers;
                 parallel programming; Parallel programming
                 environments; Parallel virtual machine; performance; Pi
                 calculation; pi calculation; portability; Portable
                 programming environments; portable programming
                 environments; Program debugging; Programming
                 environments; programming environments; PVM; software;
                 Software engineering; software reviews; TCGMSG",
  thesaurus =    "Parallel programming; Programming environments;
                 Software portability; Software reviews",
  treatment =    "P Practical",
}

%%% Conference paper on instrumentation, monitoring, and visualization
%%% of PVM programs.  Proceedings-level fields are inherited from the
%%% crossref'ed entry Bailey:1995:PSS.
@InProceedings{Mehra:1995:AIM,
  author =       "P. Mehra and B. {Van Voorst} and J. Yan",
  title =        "Automated Instrumentation, Monitoring and
                 Visualization of {PVM} Programs",
  crossref =     "Bailey:1995:PSS",
  pages =        "832--837",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Recom Technol. Inc., NASA Ames Res. Center, Moffett
                 Field, CA, USA",
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6150G (Diagnostic, testing, debugging and
                 evaluating systems); C6150N (Distributed systems
                 software)",
  corpsource =   "Recom Technol. Inc., NASA Ames Res. Center, Moffett
                 Field, CA, USA",
  keywords =     "Automated; automated instrumentation; Automated
                 instrumentation; Automated Instrumentation and
                 Monitoring System; data visualisation; Instrumentation
                 and Monitoring System; message passing; message-passing
                 parallel programs; Message-passing parallel programs;
                 monitoring; parallel programming; parallel programs;
                 Parallel programs; Parallel Virtual Machine;
                 performance-debugging; Performance-debugging toolkit;
                 program debugging; program monitoring; Program
                 monitoring; program visualization; Program
                 visualization; programming environments; PVM programs;
                 system; toolkit; visual programming; workstation
                 networks; Workstation networks",
  thesaurus =    "Data visualisation; Message passing; Parallel
                 programming; Program debugging; Programming
                 environments; System monitoring; Visual programming",
  treatment =    "P Practical",
}

%%% Journal article on a parallel multigrid implementation with PVM.
%%% The combined issue is written as an en-dash range (1--2), the same
%%% form used for page ranges.
@Article{Michielse:1995:PMU,
  author =       "Peter Michielse",
  title =        "Parallel multigrid using {PVM}",
  journal =      j-APPL-NUM-MATH,
  volume =       "19",
  number =       "1--2",
  pages =        "63--69",
  month =        nov,
  year =         "1995",
  CODEN =        "ANMAEL",
  ISSN =         "0168-9274 (print), 1873-5460 (electronic)",
  ISSN-L =       "0168-9274",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "This paper discusses an implementation of a parallel
                 multigrid method using Parallel Virtual Machine (PVM).
                 The basics described here apply to general parallel
                 computers, either with shared memory or distributed
                 memory systems. The actual implementation has been
                 performed on both type of systems, although we will
                 focus on distributed memory systems in this paper. The
                 distributed memory implementation has been done using
                 PVM on Convex MetaSeries machines. The speed-up results
                 will be analyzed with respect to computational work and
                 communicational overhead.",
  acknowledgement = ack-nhfb,
  affiliation =  "CONVEX Computer Corp",
  affiliationaddress = "Utrecht, Neth",
  classification = "512.1.1; 721.1; 722.1; 722.4; 723.5; 921.6; C4170
                 (Differential equations); C4240P (Parallel programming
                 and algorithm theory); C5440 (Multiprocessing systems);
                 C7310 (Mathematics computing)",
  corpsource =   "CONVEX Comput. Corp., Utrecht, Netherlands",
  fjournal =     "Applied Numerical Mathematics: Transactions of IMACS",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01689274",
  journalabr =   "Appl Numer Math",
  keywords =     "Approximation theory; communicational overhead;
                 Computational methods; computational work; Computer
                 simulation; Convex MetaSeries; Data storage equipment;
                 differential equations; Distributed computer systems;
                 distributed memory systems; Distributed memory systems;
                 machines; mathematics computing; Numerical methods;
                 parallel algorithms; parallel multigrid; Parallel
                 multigrid; Parallel processing systems; parallel
                 virtual machine; Parallel virtual machine; Petroleum
                 reservoirs; Reservoir simulators; shared memory; Shared
                 memory systems; systems",
  pubcountry =   "Netherlands",
  treatment =    "A Application; P Practical",
}

%%% Paper from a conference-proceedings collection that is recorded
%%% here as a journal article (journal macro plus conference and
%%% meetingdate fields).
%%% NOTE(review): there is no pages field, and month still carries the
%%% ``????'' unknown-value placeholder; fill in when the data are found.
@Article{Mirvis:1995:HML,
  author =       "Y. Mirvis and F. Abdi and B. Lajevardi and P. Murthy",
  title =        "Hierarchical multi-level optimization solution for
                 massive parallel simulation of composite system",
  journal =      j-AIAA-ASME-ASCE-AHS-STRUCT-STRUCT-DYN-MAT-CONF,
  volume =       "4",
  month =        "????",
  year =         "1995",
  CODEN =        "CPSCDO",
  ISSN =         "0273-4508",
  bibdate =      "Fri May 24 09:58:00 MDT 1996",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Multilevel optimization techniques have been utilized
                 to speed up simulation processing time for the analysis
                 and modeling of high temperature composite structures.
                 The Network Computing System (NCS) tools were utilized
                 using the GENOA-Maestro, and Parallel Virtual Machine
                 (PVM) toolkits for heterogeneous distributed computing,
                 to make it portable across the range of parallel
                 hardware architecture.",
  acknowledgement = ack-nhfb,
  affiliation =  "Alpha STAR Research Corp",
  affiliationaddress = "Los Angeles, CA, USA",
  classification = "408.1; 415.4; 721.1; 722.4; 723.5; 731.1",
  conference =   "Proceedings of the 36th AIAA\slash ASME\slash
                 ASCE\slash AHS\slash ASC Structures, Structural
                 Dynamics, and Materials Conference and AIAA\slash ASME
                 Adaptive Structures Forum. Part 4 (of 5)",
  fjournal =     "AIAA/ASME/ASCE/AHS Structures, Structural Dynamics \&
                 Materials Conference --- Collection of Technical
                 Papers",
  journalabr =   "AIAA ASME ASCE AHS Struct Struct Dyn Mater Conf
                 Collect Tech Pap",
  keywords =     "Composite structures; Computational complexity;
                 Computer aided analysis; Computer simulation;
                 Constraint theory; Hierarchical systems; Mathematical
                 models; Multilevel optimization technique; Network
                 computing system (NCS); Optimization; Parallel
                 processing systems; Parallel virtual machine (PVM);
                 Software package GENOA-Maestro; Software package PVM;
                 Structural analysis",
  meetingdate =  "Apr 10--13 1995",
}

%%% Conference paper on PVM-based Boltzmann equation solvers.
%%% Proceedings-level fields are inherited from the crossref'ed entry
%%% Satofuka:1995:PCF.
@InProceedings{Morinishi:1995:PIB,
  author =       "K. Morinishi and N. Satofuka",
  title =        "Parallel implementation of the {Boltzmann} equation
                 solvers using {PVM}",
  crossref =     "Satofuka:1995:PCF",
  pages =        "339--346",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A4710 (General fluid dynamics theory, simulation and
                 other computational methods); A4745 (Rarefied gas
                 dynamics); C4180 (Integral equations); C4240P (Parallel
                 programming and algorithm theory); C7320 (Physics and
                 chemistry computing); C7460 (Aerospace engineering
                 computing)",
  corpsource =   "Dept. of Mech. and Syst. Eng., Kyoto Inst. of
                 Technol., Japan",
  keywords =     "aerodynamics; aerospace; aerospace computing; BGK
                 model; Boltzmann; Boltzmann collision integral;
                 Boltzmann equation; CFD; computational; computing;
                 digital simulation; equation solver; external flows;
                 flow simulation; fluid dynamics; message; message
                 passing software; NACA0012 airfoil; parallel
                 programming; passing; physics computing; PVM; rarefied;
                 rarefied gas flow",
  pubcountry =   "Netherlands",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

%%% Conference paper on trace-based debugging of parallel programs.
%%% Proceedings-level fields are inherited from the crossref'ed entry
%%% Ferenczi:1995:PAH.
@InProceedings{Mork:1995:DPP,
  author =       "P. Mork",
  title =        "Debugging parallel programs with execution tracing",
  crossref =     "Ferenczi:1995:PAH",
  pages =        "176--183",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Control Eng., Miskolc Univ., Hungary",
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C6115 (Programming support);
                 C6130B (Graphics techniques); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C6150N
                 (Distributed systems software)",
  keywords =     "CASE tool; Converter; Datafile; Execution tracing;
                 Graphical tool; PACVIS; ParaGraph; Parallel program
                 debugging; Parallel software engineering; Pvm3; Raw
                 trace data transformation; SEPP project; Software
                 Engineering for Parallel Processing project; Tools;
                 Trace file visualization; Visualization program",
  thesaurus =    "Computer aided software engineering; Data
                 visualisation; Parallel programming; Program debugging;
                 Software tools",
}

%%% Journal article on porting Fortran/PVM code to the Cray T3D.  The
%%% quarterly issue is recorded as month = "Spring".  The keywords
%%% field keeps both the lowercase and the capitalized variant of a
%%% term, but each exact spelling is listed only once.
@Article{Morton:1995:LLP,
  author =       "Don Morton and Kefei Wang and David O. Ogbe",
  title =        "Lessons learned in porting {Fortran\slash PVM} code to
                 the {Cray T3D}",
  journal =      j-IEEE-PAR-DIST-TECH,
  volume =       "3",
  number =       "1",
  pages =        "4--11",
  month =        "Spring",
  year =         "1995",
  CODEN =        "IPDTEX",
  DOI =          "https://doi.org/10.1109/88.384580",
  ISSN =         "1063-6552 (print), 1558-1861 (electronic)",
  ISSN-L =       "1063-6552",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "With an extra work from the programmer, the Cray T3D
                 offers low-level facilities for achieving substantial
                 performance gains. Because of this, it is often
                 necessary to consider the tradeoffs between performance
                 gains and coding effort. Here, provided is a first-hand
                 account of the issues in porting Fortran\slash PVM code
                 to the Cray T3D. As a new evolving product, occasional
                 problems with the T3D should be expected.",
  acknowledgement = ack-nhfb,
  affiliation =  "Cameron Univ",
  affiliationaddress = "Lawton, OK, USA",
  classification = "722.2; 722.3; 722.4; 723.1; 723.1.1; 723.2; C5440
                 (Multiprocessing systems); C6110B (Software engineering
                 techniques); C6110P (Parallel programming); C6140D
                 (High level languages)",
  fjournal =     "IEEE parallel and distributed technology: systems and
                 applications",
  journalabr =   "IEEE Parallel Distrib Technol",
  keywords =     "Algorithms; Central processing unit; Code porting;
                 Codes (symbols); Coding effort; coding effort; Computer
                 aided software engineering; Computer software
                 portability; Computer workstations; Cray computers;
                 Cray T3D; Cray T3D computer; Data communication
                 systems; Distributed computer systems; FORTRAN; Fortran
                 (programming language); FORTRAN (programming language);
                 Fortran/PVM code porting; Interfaces (computer);
                 low-level facilities; Low-level facilities; message
                 passing; parallel machines; parallel programming;
                 Parallel virtual machine; performance gains;
                 Performance gains; Program debugging; software
                 portability; Software prototyping; Subroutines;
                 Supercomputers",
  thesaurus =    "Cray computers; FORTRAN; Message passing; Parallel
                 machines; Parallel programming; Software portability",
  treatment =    "P Practical",
}

%%% Conference paper; proceedings-level fields are inherited from the
%%% crossref'ed entry Shaw:1995:ADA.
%%% NOTE(review): the pages value records two candidate ranges; the
%%% correct ending page still needs to be verified.
@InProceedings{Nguyen:1995:SPI,
  author =       "D. Nguyen and B. Hillberg",
  title =        "Simulations of Pinhole Imaging for {AXAF}: Distributed
                 Processing Using the {MPI} Standard",
  crossref =     "Shaw:1995:ADA",
  pages =        "361--366 (or 361--363??)",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A9575P (Mathematical and computer techniques in
                 astronomy); C6110P (Parallel programming); C6185
                 (Simulation techniques); C6150N (Distributed systems
                 software); C7350 (Astronomy and astrophysics
                 computing)",
  conftitle =    "Astronomical Data Analysis Software and Systems IV
                 Meeting",
  corpsource =   "Smithsonian Astrophys. Obs., Cambridge, MA, USA",
  keywords =     "Application Programming Interface; astronomy
                 computing; AXAF mission; digital simulation;
                 distributed processing; LAM programming environment;
                 Local Area Multicomputer; memory intensive task;
                 message passing; Message Passing Interface; MPI
                 standard; Ohio Supercomputer Center; parallel mode
                 simulation; parallel processing; parallel programming;
                 pinhole imaging; pinhole simulation program; sequential
                 mode simulation; software packages; software
                 portability; workstation cluster; X-ray astronomy",
  treatment =    "T Theoretical or Mathematical; A Application",
}

%%% Book review of the PVM users' guide.
%%% NOTE(review): entry Novotny:1995:BRA has the same journal, volume,
%%% number, and starting page and appears to describe the same review;
%%% the month and DOI here are taken from that entry.  This entry is
%%% kept so that existing citations of its key continue to resolve.
@Article{Novotny:1995:BPP,
  author =       "Mark Novotny",
  title =        "{BOOKS}: {PVM} --- Parallel Virtual Machine: a Users'
                 Guide and Tutorial for Networked Parallel Computing",
  journal =      j-COMPUT-PHYS,
  volume =       "9",
  number =       "6",
  pages =        "607--??",
  month =        nov,
  year =         "1995",
  CODEN =        "CPHYE2",
  DOI =          "https://doi.org/10.1063/1.4823450",
  ISSN =         "0894-1866 (print), 1558-4208 (electronic)",
  ISSN-L =       "0894-1866",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers in Physics",
}

%%% Book review of the PVM users' guide; the reviewed book's authors
%%% and title are embedded in the title field via \booktitle{...}.
%%% NOTE(review): entry Novotny:1995:BPP has the same journal, volume,
%%% number, and starting page and appears to describe the same review.
@Article{Novotny:1995:BRA,
  author =       "Mark Novotny and Susan McKay and Wolfgang Christian",
  title =        "Book Review: {Al Geist, Adam Beguelin, Jack Dongarra,
                 Weicheng Jiang, Robert Manchek, and Vaidy Sunderam,
                 \booktitle{{PVM} --- Parallel Virtual Machine: a Users'
                 Guide and Tutorial for Networked Parallel Computing}}",
  journal =      j-COMPUT-PHYS,
  volume =       "9",
  number =       "6",
  pages =        "607--??",
  month =        nov,
  year =         "1995",
  CODEN =        "CPHYE2",
  DOI =          "https://doi.org/10.1063/1.4823450",
  ISSN =         "0894-1866 (print), 1558-4208 (electronic)",
  ISSN-L =       "0894-1866",
  bibdate =      "Wed Apr 10 08:45:57 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/computphys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://aip.scitation.org/doi/10.1063/1.4823450",
  acknowledgement = ack-nhfb,
  ajournal =     "Comput. Phys",
  fjournal =     "Computers in Physics",
  journal-URL =  "https://aip.scitation.org/journal/cip",
}

%%% Conference paper evaluating MPI implementations on workstation
%%% clusters.  Proceedings-level fields are inherited from the
%%% crossref'ed entry IEEE:1995:PSP.
@InProceedings{Nupairoj:1995:PES,
  author =       "N. Nupairoj and L. M. Ni",
  title =        "Performance evaluation of some {MPI} implementations
                 on workstation clusters",
  crossref =     "IEEE:1995:PSP",
  pages =        "98--105",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Michigan State Univ., East
                 Lansing, MI, USA",
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C6110B (Software engineering
                 techniques); C6150N (Distributed systems software)",
  conftitle =    "Proceedings Scalable Parallel Libraries Conference",
  corpsource =   "Dept. of Comput. Sci., Michigan State Univ., East
                 Lansing, MI, USA",
  keywords =     "communication library; Communication library;
                 distributed memory systems; distributed-memory
                 computing systems; Distributed-memory computing
                 systems; high performance computing; High performance
                 computing; message passing; Message Passing Interface;
                 message-passing; Message-passing; MPI implementations;
                 MPI specification; performance evaluation; Performance
                 evaluation; software libraries; standards; workstation
                 clusters; Workstation clusters; workstations",
  sponsororg =   "Mississippi State Univ.; NSF",
  thesaurus =    "Distributed memory systems; Message passing;
                 Performance evaluation; Software libraries; Standards;
                 Workstations",
  treatment =    "P Practical",
}

%%% Journal article, part III of a series, covering the parallel
%%% implementation.  The combined issue is written as an en-dash range
%%% (1--2), the same form used for page ranges.
@Article{Oakley:1995:ADR,
  author =       "D. R. Oakley and N. F. {Knight, Jr.} and D. D.
                 Warner",
  title =        "Adaptive dynamic relaxation algorithm for non-linear
                 hyperelastic structures. {III}. {Parallel}
                 implementation",
  journal =      j-COMPUT-METH-APPL-MECH-ENG,
  volume =       "126",
  number =       "1--2",
  pages =        "111--129",
  month =        sep,
  year =         "1995",
  CODEN =        "CMMECC",
  ISSN =         "0045-7825, 0374-2830",
  ISSN-L =       "0045-7825",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Appl. Res. Assoc. Inc., Raleigh, NC, USA",
  classification = "C4185 (Finite element analysis); C4240P (Parallel
                 programming and algorithm theory); C5440
                 (Multiprocessing systems); C6150N (Distributed systems
                 software); C7440 (Civil and mechanical engineering
                 computing)",
  fjournal =     "Computer Methods in Applied Mechanics and
                 Engineering",
  keywords =     "128-Processor Intel hypercube; Adaptive dynamic
                 relaxation algorithm; Efficient parallel
                 implementation; Engineering workstation cluster;
                 Frictionless contact; Interprocessor communication;
                 Nonlinear hyperelastic structures; Nonlinear static
                 analysis; Parallel-processing resource; PVM; Relative
                 speedups; Scalability; Three-dimensional hyperelastic
                 systems; Two-dimensional hyperelastic systems",
  pubcountry =   "Netherlands",
  thesaurus =    "Adaptive systems; Elasticity; Engineering
                 workstations; Finite element analysis; Hypercube
                 networks; Local area networks; Parallel algorithms;
                 Relaxation; Structural engineering computing",
}

%%% Conference paper from the 33rd Annual Southeast Conference;
%%% proceedings-level fields are inherited from the crossref'ed entry
%%% ACM:1995:PAS.  The meeting was held in Clemson, South Carolina, so
%%% the state code in meetingaddress is SC.
@InProceedings{Olszewski:1995:TCC,
  author =       "Luke Olszewski",
  title =        "A timing comparison of the conjugate gradient and
                 {Gauss--Seidel} parallel algorithms in a
                 one-dimensional flow equation using {PVM}",
  crossref =     "ACM:1995:PAS",
  pages =        "205--212",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "The development of parallel processing came about due
                 to the ineffectiveness of a single processor to
                 accommodate the solutions of large scale problems in a
                 reasonable amount of time. In this paper, we shall
                 introduce one such problem, and discuss the
                 implementation of two parallel algorithms applied to
                 the linear approximations. This study will illustrate
                 how an approximation method which has a faster rate of
                 convergence may not necessarily produce the best
                 solution time.",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Math. and Comput. Sci., Georgia Southern
                 Univ.",
  affiliationaddress = "Statesboro, GA, USA",
  classification = "631.1; 722.4; 723.1; 723.5; 921.1; 921.6; C4130
                 (Interpolation and function approximation); C4140
                 (Linear algebra); C4240P (Parallel programming and
                 algorithm theory); C7310 (Mathematics computing); C7340
                 (Geophysics computing)",
  conference =   "Proceedings of the 33rd Annual Southeast Conference",
  corpsource =   "Dept. of Math. and Comput. Sci., Georgia Southern
                 Univ., Statesboro, GA, USA",
  journalabr =   "Proc Annu Southeast Conf",
  keywords =     "Approximation theory; approximations; Computational
                 methods; Conjugate gradient; conjugate gradient;
                 conjugate gradient methods; Conjugate gradient parallel
                 algorithms; Convergence; convergence; convergence of
                 numerical; Convergence of numerical methods; Flow of
                 fluids; Gauss Seidel parallel algorithms; Gauss--Seidel
                 parallel algorithm; geophysics computing; hydrology;
                 iterative methods; Large scale problems; large scale
                 problems; Large scale systems; linear; Linear
                 approximations; Mathematical models; mathematics
                 computing; methods; Numerical methods; One dimensional
                 flow equation; One-dimensional flow equation;
                 one-dimensional flow equation; Parallel algorithms;
                 parallel algorithms; Parallel processing; parallel
                 processing; Parallel processing systems; Parallel
                 virtual machine; PVM; Richards equation; Timing;
                 timing; Timing comparison",
  meetingaddress = "Clemson, SC, USA",
  meetingdate =  "Mar 17--18 1995",
  meetingdate2 = "03/17--18/95",
  sponsororg =   "ACM",
  thesaurus =    "Conjugate gradient methods; Convergence of numerical
                 methods; Geophysics computing; Hydrology; Iterative
                 methods; Mathematics computing; Parallel algorithms",
  treatment =    "T Theoretical or Mathematical",
}

%%% Conference paper on reservoir history matching with PVM on a
%%% workstation network.  Proceedings-level fields are inherited from
%%% the crossref'ed entry Anonymous:1995:RSS.
@InProceedings{Ouenes:1995:PRA,
  author =       "A. Ouenes and W. W. Weiss and J. A. Sultan and J.
                 Anwar",
  title =        "Parallel Reservoir Automatic History Matching Using a
                 Network of Workstations and {PVM}",
  crossref =     "Anonymous:1995:RSS",
  pages =        "125--134",
  year =         "1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% Conference paper; proceedings-level fields are inherited from the
%%% crossref'ed entry IEEE:1995:PIP.  The network parameters k and n in
%%% the title are variables and are therefore set in math mode.
@InProceedings{Panda:1995:GRW,
  author =       "D. K. Panda",
  title =        "Global reduction in wormhole $k$-ary $n$-cube networks
                 with multidestination exchange worms",
  crossref =     "IEEE:1995:PIP",
  pages =        "652--659",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. and Inf. Sci., Ohio State Univ.,
                 Columbus, OH, USA",
  classification = "C4230M (Multiprocessor interconnection); C5220P
                 (Parallel architecture); C5440 (Multiprocessing
                 systems)",
  keywords =     "Barrier synchronization operations; Communication
                 startup time; Complete global reduction; Data size;
                 Fast global reduction; Global reduction; Message
                 passing interface standard; Multidestination exchange
                 worms; Multidestination message passing mechanism;
                 Pairwise exchange worms; System size; Unicast-based
                 message passing; Wormhole k-ary n-cube networks",
  thesaurus =    "Hypercube networks; Message passing; Synchronisation",
}

@InProceedings{Panda:1995:IDE,
  author =       "D. K. Panda",
  title =        "Issues in designing efficient and practical algorithms
                 for collective communication on wormhole-routed
                 systems",
  crossref =     "Agrawal:1995:PIW",
  pages =        "8--15",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. and Inf. Sci., Ohio State Univ.,
                 Columbus, OH, USA",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5220P (Parallel architecture); C5470 (Performance
                 evaluation and testing); C6150N (Distributed systems
                 software)",
  keywords =     "Collective communication; Collective communication
                 operations; Communication types; Message Passing
                 Interface; MPI standard; Performance evaluation;
                 Practical algorithms; Scalable parallel systems;
                 Wormhole routed systems; Wormhole-routed systems",
  thesaurus =    "Message passing; Multiprocessor interconnection
                 networks; Parallel algorithms; Parallel machines",
}

@InProceedings{Pennington:1995:DHC,
  author =       "R. L. Pennington",
  title =        "Distributed and heterogeneous computing",
  crossref =     "Vandoni:1995:CSC",
  pages =        "25--57",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Pittsburgh Supercomputing Center, PA, USA",
  classification = "C6110P (Parallel programming); C6150N (Distributed
                 systems software)",
  keywords =     "C; Distributed computing; Fortran; Heterogeneous
                 computing; Message passing; Programming; PVM",
  thesaurus =    "Message passing; Parallel machines; Parallel
                 programming; Virtual machines",
}

@InProceedings{Periyathamby:1995:NSG,
  author =       "U. Periyathamby and B. C. Khoo and K. S. Yeo and Q. X.
                 Wang",
  title =        "A Numerical Simulation of the Growth and Collapse of
                 Vapour Cavity Near a Free Surface on Distributed
                 Computing Through {PVM}",
  crossref =     "Bilger:1995:AFM",
  pages =        "815--818",
  year =         "1995",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Pfenning:1995:OCP,
  author =       "J{\"o}rg-Thomas Pfenning and Christoph Moll",
  title =        "Optimized communication patterns on workstation
                 clusters",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "21",
  number =       "3",
  pages =        "373--388",
  day =          "10",
  month =        mar,
  year =         "1995",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Aug 6 10:14:24 MDT 1999",
  bibsource =    "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1995&volume=21&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1995&volume=21&issue=3&aid=964",
  acknowledgement = ack-nhfb,
  affiliation =  "Koln Univ., Germany",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5220P (Parallel architecture); C5440 (Multiprocessing
                 systems); C6110P (Parallel programming); C6150N
                 (Distributed systems software)",
  corpsource =   "Koln Univ., Germany",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  keywords =     "Data parallel programming model; Dynamic loop
                 scheduling algorithm; FDDI-ring; High startup
                 latencies; Limited communication bandwidth; Matrix
                 multiplication; Network usage; Optimized communication
                 patterns; Parallel communications architecture; PVM
                 message passing library; Sequential communication;
                 Sparse communication patterns; Workstation clusters",
  pubcountry =   "Netherlands",
  thesaurus =    "Message passing; Parallel programming; Scheduling;
                 Workstations",
}

@Article{Piscaglia:1995:DOC,
  author =       "P. Piscaglia and B. Macq and P. Maes",
  title =        "Distributed optimization of codebooks",
  journal =      j-SIGNAL-PROCESS-IMAGE-COMMUN,
  volume =       "7",
  number =       "3",
  pages =        "211--223",
  month =        sep,
  year =         "1995",
  CODEN =        "SPICEF",
  ISSN =         "0923-5965 (print), 1879-2677 (electronic)",
  ISSN-L =       "0923-5965",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ. Catholique de Louvain, Belgium",
  classification = "B6120B (Codes); B6140C (Optical information, image
                 and video signal processing); B6210L (Computer
                 communications); C5260B (Computer vision and image
                 processing techniques); C5440 (Multiprocessing
                 systems); C5620L (Local area networks)",
  fjournal =     "Signal Processing: Image Communication",
  keywords =     "Codebooks; Codebooks optimization; Communication
                 bandwidth minimisation; Computer network; Distributed
                 optimization; Failure robustness; General-purpose
                 workstations; Image processing algorithms; LBG
                 algorithm; Load balancing; Parallel virtual machine;
                 Processors synchronisation; Specialized library",
  pubcountry =   "Netherlands",
  thesaurus =    "Image coding; Local area networks; Parallel machines;
                 Virtual machines; Workstations",
}

@InProceedings{Plank:1995:ADC,
  author =       "J. S. Plank and Youngbae Kim and J. J. Dongarra",
  title =        "Algorithm-based diskless checkpointing for fault
                 tolerant matrix operations",
  crossref =     "IEEE:1995:DPT",
  pages =        "351--360",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Tennessee Univ., TN, USA",
  classification = "C4130 (Interpolation and function approximation);
                 C4140 (Linear algebra); C6110B (Software engineering
                 techniques); C6150N (Distributed systems software);
                 C7300 (Natural sciences computing)",
  keywords =     "Algorithm-based diskless checkpointing; Cholesky
                 factorization; Distributed scientific computations;
                 Fault tolerant matrix operations; Fault-tolerance;
                 High-performance implementations; IBM SP2; Long-running
                 scientific computations; Low overhead; LU
                 factorization; Performance; Preconditioned conjugate
                 gradient; Processors; PVM networks; QR factorization;
                 SUN workstations; Workstation network platform",
  thesaurus =    "Conjugate gradient methods; Local area networks;
                 Matrix algebra; Natural sciences computing; Software
                 fault tolerance; Subroutines; Workstations",
}

@InProceedings{Prasad:1995:PPB,
  author =       "S. K. Prasad and K. M. Yu",
  title =        "Performance of a {PVM-based} optimistic simulation
                 testbed on different parallel architectures",
  crossref =     "Hamza:1995:PII",
  pages =        "511--514",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C6185 (Simulation
                 techniques); C7430 (Computer engineering)",
  corpsource =   "Dept. of Math. and Comput. Sci., Georgia State Univ.,
                 Atlanta, GA, USA",
  keywords =     "architectures; bus-based shared-memory; discrete event
                 simulation; dynamic time; hypercube networks;
                 hypercube-based parallel computer; local-memory;
                 message granularity; multiprocessor; nCUBE-II;
                 optimistic discrete event simulation testbed; Parallel;
                 parallel; performance evaluation; PVM-based optimistic
                 simulation testbed; RS-6000; shared memory; Silicon
                 Graphics 4D/GTX; systems; Unix workstations; Virtual
                 Machine package; virtual machines; window",
  sponsororg =   "IASTED; ISMM",
  treatment =    "X Experimental",
}

@InProceedings{Puskas:1995:LBW,
  author =       "Z. Puskas",
  title =        "Load Balancing on Workstation Clusters Using {PVM}",
  crossref =     "Ferenczi:1995:PAH",
  pages =        "112--123",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Meas. and Instrum. Eng., Tech. Univ.
                 Budapest, Hungary",
  classification = "C5620L (Local area networks); C6110P (Parallel
                 programming); C6150J (Operating systems); C6150N
                 (Distributed systems software)",
  corpsource =   "Dept. of Meas. and Instrum. Eng., Tech. Univ.
                 Budapest, Hungary",
  keywords =     "capacity sharing; computational; Computational
                 capacity sharing; distributed; distributed parallel
                 system; Distributed parallel system; Distributed
                 programming; load balancing; Load balancing; local area
                 networks; network operating systems; parallel; Parallel
                 Virtual Machine; processor-farm technique;
                 Processor-farm technique; programming; Programming;
                 programming models; Programming models; PVM; resource
                 allocation; workstation clusters; Workstation clusters;
                 workstations",
  pubcountry =   "Hungary",
  thesaurus =    "Local area networks; Network operating systems;
                 Parallel programming; Resource allocation;
                 Workstations",
  treatment =    "P Practical",
}

@InProceedings{Qaddouri:1995:MFS,
  author =       "A. Qaddouri and R. Roy and B. Goulard",
  title =        "Multigroup flux solvers using {PVM} [{Parallel Virtual
                 Machine}]",
  crossref =     "ANS:1995:MCR",
  volume =       "2",
  pages =        "1554--1562",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A2820H (Neutron diffusion); C7470 (Nuclear
                 engineering computing)",
  corpsource =   "Inst. de Genie Nucleaire, Ecole Polytech. de Montreal,
                 Que., Canada",
  keywords =     "collision probability; distributed memory; iterative;
                 multigroup flux solvers; neutron transport theory;
                 nuclear engineering computing; parallel processing;
                 Parallel Virtual Machine; PVM; time-independent
                 transport equation",
  sponsororg =   "ANS; Eur. Nucl. Soc.; Atomic Energy Soc. Japan",
  treatment =    "T Theoretical or Mathematical",
}

@MastersThesis{Qu:1995:FAS,
  author =       "Su Qu",
  title =        "Feature-driven area-based stereo matching method on
                 {PVM}",
  type =         "M.S. thesis",
  school =       inst-UGA,
  address =      inst-UGA:adr,
  pages =        "x + 110",
  year =         "1995",
  bibdate =      "Mon Jan 15 15:32:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Directed by Hamid R. Arabnia.",
  acknowledgement = ack-nhfb,
}

@InProceedings{Rambu:1995:DSS,
  author =       "N. Rambu and S. Stefan and D. Borsan and S. Talpos",
  title =        "A diagnostic study of some meteorological fields
                 simulated with {UKMO} and {MPI} atmospheric general
                 circulation models",
  crossref =     "Gates:1995:PFI",
  pages =        "493--498",
  year =         "1995",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Ramon:1995:PKV,
  author =       "J. Ramon and P. Pena",
  title =        "Parallelization of {KENO-Va Monte Carlo} code",
  journal =      j-COMP-PHYS-COMM,
  volume =       "88",
  number =       "1",
  pages =        "76--82",
  month =        jul,
  year =         "1995",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/0010-4655(95)00025-B",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/001046559500025B",
  acknowledgement = ack-nhfb,
  affiliation =  "Consejo de Seguridad Nucl., Madrid, Spain",
  classification = "A2820H (Neutron diffusion); A2846E (Nuclear
                 criticality safety); A2846G (Packaging and
                 transportation of nuclear materials); C6110P (Parallel
                 programming); C7470 (Nuclear engineering computing)",
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
  keywords =     "CONVEX C3440; Criticality; Distributed memory systems;
                 FDDI network; Fuel storage pools; HP9000/735; KENO-Va
                 code; Message-passing interface; Monte Carlo code;
                 Parallelization; PVM; Random numbers; SCALE system;
                 Shared memory machines; Shipping casks; Transport
                 equation",
  pubcountry =   "Netherlands",
  thesaurus =    "Monte Carlo methods; Neutron transport theory; Nuclear
                 criticality safety; Nuclear engineering computing;
                 Nuclear materials packaging; Parallel programming",
}

@InProceedings{Ratha:1995:CUC,
  author =       "N. K. Ratha and A. K. Jain and M. J. Chung",
  title =        "Clustering using a coarse-grained parallel genetic
                 algorithm: a preliminary study",
  crossref =     "Cantoni:1995:CCA",
  pages =        "331--338",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Michigan State Univ., East
                 Lansing, MI, USA",
  classification = "C1180 (Optimisation techniques); C1250 (Pattern
                 recognition); C4240P (Parallel programming and
                 algorithm theory); C6150N (Distributed systems
                 software)",
  keywords =     "Coarse grained parallel genetic algorithm;
                 Coarse-grained parallel genetic algorithm; Complex
                 optimization problems; Data sets; Distributed
                 algorithm; Distributed implementation; Divide and
                 conquer approach; GAs; Near linear speedup; Optimal
                 minimum squared error partition; Optimization problem;
                 Pattern clustering; Preliminary study; PVM; Standard
                 communication library; Standard K-means clustering
                 algorithm; Workstation cluster",
  thesaurus =    "Distributed algorithms; Divide and conquer methods;
                 Genetic algorithms; Pattern recognition; Problem
                 solving",
}

@InProceedings{Ratha:1995:DED,
  author =       "N. K. Ratha and T. Acar and M. Gokmen and A. K. Jain",
  title =        "A distributed edge detection and surface
                 reconstruction algorithm",
  crossref =     "Cantoni:1995:CCA",
  pages =        "149--154",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Michigan State Univ., East
                 Lansing, MI, USA",
  classification = "B0260 (Optimisation techniques); B6140C (Optical
                 information, image and video signal processing); C1180
                 (Optimisation techniques); C1250 (Pattern recognition);
                 C4240P (Parallel programming and algorithm theory);
                 C5260B (Computer vision and image processing
                 techniques)",
  keywords =     "Associated energy functional; Cluster of workstations;
                 Distributed edge detection; Graduated non-convexity;
                 Image compression; Image restoration; Line process;
                 Optimal edge assignment; Pixel gray values; PVM
                 communication library; Regularization techniques;
                 Scalable parallel algorithm; Surface reconstruction
                 algorithm; Weak membrane",
  thesaurus =    "Computer vision; Edge detection; Image restoration;
                 Parallel algorithms; Simulated annealing; Surface
                 reconstruction",
}

@InProceedings{Reinefeld:1995:PVE,
  author =       "A. Reinefeld and V. Schnecke",
  title =        "Portability versus efficiency? Parallel applications
                 on {PVM} and {Parix}",
  crossref =     "Fritzson:1995:PPA",
  pages =        "35--49",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C1160 (Combinatorial mathematics); C1180
                 (Optimisation techniques); C5620L (Local area
                 networks); C5440 (Multiprocessing systems); C6110B
                 (Software engineering techniques); C6110P (Parallel
                 programming); C6115 (Programming support); C6150N
                 (Distributed systems software)",
  corpsource =   "Center for Parallel Comput., Paderborn Univ.,
                 Germany",
  keywords =     "algorithm architecture; benchmarks; combinatorial
                 mathematics; computing; efficiency; high-level
                 programming environment; LAN; local area networks;
                 massively parallel transputer system; moderately
                 parallel Parsytec GC/PowerPlus; optimisation; parallel
                 algorithms; parallel applications; parallel
                 high-performance; parallel programming; Parix
                 programming model; performance; portability; portable
                 programming models; processors; programming
                 environments; PVM programming model; software
                 performance evaluation; software portability; system;
                 T805; transputer systems; Unix; UNIX workstation
                 cluster; workstations",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
}

@Article{Reynders:1995:OOO,
  author =       "John V. W. Reynders and David W. Forslund and Paul J.
                 Hinker and Marydell Tholburn and David G. Kilman and
                 William F. Humphrey",
  title =        "{OOPS}: an object-oriented particle simulation class
                 library for distributed architectures",
  journal =      j-COMP-PHYS-COMM,
  volume =       "87",
  number =       "1--2",
  pages =        "212--224",
  day =          "2",
  month =        may,
  year =         "1995",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/0010-4655(94)00172-X",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Mon Feb 13 21:29:54 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/001046559400172X",
  acknowledgement = ack-nhfb,
  affiliation =  "Adv. Comput. Lab., Los Alamos Nat. Lab., NM, USA",
  classification = "A0270 (Computational techniques); A0520G (Classical
                 ensemble theory); C5220P (Parallel architecture);
                 C6110J (Object-oriented programming); C6110P (Parallel
                 programming); C7320 (Physics and chemistry computing)",
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
  keywords =     "Connection Machine CM5; CRI T3D; Distributed
                 architectures; Materials science; Object-oriented
                 particle simulation class library; OOPS; Plasma
                 physics; Porous media; Portable code; PVM clusters;
                 Suspension flows; Uniform high-level interface; Vortex
                 simulations",
  pubcountry =   "Netherlands",
  thesaurus =    "C listings; Digital simulation; Multiprocessing
                 programs; Object-oriented programming; Parallel
                 architectures; Physics computing; Software libraries;
                 Software portability; Statistical mechanics",
}

@Article{Ross:1995:DCM,
  author =       "D. L. Ross and J. S. Collins and J. H. George",
  title =        "A dynamic capacity model using concurrent processing",
  journal =      j-NEURAL-PAR-SCI-COMPUT,
  volume =       "3",
  number =       "2",
  pages =        "249--262",
  month =        jun,
  year =         "1995",
  CODEN =        "NPACEM",
  ISSN =         "1061-5369",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Math., Embry-Riddle Aeronaut. Univ., Daytona
                 Beach, FL, USA",
  classification = "C1180 (Optimisation techniques); C5220P (Parallel
                 architecture); C5440 (Multiprocessing systems); C7430
                 (Computer engineering); C7460 (Aerospace engineering
                 computing)",
  fjournal =     "Neural, Parallel and Scientific Computations",
  keywords =     "Airport capacity optimisation; Concurrent processing;
                 Concurrent programming; Dynamic capacity model;
                 Iterative method; National Airspace System; Optimal
                 capacity profiles; Parallel virtual machine; Public
                 domain software PVM; Time-varying index",
  thesaurus =    "Aerospace computing; Optimisation; Parallel
                 processing; Public domain software; Virtual machines",
}

@Article{Schafers:1995:TGP,
  author =       "L. Schafers and C. Scheidler and O. Kramer-Fuhrmann",
  title =        "{TRAPPER}: a graphical programming environment for
                 parallel systems",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "11",
  number =       "4--5",
  pages =        "351--361",
  month =        aug,
  year =         "1995",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Res. and Technol., Daimler-Benz AG, Berlin,
                 Germany",
  classification = "C6110P (Parallel programming); C6110V (Visual
                 programming); C6115 (Programming support); C6130B
                 (Graphics techniques); C6150G (Diagnostic, testing,
                 debugging and evaluating systems); C6150N (Distributed
                 systems software)",
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
  keywords =     "Communicating sequential processes; Communication
                 loads; Computation loads; Configtool; Designtool;
                 Graphical programming environment; Graphical
                 representation; High computing power; Hybrid program
                 development; Industrial applications; Interprocess
                 communication; Mapping; Monitoring system;
                 Optimization; Parallel process structure; Parallel
                 systems; Software event recording; System design;
                 Target hardware configuration; Textual representations;
                 TRAPPER; Visualization",
  pubcountry =   "Netherlands",
  thesaurus =    "Communicating sequential processes; Computer
                 animation; Data visualisation; Local area networks;
                 Parallel machines; Parallel programming; Programming
                 environments; Software tools; System monitoring;
                 Transputer systems; Visual programming",
}

@InProceedings{Schuster:1995:CSM,
  author =       "G. Schuster and F. Breitenecker",
  title =        "Coupling Simulators with the Model Interconnection
                 Concept and {PVM}",
  crossref =     "Breitenecker:1995:ESC",
  pages =        "321--326",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "ARGE Simulation News, Tech. Univ. of Vienna, Austria",
  classification = "C6140D (High level languages); C6150N (Distributed
                 systems software); C6185 (Simulation techniques)",
  corpsource =   "ARGE Simulation News, Tech. Univ. of Vienna, Austria",
  keywords =     "ACSL; communication; continuous simulation; Continuous
                 simulation systems; digital simulation; message
                 passing; message passing system PVM; Message passing
                 system PVM; model interconnection concept; Model
                 interconnection concept; MOSIS; Mosis; program; Program
                 communication; PVM; simulation languages; systems",
  pubcountry =   "Netherlands",
  thesaurus =    "Digital simulation; Message passing; Simulation
                 languages",
  treatment =    "P Practical",
}

@Article{Sekharan:1995:LBM,
  author =       "Chandra N. Sekharan and Vineet Goel and R. Sridhar",
  title =        "Load balancing methods for ray tracing and binary tree
                 computing using {PVM}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "21",
  number =       "12",
  pages =        "1963--1978",
  day =          "12",
  month =        dec,
  year =         "1995",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Aug 06 18:01:04 1999",
  bibsource =    "Compendex database;
                 http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1995&volume=21&issue=12;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1995&volume=21&issue=12&aid=1028",
  abstract =     "We propose efficient load balancing methods for two
                 computational problems namely ray tracing and bottom-up
                 binary tree computing in a distributed environment. In
                 the context of ray tracing, we propose a variant of a
                 static load balancing technique presented in [15] where
                 the sampling is based on partitioning the object space.
                 Our approach partitions the image instead and uses an
                 efficient scheduling technique for load balancing.
                 Computations carried out on a binary tree arise
                 naturally in image processing and network optimization
                 problems. Many of these problems are solved efficiently
                 in parallel by the popular tree contraction technique
                 [1]. In this paper, we explore the tree-contraction
                 technique in a distributed setting using the grain
                 packing method [9]. Implementations of our algorithms
                 on a cluster of workstations using Parallel Virtual
                 Machine (PVM) [6] demonstrate near-perfect load
                 balancing.",
  acknowledgement = ack-nhfb,
  affiliation =  "Loyola Univ of Chicago",
  affiliationaddress = "Chicago, IL, USA",
  classification = "721.1; 722.3; 722.4; 723.1; 723.2; 921.4; C1160
                 (Combinatorial mathematics); C4240P (Parallel
                 programming and algorithm theory); C6130B (Graphics
                 techniques)",
  corpsource =   "Dept. of Math. and Comput. Sci., Loyola Univ.,
                 Chicago, IL, USA",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  journalabr =   "Parallel Comput",
  keywords =     "Algorithms; Binary tree computing; binary tree
                 computing; Computational complexity; Computer graphics;
                 Computer networks; Computer workstations; distributed
                 algorithms; Distributed computer systems; Distributed
                 environment; Grain packing methods; Image processing;
                 Load balancing; load balancing; Machine; network
                 optimization; Optimization; Parallel processing
                 systems; Parallel Virtual; Parallel virtual machine;
                 partitioning; PVM; Ray tracing; ray tracing; Resource
                 allocation; resource allocation; Scheduling; scheduling
                 technique; Tree contraction technique; Trees
                 (mathematics); trees (mathematics); Workstation
                 cluster",
  pubcountry =   "Netherlands",
  treatment =    "T Theoretical or Mathematical",
}

@Article{Shen:1995:PSM,
  author =       "H. Shen",
  title =        "Parallel $k$-set mutual range-join in hypercubes",
  journal =      j-MICROPROC-MICROPROG,
  volume =       "41",
  number =       "7",
  pages =        "443--448",
  month =        nov,
  year =         "1995",
  CODEN =        "MMICDT",
  ISSN =         "0165-6074 (print), 1878-7061 (electronic)",
  ISSN-L =       "0165-6074",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. and Inf. Technol., Griffith Univ.,
                 Nathan, Qld., Australia",
  classification = "C4230M (Multiprocessor interconnection); C4240P
                 (Parallel programming and algorithm theory); C5220P
                 (Parallel architecture); C5470 (Performance evaluation
                 and testing); C5670 (Network performance)",
  fjournal =     "Microprocessing and Microprogramming",
  keywords =     "Data comparisons; Hypercubes; Mutual range-join;
                 Parallel algorithm; Parallel k-set mutual range-join;
                 Performance; Permutation-based range-join; PVM; Tuples;
                 Worst case",
  pubcountry =   "Netherlands",
  thesaurus =    "Hypercube networks; Parallel algorithms; Performance
                 evaluation",
}

%%% Siegelin, Finger, and O'Donnell (1995): conference paper on
%%% WARPmemory for workstation networks; bibliographic details of the
%%% proceedings volume are inherited from the crossref parent.
@InProceedings{Siegelin:1995:BPW,
  author =       "C. Siegelin and U. Finger and C. O'Donnell",
  title =        "Boosting the performance of workstations through
                 {WARPmemory}",
  crossref =     "Haridi:1995:EPP",
  pages =        "703--706",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. Inf., {\'E}cole Nat. Sup{\'e}rieure des
                 T{\'e}l{\'e}commun., Paris, France",
  classification = "C5310 (Storage system design); C5540 (Terminals and
                 graphic displays); C5620L (Local area networks); C6110P
                 (Parallel programming); C6120 (File organisation);
                 C6150N (Distributed systems software)",
  keywords =     "Improved workstation performance; Local network;
                 Parallel program execution; Performance optimization;
                 Physically shared memory; PVM; Running system; Serially
                 multiported memory; Standard programming interface;
                 WARPmemory; Workstation network",
  thesaurus =    "Application program interfaces; Local area networks;
                 Memory architecture; Message passing; Parallel
                 programming; Shared memory systems; Workstations",
}

%%% Silva, Silva, Chapple, and Clarke (1995): conference paper on
%%% portable checkpointing and recovery.
%%% NOTE(review): the keyword "F CHIMP/MPI" may carry a stray or
%%% truncated leading token; confirm against the source record.
@InProceedings{Silva:1995:PCR,
  author =       "L. M. Silva and J. G. Silva and S. Chapple and L.
                 Clarke",
  title =        "Portable checkpointing and recovery",
  crossref =     "IEEE:1995:PFI",
  pages =        "188--195",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. de Engenharia Inf., Coimbra Univ., Portugal",
  classification = "C6110B (Software engineering techniques); C6150J
                 (Operating systems)",
  keywords =     "Data-reconfiguration; F CHIMP/MPI; Flexible recovery
                 mechanism; Parallel library; Portability; Portable
                 checkpointing; Recovery",
  thesaurus =    "Operating systems [computers]; Parallel machines;
                 Software portability; System recovery",
}

%%% Sparse duplicate of the record Simunovic:1995:MIP (same crossref
%%% and pages).  The historical, misspelled citation key is retained so
%%% that existing citations of it still resolve.
@InProceedings{Simmunovic:1995:MIP,
  author =       "S. Simunovic and T. Zacharia and N. Baltas and D. B.
                 Spalding",
  title =        "{MPI} Implementation of {PHOENICS}: a General Purpose
                 Computational Fluid Dynamics Code",
  crossref =     "Tentner:1995:HPC",
  pages =        "122--127",
  year =         "1995",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% Simunovic, Zacharia, Baltas, and Spalding (1995): conference paper
%%% on an MPI implementation of the PHOENICS computational fluid
%%% dynamics code.
@InProceedings{Simunovic:1995:MIP,
  author =       "S. Simunovic and T. Zacharia and N. Baltas and D. B.
                 Spalding",
  title =        "{MPI} implementation of {PHOENICS}: a general purpose
                 computational fluid dynamics code",
  crossref =     "Tentner:1995:HPC",
  pages =        "122--127",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A4725Q (Convection and heat transfer); A4770F
                 (Chemically reactive flows); A8240 (Chemical kinetics
                 and reactions: special regimes); C5220P (Parallel
                 architecture); C5440 (Multiprocessing systems); C6110B
                 (Software engineering techniques); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C6150N
                 (Distributed systems software); C6185 (Simulation
                 techniques); C7320 (Physics and chemistry computing)",
  conftitle =    "Proceedings High Performance Computing '95",
  corpsource =   "Oak Ridge Nat. Lab., TN, USA",
  keywords =     "chemical reactions; chemically reactive flow;
                 chemistry computing; computational analysis programs;
                 digital simulation; dynamical reaction process
                 simulation; EARTH parallel version; flow simulation;
                 fluid dynamics; fluid flow simulation; general purpose
                 computational fluid dynamics code; heat transfer; heat
                 transfer simulation; heterogeneous computer networks;
                 high performance computing; Intel Paragon XP/S 35;
                 Intel Paragon XP/S 5; Kendall Square Research; large
                 scale computational simulations; massively parallel
                 supercomputers; message passing; Message Passing
                 Interface standard; MPI libraries; multiprocessing
                 systems; multiprocessor SGI Onyx computer; parallel
                 architectures; parallel machines; PHOENICS; physics
                 computing; portable computational tool; program
                 testing; scalable performance; software packages;
                 software performance evaluation; software portability",
  sponsororg =   "SCS",
  treatment =    "P Practical",
}

%%% Sitsky, Walsh, and Johnson (1995): MPI on the Fujitsu AP1000;
%%% recorded as a journal article, with the originating conference
%%% given in the conftitle and conflocation fields.  The month value
%%% "????" marks an unknown month.
@Article{Sitsky:1995:IPM,
  author =       "D. Sitsky and D. Walsh and C. Johnson",
  title =        "Implementation and performance of the {MPI} message
                 passing interface on the {Fujitsu AP1000}
                 multicomputer",
  journal =      j-AUSTRALIAN-COMP-SCI-COMM,
  volume =       "17",
  number =       "1",
  pages =        "475--481",
  month =        "????",
  year =         "1995",
  CODEN =        "ACSCDD",
  ISSN =         "0157-3055",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Australian Nat. Univ.,
                 Canberra, ACT, Australia",
  classification = "C6110P (Parallel programming); C6150N (Distributed
                 systems software); C6155 (Computer communications
                 software)",
  conflocation = "Glenelg, SA, Australia; 1--3 Feb. 1995",
  conftitle =    "Eighteenth Australasian Computer Science Conference.
                 ACSC'95",
  corpsource =   "Dept. of Comput. Sci., Australian Nat. Univ.,
                 Canberra, ACT, Australia",
  fjournal =     "Australian Computer Science Communications",
  keywords =     "application program interfaces; benchmarks;
                 Benchmarks; broadcasting; clustered systems; Clustered
                 systems; collective routines; Collective routines;
                 computer communications software; Fujitsu AP1000
                 multicomputer; group-wide broadcast; Group-wide
                 broadcast; hardware operations; Hardware operations;
                 implementation; Implementation; message passing;
                 Message Passing Interface; MPI; multiprocessing
                 systems; native calls; Native calls; operating system;
                 Operating system; parallel libraries; Parallel
                 libraries; parallel programming; performance;
                 Performance; portability; Portability; selective
                 broadcast operation; Selective broadcast operation;
                 software libraries; software performance evaluation",
  pubcountry =   "Australia",
  thesaurus =    "Application program interfaces; Broadcasting; Computer
                 communications software; Message passing;
                 Multiprocessing systems; Parallel programming; Software
                 libraries; Software performance evaluation",
  treatment =    "P Practical",
}

%%% Sivaraman and Raghavendra (1995): conference paper on parallelizing
%%% sequential programs for a cluster of workstations; PVM is listed
%%% among the keywords.
@InProceedings{Sivaraman:1995:PSP,
  author =       "H. Sivaraman and C. S. Raghavendra",
  title =        "Parallelizing sequential programs to a cluster of
                 workstations",
  crossref =     "Agrawal:1995:PIW",
  pages =        "38--41",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Electr. Eng. and Comput. Sci., Washington
                 State Univ., Pullman, WA, USA",
  classification = "C5440 (Multiprocessing systems); C6115 (Programming
                 support); C6150C (Compilers, interpreters and other
                 processors)",
  keywords =     "ADAPTOR; ADDT; Automatic parallelization system;
                 AZTEC; Benchmark programs; Cluster of workstations;
                 Data distribution tool; GUI; HPF compiler; Parafrase-2
                 parallelizing compiler; PVM; Sequential programs
                 parallelisation; Source file",
  thesaurus =    "Parallel processing; Parallelising compilers; Software
                 tools; Workstations",
}

%%% Skjellum, Lusk, and Gropp (1995): journal article surveying early
%%% application use of MPI.  The month field holds a season name
%%% ("Summer") rather than a month macro.
@Article{Skjellum:1995:EAM,
  author =       "Anthony Skjellum and Ewing Lusk and William Gropp",
  title =        "Early applications in the {Message-Passing Interface}
                 ({MPI})",
  journal =      j-IJSAHPC,
  volume =       "9",
  number =       "2",
  pages =        "79--94",
  month =        "Summer",
  year =         "1995",
  CODEN =        "IJSCFG",
  ISSN =         "1078-3482",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "We describe a number of early efforts to make use of
                 the Message-Passing Interface (MPI) standard in
                 applications, based on an informal survey conducted in
                 May--June, 1994. Rather than a definitive statement of
                 all MPI developmental work, this paper addresses the
                 initial successes, progress, and impressions that
                 application developers have had with MPI, according to
                 the responses received. We summarize the important
                 aspects of each survey response, and draw conclusions
                 about the spread of MPI into applications. An
                 understanding of message passing and access to the MPI
                 standard are prerequisites for appreciating this paper.
                 Some background material is provided to ease this
                 requirement.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mississippi State Univ",
  affiliationaddress = "Mississippi State, MS, USA",
  classification = "722.2; 722.3; 722.4; 902.2; C6150N (Distributed
                 systems software)",
  corpsource =   "Dept. of Comput. Sci., Mississippi State Univ., MS,
                 USA",
  fjournal =     "International Journal of Supercomputer Applications
                 and High Performance Computing",
  journalabr =   "Int J Supercomput Appl High Perform Comput",
  keywords =     "Application developers; application developers;
                 Computer hardware; Data communication systems; message
                 passing; Message passing interface (MPI);
                 Message-Passing Interface; MPI standard; Network
                 protocols; software engineering; software standards;
                 Standards; Survey; survey; User interfaces",
  thesaurus =    "Message passing; Software engineering; Software
                 standards",
  treatment =    "P Practical",
}

%%% Skjellum, Doss, Viswanathan, Chowdappa, and Bangalore (1995):
%%% conference paper on extending the message passing interface (MPI).
%%% Many keywords appear in both lower-case and capitalized variants.
@InProceedings{Skjellum:1995:EMP,
  author =       "A. Skjellum and N. E. Doss and K. Viswanathan and A.
                 Chowdappa and P. V. Bangalore",
  title =        "Extending the message passing interface ({MPI})",
  crossref =     "IEEE:1995:PSP",
  pages =        "106--118",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Mississippi State Univ., MS,
                 USA",
  classification = "C5440 (Multiprocessing systems); C6150N (Distributed
                 systems software)",
  conftitle =    "Proceedings Scalable Parallel Libraries Conference",
  corpsource =   "Dept. of Comput. Sci., Mississippi State Univ., MS,
                 USA",
  keywords =     "computer networks; Computer networks; Europe; high
                 performance computing; High performance computing;
                 intercommunicator extensions; Intercommunicator
                 extensions; message passing; message passing interface;
                 Message passing interface; message passing standard;
                 Message passing standard; MPI Forum; multicomputers;
                 Multicomputers; multinational vendors; Multinational
                 vendors; national laboratories; National laboratories;
                 processor scheduling; research centers; Research
                 centers; scheduling; Scheduling; standards; United
                 States; universities; Universities; workstations;
                 Workstations",
  sponsororg =   "Mississippi State Univ.; NSF",
  thesaurus =    "Computer networks; Message passing; Processor
                 scheduling; Standards",
  treatment =    "P Practical",
}

%%% Smith, Baratta, and Robinson (1995): journal article on coupling
%%% the RELAP5 and CONTAIN codes through PVM for reactor accident
%%% analysis.  The month field spans an issue range (jan--jun).
%%% NOTE(review): the lone keywords "containment", "engineering
%%% computing", "fission", and "nuclear" may also be fragments of
%%% longer index phrases; confirm against the source record.
@Article{Smith:1995:CRC,
  author =       "K. A. Smith and A. J. Baratta and G. E. Robinson",
  title =        "Coupled {RELAP5} and {CONTAIN} Accident Analysis Using
                 {PVM}",
  journal =      j-NUCLEAR-SAFETY,
  volume =       "36",
  number =       "1",
  pages =        "94--108",
  month =        jan # "--" # jun,
  year =         "1995",
  CODEN =        "NUSAAZ",
  ISSN =         "0029-5604",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "This article describes the development of an
                 integrated accident analysis capability considering
                 both reactor vessel and containment system responses.
                 This integrated package, which uses the RELAP5 and
                 CONTAIN computer codes, provides the user with greater
                 accuracy and modeling flexibility when compared with
                 accident analyses using these codes separately.
                 Multiprocessing, together with message-passing-based
                 data transfer, enables these concurrent RELAP5 and
                 CONTAIN calculations. The data transfer facilitates the
                 coupling between the reactor vessel and containment
                 portions of the calculation. The Parallel Virtual
                 Machine software system running on a network of IBM
                 RISC System\slash 6000 workstations provided the
                 multiprocessing capabilities required for this work.
                 The results of an anticipated-transient-without-scram
                 scenario for a boiling-water reactor nuclear power
                 plant are provided. For the scenario analyzed, the
                 containment temperatures and pressures that were
                 predicted on the basis of the stand-alone codes and
                 standard analysis methods were lower (i.e., less
                 conservative) than those predicted with the use of the
                 integrated code package.",
  acknowledgement = ack-nhfb,
  affiliation =  "Oak Ridge Natl Lab",
  affiliationaddress = "Oak Ridge, TN, USA",
  classification = "621; 641.1; 723.2; 723.5; 914.1; 921.6; A2841C
                 (Computer codes for fission reactor theory and design);
                 A2844 (Fission reactor protection systems, safety and
                 accidents); A2850G (Light water reactors); C6150N
                 (Distributed systems software); C7470 (Nuclear
                 engineering computing)",
  corpsource =   "Oak Ridge Nat. Lab., TN, USA",
  fjournal =     "Nuclear safety",
  journalabr =   "Nucl Saf",
  keywords =     "anticipated-transient-without-scram scenario; Boiling
                 water reactors; boiling-water reactor; BWR;
                 Calculations; Codes (symbols); Computer aided analysis;
                 computer codes; concurrent RELAP5/CONTAIN calculations;
                 CONTAIN computer codes; containment; containment system
                 responses; containment temperatures; Containment
                 vessels; coupled RELAP5/CONTAIN accident analysis; Data
                 transfer; engineering computing; engineering
                 workstations; fission; fission reactor accidents;
                 fission reactor design; IBM computers; IBM RISC
                 System/6000; integrated accident analysis capability;
                 integrated package; message passing;
                 message-passing-based data transfer; modeling
                 flexibility; multiprocessing; multiprocessing
                 capabilities; multiprocessing programs; nuclear;
                 nuclear power plant; nuclear power stations; Nuclear
                 reactor accidents; Parallel processing systems;
                 Parallel Virtual Machine software system; Parallel
                 virtual machine software system; Pressure; pressures;
                 reactor containment; reactor vessel; Reactor vessel and
                 containment system; reduced instruction set computing;
                 RELAP5 computer codes; software packages; stand-alone
                 codes; Temperature; workstations",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

%%% Stagg, Cline, and Carey (1995): conference paper on a parabolized
%%% Navier--Stokes flow solver on the Cray T3D; Parallel Virtual
%%% Machine is listed among the keywords.
@InProceedings{Stagg:1995:IPN,
  author =       "A. K. Stagg and D. D. Cline and G. F. Carey",
  title =        "Implementing a parabolized {Navier--Stokes} flow
                 solver on the {Cray T3D}",
  crossref =     "Bailey:1995:PSS",
  pages =        "143--148",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Jet Propulsion Lab., Cray Res. Inc., Pasadena, CA,
                 USA",
  classification = "C4170 (Differential equations); C4240P (Parallel
                 programming and algorithm theory); C5440
                 (Multiprocessing systems); C6150N (Distributed systems
                 software); C7310 (Mathematics computing)",
  keywords =     "Cray T3D; Globally addressable memory; Hyperbolic
                 parabolic system; Interprocessor communication
                 routines; Large-scale simulation; Library calls;
                 Massively parallel architectures; Massively parallel
                 computers; Message passing; Parabolized Navier--Stokes
                 flow solver; Parallel Virtual Machine; Performance;
                 Performance results",
  thesaurus =    "Cray computers; Hyperbolic equations; Mathematics
                 computing; Message passing; Navier--Stokes equations;
                 Parabolic equations; Parallel algorithms; Parallel
                 machines; Software performance evaluation",
}

%%% Stals (1995): conference paper on parallel adaptive multigrid; PVM
%%% is listed among the keywords.
@InProceedings{Stals:1995:AMP,
  author =       "L. Stals",
  title =        "Adaptive multigrid in parallel",
  crossref =     "Bailey:1995:PSS",
  pages =        "367--372",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Australian Nat. Univ., Canberra, ACT, Australia",
  classification = "C4170 (Differential equations); C4185 (Finite
                 element analysis); C6110P (Parallel programming);
                 C6150N (Distributed systems software); C7310
                 (Mathematics computing)",
  keywords =     "Adaptive multigrid; Adaptive refinement methods; C++;
                 Elliptic partial differential equations; Finite element
                 method; Kernighan--Lin method; Load balancing; MIMD
                 architectures; Multigrid methods; Multigrid programs;
                 Newest node bisection; Parallel multigrid; Polygonal
                 region; PVM; Square domains; Structured grids; Uniform
                 grids; Unstructured grids",
  thesaurus =    "Elliptic equations; Finite element analysis;
                 Mathematics computing; Parallel machines; Parallel
                 programming; Partial differential equations; Resource
                 allocation",
}

%%% Stankovski (1995): conference paper on a massively parallel
%%% collision-probability algorithm in the APOLLO-II code using the
%%% PVM library.
@InProceedings{Stankovski:1995:MPA,
  author =       "Z. Stankovski",
  title =        "A Massively Parallel Algorithm for the Collision
                 Probability Calculations in the {APOLLO-II} Code Using
                 the {PVM} Library",
  crossref =     "ANS:1995:MCR",
  volume =       "2",
  pages =        "1573--1583",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A2820H (Neutron diffusion); A2841C (Computer codes
                 for fission reactor theory and design); C6110P
                 (Parallel programming); C7470 (Nuclear engineering
                 computing)",
  corpsource =   "Dept. de M{\'e}canique et Technol., Commissariat
                 {\`a} l'{\'E}nergie Atomique, Gif sur Yvette, France",
  keywords =     "APOLLO-II code; collision probability; host/node
                 programmation model; massively parallel algorithm;
                 message passing; neutron transport; neutron transport
                 theory; nuclear engineering computing; parallel
                 algorithms; parallel programming; parallelization; PVM
                 library",
  sponsororg =   "ANS; Eur. Nucl. Soc.; Atomic Energy Soc. Japan",
  treatment =    "T Theoretical or Mathematical",
}

%%% Stathopoulos and Ynnerman (1995): conference paper on dynamic load
%%% balancing of atomic structure programs on a PVM cluster.  Many
%%% keywords appear in both lower-case and capitalized variants.
@InProceedings{Stathopoulos:1995:DLB,
  author =       "A. Stathopoulos and A. Ynnerman",
  title =        "Dynamic load balancing of atomic structure programs on
                 a {PVM} cluster",
  crossref =     "Hertzberger:1995:HPM",
  pages =        "384--391",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Vanderbilt Univ., Nashville,
                 TN, USA",
  classification = "C5440 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C6150J (Operating systems)",
  corpsource =   "Dept. of Comput. Sci., Vanderbilt Univ., Nashville,
                 TN, USA",
  keywords =     "atomic data; Atomic data; atomic structure programs;
                 Atomic structure programs; cluster; dedicated cluster
                 of workstations; Dedicated cluster of workstations;
                 dynamic load balancing; Dynamic load balancing; MCHF
                 package; parallel processing; parallel virtual machine;
                 Parallel virtual machine; perfect load balancing;
                 Perfect load balancing; performance evaluation; PVM;
                 PVM cluster; resource allocation",
  pubcountry =   "Germany",
  thesaurus =    "Parallel processing; Performance evaluation; Resource
                 allocation",
  treatment =    "A Application; P Practical",
}

%%% Stellner, Schumann, and Girnghuber (1995): journal article
%%% comparing message-passing libraries with the SPY analysis
%%% environment; the language field records the article as German.
@Article{Stellner:1995:CMP,
  author =       "G. Stellner and M. Schumann and M. Girnghuber",
  title =        "Comparing message-passing libraries with the {SPY}
                 analysis environment",
  journal =      j-IT-IT,
  volume =       "37",
  number =       "2",
  pages =        "46--52",
  month =        apr,
  year =         "1995",
  CODEN =        "ITINEV",
  ISSN =         "0944-2774",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  classification = "C5440 (Multiprocessing systems); C6110P (Parallel
                 programming); C6115 (Programming support)",
  fjournal =     "Informationstechnik und technische Informatik: IT +
                 TI",
  keywords =     "Computational hard problems; Message-passing
                 libraries; Multi-user environments; Networks of
                 workstations; NXLib; P4; PVM; SPY analysis environment;
                 Virtual parallel computer",
  language =     "German",
  pubcountry =   "Germany",
  thesaurus =    "Message passing; Parallel processing; Parallel
                 programming; Programming environments",
}

%%% Stubbs and Carver (1995): conference paper on IPCC++, a C++
%%% extension for interprocess communication with objects; PVM is
%%% listed among the keywords.
@InProceedings{Stubbs:1995:ICE,
  author =       "S. S. Stubbs and D. L. Carver",
  title =        "{IPCC++}: a {C++} extension for interprocess
                 communication with objects",
  crossref =     "IEEE:1995:PNA",
  pages =        "205--210",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Louisiana State Univ., Baton
                 Rouge, LA, USA",
  classification = "C6110J (Object-oriented programming); C6110P
                 (Parallel programming); C6140D (High level languages);
                 C6150N (Distributed systems software); C7430 (Computer
                 engineering)",
  keywords =     "Asynchronous communication; C++ extension; Concurrency
                 primitives; Distributed memory models; Dynamic process
                 creation; Explicit concurrency; Inheritance;
                 Inter-object concurrency; Interprocess communication
                 objects; IPCC++; Language model; Object-oriented
                 programming languages; Orthogonality; Parallel Virtual
                 Machine; PVM; Selective waiting; Socket-based
                 application program interface; Static process creation;
                 Synchronous communication; Typed message passing
                 system; UNIX interprocess communication system calls
                 abstraction",
  thesaurus =    "C language; Distributed memory systems; Inheritance;
                 Message passing; Object-oriented languages; Parallel
                 programming; Unix; Virtual machines",
}

%%% Sunderam (1995): conference paper on recent initiatives in
%%% heterogeneous parallel computing; PVM is listed among the keywords.
@InProceedings{Sunderam:1995:RIH,
  author =       "V. S. Sunderam",
  title =        "Recent initiatives in heterogeneous parallel
                 computing",
  crossref =     "Gray:1995:PCT",
  pages =        "1--16",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
                 GA, USA",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5440 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C6110P (Parallel programming);
                 C6150N (Distributed systems software)",
  keywords =     "Concurrent computing; Concurrent distributed
                 computing; Distributed computing; Heterogeneous
                 parallel computing; Lightweight process; Parallel
                 virtual machine; Performance enhancement; Performance
                 evaluation; PVM; Research initiative; Thread",
  thesaurus =    "Open systems; Parallel processing; Performance
                 evaluation; Virtual machines",
}

%%% Suresh (1995): single-page conference contribution (page 925 of
%%% volume 2) on parallel arithmetic expression parsing; PVM is listed
%%% among the keywords.
@InProceedings{Suresh:1995:IOP,
  author =       "H. Suresh",
  title =        "Implementation of an optimal parallel algorithm for
                 arithmetic expression parsing",
  crossref =     "Narashimhan:1995:IIF",
  volume =       "2",
  pages =        "925",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Fac. of Sci. and Technol., Griffith Univ., Brisbane,
                 Qld., Australia",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5230 (Digital arithmetic methods); C6110P (Parallel
                 programming); C6150C (Compilers, interpreters and other
                 processors)",
  keywords =     "Arithmetic expression parsing; Concurrent processing
                 environment; Optimal parallel algorithm; Parallel
                 computer architectures; PVM; SIMD parallel
                 architecture; Simple recursive descent parser",
  thesaurus =    "Digital arithmetic; Parallel algorithms; Parallelising
                 compilers",
}

%%% Suresh (1995): conference paper (volume 2 of the proceedings) on a
%%% PVM implementation of quadtree building algorithms on an SIMD
%%% hypercube system.
@InProceedings{Suresh:1995:PIQ,
  author =       "H. Suresh",
  title =        "{PVM} implementation of quadtree building algorithms
                 on {SIMD} hypercube system",
  crossref =     "Narashimhan:1995:IIF",
  volume =       "2",
  pages =        "855--858",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Microelectron. Eng., Griffith Univ., Brisbane,
                 Qld., Australia",
  classification = "C1160 (Combinatorial mathematics); C4240P (Parallel
                 programming and algorithm theory); C5260B (Computer
                 vision and image processing techniques); C6110P
                 (Parallel programming); C6120 (File organisation)",
  corpsource =   "Sch. of Microelectron. Eng., Griffith Univ., Brisbane,
                 Qld., Australia",
  keywords =     "algorithms; bilevel images; Bilevel images; DOS
                 operating system; Hierarchical data structures;
                 hierarchical data structures; hypercube networks;
                 hypercube system; IBM compatible PCs; image processing;
                 Linear quadtrees; linear quadtrees; parallel
                 algorithms; Parallel Virtual Machine System; Parallel
                 Virtual Machine System Software; PVM implementation;
                 Quadtree building algorithms; quadtree building
                 algorithms; quadtrees; SIMD; SIMD hypercube system;
                 Single Instruction Multiple Data hypercube; Single
                 Instruction Multiple Data hypercube algorithms;
                 Software",
  sponsororg =   "Parallel Algorithms, Archit. and Software Eng. Res.
                 Lab.; IEEE; IEEE Comput. Soc.; ACM; Euromicro; IBM;
                 Instn. Eng. Australia; Inst. Radio and Electron. Eng.
                 Soc.; Australian Comput. Soc",
  thesaurus =    "Hypercube networks; Image processing; Parallel
                 algorithms; Quadtrees",
  treatment =    "P Practical",
}

@Article{Swanson:1995:PAP,
  author =       "Eric Swanson and Terry P. Lybrand",
  title =        "{PVM-AMBER}: a parallel implementation of the {AMBER}
                 molecular mechanics package for workstation clusters",
  journal =      j-J-COMPUT-CHEM,
  volume =       "16",
  number =       "9",
  pages =        "1131--1140",
  month =        sep,
  year =         "1995",
  CODEN =        "JCCHDD",
  DOI =          "https://doi.org/10.1002/jcc.540160907",
  ISSN =         "0192-8651 (print), 1096-987X (electronic)",
  ISSN-L =       "0192-8651",
  bibdate =      "Thu Nov 29 14:54:31 MST 2012",
  bibsource =    "http://www.interscience.wiley.com/jpages/0192-8651;
                 http://www.math.utah.edu/pub/tex/bib/jcomputchem1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Center for Bioeng., Washington Univ., Seattle, WA,
                 USA",
  classification = "A3620C (Macromolecular conformation (statistics and
                 dynamics)); A8710 (General, theoretical, and
                 mathematical biophysics); A8715D (Physical chemistry of
                 biomolecular solutions; condensed states); A8715H
                 (Biomolecular dynamics, molecular probes, molecular
                 pattern recognition); C6150N (Distributed systems
                 software); C6185 (Simulation techniques); C7320
                 (Physics and chemistry computing); C7330 (Biology and
                 medical computing)",
  corpsource =   "Center for Bioeng., Washington Univ., Seattle, WA,
                 USA",
  fjournal =     "Journal of Computational Chemistry",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1096-987X",
  keywords =     "AMBER molecular mechanics package parallel version;
                 biology computing; biomolecular simulation;
                 Biomolecular simulation problems; Computational
                 speedup; computational speedup; computations; Data
                 exchange; data exchange; digital simulation;
                 efficiency; Ethernet; FDDI; FDDI connections; free
                 energy; Free-energy perturbation computations;
                 free-energy perturbation computations; intermolecular
                 mechanics; Lipid bilayer systems; lipid bilayer
                 systems; lipid bilayers; local area; molecular
                 biophysics; molecular dynamics; Molecular dynamics
                 computations; molecular dynamics method; networks;
                 Nonbonded energies; nonbonded energies; Nonbonded
                 forces; nonbonded forces; Nonbonded pair list
                 generation; nonbonded pair list generation; packages;
                 parallel; Parallel efficiency; parallel processing;
                 peptide; perturbation theory; problems; Processor
                 synchronization; processor synchronization; Protein;
                 protein; proteins; PVM message-passing software;
                 PVM-AMBER; Silicon Graphics; software; solvated;
                 Solvated peptide; Test simulations; test simulations;
                 Unix; Unix workstations; Workstation clusters;
                 workstation clusters; workstations",
  onlinedate =   "7 Sep 2004",
  thesaurus =    "Biology computing; Digital simulation; FDDI; Free
                 energy; Intermolecular mechanics; Lipid bilayers; Local
                 area networks; Molecular biophysics; Molecular dynamics
                 method; Parallel processing; Perturbation theory;
                 Proteins; Software packages; Unix; Workstations",
  treatment =    "P Practical",
}

@InProceedings{Ten:1995:TPE,
  author =       "S. V. Ten and V. V. Savchenko and A. A. Pasko",
  title =        "Time performance evaluation of implicit surface
                 polygonization on distributed systems",
  crossref =     "Gray:1995:PCT",
  pages =        "183--193",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Distributed Parallel Process. Lab., Aizu Univ.,
                 Aizu-Wakamatsu City, Japan",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C4260 (Computational geometry); C6130B (Graphics
                 techniques); C6150N (Distributed systems software);
                 C7310 (Mathematics computing); C7400 (Engineering
                 computing)",
  keywords =     "CAD system; Complex surfaces; Distributed systems;
                 Functions; Implicit functions; Implicit surface
                 polygonization; Mathematics; Parallelization; Polygonal
                 approximation; PVM system; Rendering; Scalable
                 algorithm; Software algorithm; Solids; Time performance
                 evaluation; Toroidal architecture; Transputer network;
                 Visual analysis",
  thesaurus =    "CAD; Computational geometry; Data visualisation;
                 Engineering graphics; Functions; Mathematics computing;
                 Message passing; Parallel algorithms; Parallel
                 architectures; Rendering [computer graphics]; Software
                 performance evaluation; Transputer systems",
}

@InProceedings{Tsunekawa:1995:EIE,
  author =       "H. Tsunekawa",
  title =        "Effective implementation of {EDEM} workstation cluster
                 using {PVM}",
  crossref =     "Pahl:1995:CCB",
  pages =        "503--508",
  year =         "1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Uhl:1995:AWA,
  author =       "A. Uhl",
  title =        "Adapted wavelet analysis on moderate parallel
                 distributed memory {MIMD} architectures",
  crossref =     "Ferreira:1995:PAI",
  pages =        "275--283",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Res. Inst. for Softwaretechnol., Salzburg Univ.,
                 Austria",
  classification = "B0230 (Integral transforms); B0290Z (Other numerical
                 methods); B6120B (Codes); B6140C (Optical information,
                 image and video signal processing); C1130 (Integral
                 transforms); C4190 (Other numerical methods); C4240P
                 (Parallel programming and algorithm theory); C5220P
                 (Parallel architecture); C5260B (Computer vision and
                 image processing techniques); C6150N (Distributed
                 systems software)",
  keywords =     "Adapted wavelet analysis; Algorithm efficiency;
                 Decomposition; Image compression; Moderate parallel
                 distributed memory MIMD architectures; PVM; Subband
                 based parallelization; Wavelet packet best basis
                 selection; Workstation cluster",
  thesaurus =    "Distributed memory systems; Image coding; Parallel
                 algorithms; Parallel architectures; Wavelet
                 transforms",
}

@InProceedings{Uhl:1995:PCC,
  author =       "A. Uhl",
  title =        "Parallel Compact Coding of Satellite Images with
                 Wavelet Packets using {PVM}",
  crossref =     "Prasanna:1995:FIP",
  pages =        "382--387",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "B6120B (Codes); B6140C (Optical information, image
                 and video signal processing); C4190 (Other numerical
                 methods); C5260B (Computer vision and image processing
                 techniques); C5440 (Multiprocessing systems); C6110P
                 (Parallel programming); C6130 (Data handling
                 techniques); C7460 (Aerospace engineering computing)",
  corpsource =   "Res. Inst. for Softwaretechnol., Salzburg Univ.,
                 Austria",
  keywords =     "aerospace computing; artificial satellites;
                 compression; data; image; image coding; image
                 compression methods; parallel; parallel approach;
                 parallel compact coding; parallel machines; parallel
                 programming environment; processing; programming; PVM;
                 quality; satellite data; satellite images; wavelet
                 packet decomposition; wavelet packet decompositions;
                 wavelet packets; wavelet transforms",
  pubcountry =   "India",
  treatment =    "P Practical",
}

@InProceedings{Uhl:1995:VPW,
  author =       "A. Uhl",
  title =        "Vector and parallel wavelet transforms for the
                 analysis of time-varying signals",
  crossref =     "Bailey:1995:PSS",
  pages =        "9--14",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "RIST, Salzburg Univ., Austria",
  classification = "C1130 (Integral transforms); C4190 (Other numerical
                 methods); C4240P (Parallel programming and algorithm
                 theory); C5260 (Digital signal processing); C5440
                 (Multiprocessing systems); C5620L (Local area
                 networks); C6110P (Parallel programming); C6150N
                 (Distributed systems software)",
  keywords =     "Asynchronous task pool; Continuous wavelet transform;
                 Convex C3440 Vectorcomputer; Load balancing;
                 Master-slave programming scheme; Parallel wavelet
                 transforms; PVM; Speed-up; Time-varying signal
                 analysis; Timing; Vector wavelet transforms;
                 Workstation cluster",
  thesaurus =    "Local area networks; Parallel algorithms; Parallel
                 programming; Resource allocation; Signal processing;
                 Time-varying systems; Timing; Vector processor systems;
                 Wavelet transforms; Workstations",
}

@Article{Vaughan:1995:MPM,
  author =       "Paula L. Vaughan and Anthony Skjellum and Donna S.
                 Reese and Fei-Chen Cheng",
  title =        "Migrating from {PVM} to {MPI}, part {I}: The {Unify}
                 system",
  journal =      j-FRONTIERS-MASS-PAR-COMP-CONF-PROC,
  pages =        "488--495",
  month =        "????",
  year =         "1995",
  bibdate =      "Fri May 24 09:57:40 MDT 1996",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 95TH8024.",
  abstract =     "A new kind of portability system for modifying the PVM
                 message passing system to generate the Message Passing
                 Interface (MPI) standard notation for message passing
                 is described. The system, known as Unify, is designed
                 to reduce the effort of learning MPI while providing a
                 sensible means to make use of MPI libraries and MPI
                 calls. It also allows the immediate use of MPI-based
                 parallel libraries in applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mississippi State Univ",
  affiliationaddress = "Mississippi State, MS, USA",
  classification = "721.1; 722.3; 722.4; 723.2; 902.2; 921.6",
  conference =   "Proceedings of the 5th Symposium on the Frontiers of
                 Massively Parallel Computation",
  fjournal =     "Frontiers of Massively Parallel Computation ---
                 Conference Proceedings",
  journalabr =   "Front Massively Parallel Comput Conf Proc",
  keywords =     "Computational linguistics; Computer software
                 portability; Computer workstations; Data communication
                 systems; Data handling; Data structures; Database
                 systems; Interfaces (computer); Mathematical models;
                 Message passing; Message Passing Interface; Parallel
                 processing systems; Standards",
  meetingaddress = "McLean, VA, USA",
  meetingdate =  "Feb 6--9 1995",
  meetingdate2 = "02/06--09/95",
  sponsor =      "IEEE Computer Society",
}

@Article{Vincent:1995:HPP,
  author =       "James J. Vincent and Kenneth M. {Merz Jr.}",
  title =        "A highly portable parallel implementation of {AMBER4}
                 using the message passing interface standard",
  journal =      j-J-COMPUT-CHEM,
  volume =       "16",
  number =       "11",
  pages =        "1420--1427",
  month =        nov,
  year =         "1995",
  CODEN =        "JCCHDD",
  DOI =          "https://doi.org/10.1002/jcc.540161110",
  ISSN =         "0192-8651 (print), 1096-987X (electronic)",
  ISSN-L =       "0192-8651",
  bibdate =      "Thu Nov 29 14:54:32 MST 2012",
  bibsource =    "http://www.interscience.wiley.com/jpages/0192-8651;
                 http://www.math.utah.edu/pub/tex/bib/jcomputchem1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Chem., Pennsylvania State Univ., University
                 Park, PA, USA",
  classification = "A3620 (Macromolecules and polymer molecules); A6120J
                 (Computer simulation of static and dynamic liquid
                 behaviour); A8715 (Molecular biophysics); C5220P
                 (Parallel architecture); C7320 (Physics and chemistry
                 computing)",
  corpsource =   "Dept. of Chem., Pennsylvania State Univ., University
                 Park, PA, USA",
  fjournal =     "Journal of Computational Chemistry",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1096-987X",
  keywords =     "AMBER4; Cray C90; Cray T3D; free energy; free-energy
                 perturbation module Gibbs; Free-energy perturbation
                 module Gibbs; IBM SP1/SP2; lipid bilayer molecular
                 dynamics simulation; Lipid bilayer molecular dynamics
                 simulation; macromolecular modeling package;
                 Macromolecular modeling package; macromolecules;
                 message passing; message passing interface standard;
                 Message passing interface standard; MINMD; molecular
                 biophysics; molecular dynamics method; molecular
                 dynamics/minimization module; Molecular
                 dynamics/minimization module; networked workstations;
                 Networked workstations; perturbation theory; physics
                 computing; portable parallel implementation; Portable
                 parallel implementation",
  onlinedate =   "7 Sep 2004",
  thesaurus =    "Free energy; Macromolecules; Message passing;
                 Molecular biophysics; Molecular dynamics method;
                 Perturbation theory; Physics computing",
  treatment =    "T Theoretical or Mathematical",
}

@MastersThesis{Viswanathan:1995:PCM,
  author =       "Kishore Viswanathan",
  title =        "A parallel client-server model for distributed
                 computing",
  type =         "M.S. thesis",
  school =       "Department of Computer Science, " # inst-MSU,
  address =      inst-MSU:adr,
  pages =        "vii + 79",
  year =         "1995",
  bibdate =      "Mon Jan 15 16:53:06 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  alttitle =     "Distributed computing. Message Passing Interface Forum
                 (MPIF) MPI-Forum 1994",
  keywords =     "Client/server computing; Electronic data processing
                 --- Distributed processing; Mississippi State
                 University --- Thesis --- (1995); Parallel programming
                 (computer science)",
}

@InProceedings{Vlassov:1995:MEP,
  author =       "V. Vlassov and H. Ahmed and L.-E. Thorelli",
  title =        "{mEDA-2}: An Extension of {PVM}",
  crossref =     "Malyshkin:1995:PCT",
  pages =        "288--293",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Electrum 204, R. Inst. of Technol., Kista, Sweden",
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6150N (Distributed systems software)",
  corpsource =   "Electrum 204, R. Inst. of Technol., Kista, Sweden",
  keywords =     "communication; EDA model; environments; intertask;
                 Intertask communication; mEDA-2; MEDA-2; message
                 passing; Message passing; parallel program; Parallel
                 program termination; parallel programming; parallel
                 programs; Parallel programs; programming; programming
                 environments; Programming environments; PVM; shared
                 memory systems; synchronisation; synchronization;
                 Synchronization; termination; virtual shared memory;
                 Virtual shared memory; VSM",
  pubcountry =   "Germany",
  thesaurus =    "Message passing; Parallel programming; Programming
                 environments; Shared memory systems; Synchronisation",
  treatment =    "P Practical",
}

@InProceedings{Walker:1995:MVB,
  author =       "D. W. Walker",
  title =        "An {MPI} version of the {BLACS}",
  crossref =     "IEEE:1995:PSP",
  pages =        "129--146",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA",
  classification = "C4140 (Linear algebra); C6110B (Software engineering
                 techniques); C6150N (Distributed systems software)",
  conftitle =    "Proceedings Scalable Parallel Libraries Conference",
  corpsource =   "Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA",
  keywords =     "Basic Linear Communication Subprograms; BLACS;
                 blocking; Blocking; functionality; Functionality;
                 linear algebra; message passing; message passing
                 standard; Message passing standard; MPI; MPI
                 communication modes; MPI Linear Algebra Communication
                 Subprograms; nonblocking communication; Nonblocking
                 communication; software libraries; standards;
                 subroutines",
  sponsororg =   "Mississippi State Univ.; NSF",
  thesaurus =    "Linear algebra; Message passing; Software libraries;
                 Standards; Subroutines",
  treatment =    "P Practical",
}

@TechReport{Walker:1995:RBD,
  author =       "David W. Walker and Steve W. Otto",
  title =        "Redistribution of Block-Cyclic Data Distributions
                 Using {MPI}",
  number =       "ORNL/TM-12999",
  institution =  inst-ORNL,
  address =      inst-ORNL:adr,
  pages =        "iii + 20",
  month =        jun,
  year =         "1995",
  bibdate =      "Tue Jan 16 08:37:06 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.epm.ornl.gov/~walker/mpi/redistribution.ps.Z",
}

@InProceedings{Wang:1995:PPG,
  author =       "Cho-Li Wang and V. K. Prasanna and Young Won Lim",
  title =        "Parallelization of perceptual grouping on distributed
                 memory machines",
  crossref =     "Cantoni:1995:CCA",
  pages =        "323--330",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Electr. Eng. Syst., Univ. of Southern
                 California, Los Angeles, CA, USA",
  classification = "B6140C (Optical information, image and video signal
                 processing); C4240C (Computational complexity); C4240P
                 (Parallel programming and algorithm theory); C5220P
                 (Parallel architecture); C5260B (Computer vision and
                 image processing techniques); C5440 (Multiprocessing
                 systems); C6110P (Parallel programming)",
  keywords =     "16 Node Cray T3D; Architecture independent parallel
                 algorithms; CM-5; Communication startup time;
                 Communication time; Computation time; Distributed
                 memory machines; High performance computing platforms;
                 Line segment extraction; MPI message passing standard;
                 Perceptual grouping; Processing nodes; Transmission
                 rate",
  thesaurus =    "Communication complexity; Computational complexity;
                 Distributed memory systems; Edge detection; Feature
                 extraction; Message passing; Parallel algorithms",
}

@Article{Wasniowski:1995:NAP,
  author =       "R. A. Wasniowski",
  title =        "Nonlinear adaptive prediction algorithm and its
                 parallel implementation",
  journal =      j-INFORMATICA,
  volume =       "19",
  number =       "3",
  pages =        "371--377",
  month =        sep,
  year =         "1995",
  CODEN =        "INFOFF",
  ISSN =         "0350-5596",
  ISSN-L =       "0350-5596",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "New Mexico Highlands Univ., Las Vegas, NM, USA",
  classification = "C1220 (Simulation, modelling and identification);
                 C1240 (Adaptive system theory); C4240P (Parallel
                 programming and algorithm theory); C6110P (Parallel
                 programming); C6185 (Simulation techniques); C7400
                 (Engineering computing)",
  fjournal =     "Informatica (Ljubljana, Slovenia)",
  keywords =     "Computation times; Computationally-intensive
                 engineering problems; Cost/performance ratio; Group
                 method of data handling; Heterogeneous machines; Large
                 parallel programs; Massively parallel computers;
                 Nonlinear adaptive prediction algorithm; Parallel
                 algorithm development; Parallel simulators; PVM;
                 Software packages; Systems identification; Workstation
                 networks",
  pubcountry =   "Slovenia",
  thesaurus =    "Adaptive estimation; Digital simulation; Engineering
                 computing; Forecasting theory; Identification; Parallel
                 algorithms",
}

@TechReport{Werner:1995:UMP,
  author =       "J{\"o}rg Werner",
  title =        "{{\"U}berblick zum Message-Passing-Interface Standard,
                 MPI}. ({German}) [{Overview} of the {Message-Passing
                 Interface Standard, MPI}]",
  type =         "{Parlab-Mitteilungen}",
  number =       "04/95",
  institution =  "Technische Universit{\"a}t Chemnitz-Zwickau",
  address =      "Chemnitz, Germany",
  pages =        "35",
  year =         "1995",
  bibdate =      "Wed Aug 27 06:21:48 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  language =     "German",
}

@InProceedings{West:1995:AVV,
  author =       "J. E. West and M. M. Stephens and L. H. Turcotte",
  title =        "Adaptation of volume visualization techniques to
                 {MIMD} architectures using {MPI}",
  crossref =     "IEEE:1995:PSP",
  pages =        "147--156",
  year =         "1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "DoD High Performance Comput. Center, US Army Eng.
                 Waterways Exp. Station, Vicksburg, MS, USA",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5440 (Multiprocessing systems); C6130B (Graphics
                 techniques); C6150N (Distributed systems software);
                 C7300 (Natural sciences computing)",
  conftitle =    "Proceedings Scalable Parallel Libraries Conference",
  corpsource =   "DoD High Performance Comput. Center, US Army Eng.
                 Waterways Exp. Station, Vicksburg, MS, USA",
  keywords =     "data visualisation; distributed memory parallel
                 computers; Distributed memory parallel computers;
                 distributed memory systems; divide and conquer methods;
                 divide-and-conquer approach; Divide-and-conquer
                 approach; high resolution volume datasets; High
                 resolution volume datasets; interprocessor
                 communication; Interprocessor communication; message
                 passing; Message Passing Interface; MIMD architectures;
                 MPI; nCUBE 2; NCUBE 2; parallel algorithm; Parallel
                 algorithm; parallel algorithms; parallel architectures;
                 parallel implementation; Parallel implementation;
                 parallel machines; rendering (computer graphics); scene
                 generation; Scene generation; scientific analysis;
                 Scientific analysis; sequential algorithm; Sequential
                 algorithm; standards; volume rendering method; Volume
                 rendering method; volume visualization techniques;
                 Volume visualization techniques",
  sponsororg =   "Mississippi State Univ.; NSF",
  thesaurus =    "Data visualisation; Distributed memory systems; Divide
                 and conquer methods; Message passing; Parallel
                 algorithms; Parallel architectures; Parallel machines;
                 Rendering [computer graphics]; Standards",
  treatment =    "A Application; P Practical; T Theoretical or
                 Mathematical",
}

@Article{White:1995:PNP,
  author =       "S. White and A. {\AA}lund and V. S. Sunderam",
  title =        "Performance of the {NAS} Parallel Benchmarks on
                 {PVM-Based} Networks",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "26",
  number =       "1",
  pages =        "61--71",
  day =          "1",
  month =        apr,
  year =         "1995",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1995.1048",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:57 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1995.1048/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1995.1048/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C6100 (Software techniques and
                 systems); C7320 (Physics and chemistry computing)",
  corpsource =   "Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
                 GA, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "aerodynamics; computational; computing; concurrent;
                 Ethernet; FDDI networks; kernel benchmarks; NAS
                 parallel benchmarks; parallel processing; performance
                 evaluation; PVM system; PVM-based networks; software
                 performance evaluation",
  treatment =    "P Practical",
}

@InProceedings{Xu:1995:IPP,
  author =       "H. Xu and T. W. Fisher",
  title =        "Improving {PVM} Performance using {ATOMIC} User-Level
                 Protocol",
  crossref =     "Alnuweiri:1995:PHF",
  pages =        "108--117",
  year =         "1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Yonezawa:1995:IED,
  author =       "Naoki Yonezawa and Koichi Wada and Motoko Obata",
  title =        "Implementation and evaluation of distributed shared
                 data objects on a workstation cluster",
  crossref =     "IEEE:1995:IPR",
  pages =        "319--322",
  year =         "1995",
  bibdate =      "Fri May 24 09:58:00 MDT 1996",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 95CH35765.",
  abstract =     "We are developing a system called KaReN to handle
                 distributed shared data objects on workstations that
                 are connected by Ethernet. The system supplied users a
                 parallel programming environment with virtually shared
                 data objects. The KaReN was developed using the message
                 passing library PVM (Parallel Virtual Machine) to have
                 good portability. To reduce overhead in maintaining
                 data coherence, several methods are introduced. The
                 request merging is introduced to reduce message
                 traffic. The copy transfer messages are also clumped
                 when possible. The weak consistency is another
                 optimization for eliminating unnecessary coherence
                 control message by allowing temporally inconsistent
                 state. This paper presents the organization and the
                 implementation of KaReN. Several applications have been
                 executed for evaluation.",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of Tsukuba",
  affiliationaddress = "Ibaraki, Jpn",
  classification = "722.3; 722.4; 723.1; 723.2; 921.5; C5620L (Local
                 area networks); C6110P (Parallel programming); C6115
                 (Programming support); C6150N (Distributed systems
                 software)",
  conference =   "Proceedings of the 1995 IEEE Pacific RIM Conference on
                 Communications, Computers, and Signal Processing",
  journalabr =   "IEEE Pac RIM Conf Commun Comput Signal Process Proc",
  keywords =     "Coherence control message; Computer networks; Computer
                 software portability; Computer workstations; Copy
                 transfer messages; Data coherence; Data handling; Data
                 structures; Distributed computer systems; Distributed
                 shared data objects; Ethernet; KaReN; Message passing
                 library; Message traffic reduction; Object oriented
                 programming; Optimization; Parallel programming
                 environment; Parallel virtual machine; Parallel virtual
                 machine (PVM); Portability; Subroutines; Virtually
                 shared data objects; Weak consistency; Workstation
                 cluster",
  meetingaddress = "Victoria, BC, Can",
  meetingdate =  "May 17--19 1995",
  meetingdate2 = "05/17--19/95",
  sponsor =      "IEEE",
  thesaurus =    "Local area networks; Message passing; Network
                 operating systems; Parallel programming; Programming
                 environments; Software portability; Virtual machines;
                 Workstations",
}

@Article{Yong:1995:SOM,
  author =       "Dou Yong and Zhou Xingming",
  title =        "Super-Object model: implementing shared memory
                 programming mode on distributed memory multicomputers",
  journal =      j-CHIN-J-COMPUTERS,
  volume =       "18",
  number =       "7",
  pages =        "481--487",
  month =        jul,
  year =         "1995",
  CODEN =        "JIXUDT",
  ISSN =         "0254-4164",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Univ. of Defence Technol.,
                 Changsha, China",
  classification = "C5440 (Multiprocessing systems); C6110P (Parallel
                 programming)",
  fjournal =     "Chinese Journal of Computers = Chi suan chi hsueh
                 pao",
  keywords =     "Distributed memory multicomputers; Fortran 77; Global
                 address; Implementation; Message passing primitives;
                 Oak Ridge PVM; Performance; Prototype system; Run-time
                 system; Shared memory parallel programming; Shared
                 memory programming mode; Super-Object model; UNIX
                 operating system",
  language =     "Chinese",
  pubcountry =   "China",
  thesaurus =    "Distributed memory systems; Message passing; Parallel
                 programming",
}

@Article{You:1995:EIM,
  author = {J. You and E. Pissaloux and W. P. Zhu and H. A.
    Cohen},
  title = {Efficient image matching: a hierarchical {Chamfer}
    matching scheme via distributed system},
  journal = j-REAL-TIME-IMAGING,
  volume = {1},
  number = {4},
  pages = {245--259},
  month = oct,
  year = {1995},
  CODEN = {REIMFQ},
  ISSN = {1077-2014},
  ISSN-L = {1077-2014},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Sch. of Comput. and Inf. Sci., South Australia Univ.,
    SA, Australia},
  classification = {B6140C (Optical information, image and video signal
    processing); C5220P (Parallel architecture); C5260B
    (Computer vision and image processing techniques)},
  fjournal = {Real-Time Imaging},
  keywords = {Chamfer matching scheme; Distance transform;
    Distributed system; Dynamic thresholding; Edge points;
    Image matching; Parallel implementation; Parallel
    Virtual Machine; Pyramid},
  pubcountry = {UK},
  thesaurus = {Distributed processing; Image matching; Virtual
    machines},
}

@InProceedings{You:1995:PIM,
  author =       "J. You and W. P. Zhu and E. Pissaloux and H. A.
                 Cohen",
  title =        "Parallel image matching on a distributed system",
  crossref =     "Narashimhan:1995:IIF",
  pages =        "870--873 (vol. 2)",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. and Inf. Sci., Univ. of South
                 Australia, The Levels, SA, Australia",
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5260B (Computer vision and image processing
                 techniques); C6110P (Parallel programming)",
  keywords =     "Distance transform; Distributed memory multicomputer;
                 Distributed system; Heavily iterated computation; Image
                 feature extraction; Image feature pixels; Low cost
                 heterogeneous PVM network; Message-passing; Object
                 recognition; Parallel image matching; Parallel virtual
                 machine; Repeated memory access",
  thesaurus =    "Feature extraction; Image matching; Message passing;
                 Object recognition; Parallel algorithms",
}

@InProceedings{Zareski:1995:EPG,
  author = {D. Zareski and B. Wade and P. Hubbard and P. Shirley},
  title = {Efficient parallel global illumination using density
    estimation},
  crossref = {Uselton:1995:PRS},
  pages = {47--54, 104--105},
  year = {1995},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Program of Comput. Graphics, Cornell Univ., Ithaca,
    NY, USA},
  classification = {C4240P (Parallel programming and algorithm theory);
    C6110P (Parallel programming); C6130B (Graphics
    techniques); C6150N (Distributed systems software)},
  keywords = {Arbitrary nondiffuse surfaces; Density estimation;
    Diffuse inter-reflections; Efficient parallel global
    illumination; Energy transport; Gouraud-shaded
    elements; High geometric complexity environments;
    Interactive walk-throughs; Local area network; Master
    task; Meshing phase; Multicomputer parallel density
    estimation global illumination method; Multiple worker
    tasks; Parallel programs; Parallelization;
    Particle-tracing phase; PVM software package;
    Radiosity; Ray-traced images; Shared file system; Still
    frames; Workstations},
  thesaurus = {Brightness; Density; Lighting; Local area networks;
    Parallel algorithms; Parallel programming; Ray tracing;
    Realistic images; Rendering [computer graphics];
    Workstations},
}

@InProceedings{Zelek:1995:DPP,
  author = {J. S. Zelek},
  title = {Dynamic path planning},
  crossref = {IEEE:1995:IIC},
  pages = {1285--1290 (vol. 2)},
  year = {1995},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Dept. of Electr. Eng., McGill Univ., Montreal, Que.,
    Canada},
  classification = {C1230 (Artificial intelligence); C3390C (Mobile
    robots); C7420 (Control engineering computing)},
  keywords = {Dynamic path planning; Harmonic function; Message
    passing software package; Navigation; Nomad robot;
    Potential field; PVM; SPARC and SGI workstations},
  thesaurus = {Computerised control; Dynamics; Harmonics; Message
    passing; Mobile robots; Navigation; Path planning},
}

@InProceedings{Zhou:1995:FMP,
  author =       "Honbo Zhou and Al Geist",
  title =        "Faster Message Passing in {PVM}",
  crossref =     "Alnuweiri:1995:PHF",
  pages =        "67--73",
  year =         "1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Zhou:1995:RMR,
  author =       "Honbo Zhou and Al Geist",
  title =        "``Receiver Makes Right'' Data Conversion in {PVM}",
  crossref =     "IEEE:1995:CPI",
  pages =        "458--464",
  year =         "1995",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Using a Receiver Makes it Right (RMR) data conversion
                 technique in PVM significantly improves the
                 message-passing performance in heterogeneous
                 environments. The improvements are due to two factors:
                 (1). RMR reduces the need for conversions in a
                 heterogeneous environment; (2). At most each message is
                 converted, only once compared to twice for XDR used in
                 public version of PVM, and our conversion routines are
                 streamlined and are several times faster than the XDR
                 routines. The drawback to RMR is the potential need for
                 a large number of conversion routines. We demonstrate
                 that only a small number of routines are required
                 because many vendors use the IEEE standard for data
                 representation. Given this fact, RMR may emerge as a
                 promising technique in distributed computing.",
  acknowledgement = ack-nhfb,
  affiliation =  "Math. Sci. Sect., Oak Ridge Nat. Lab.",
  affiliationaddress = "Oak Ridge, TN, USA",
  classification = "722.1; 722.3; 722.4; 723.1; 723.2; C5440
                 (Multiprocessing systems); C6120 (File organisation);
                 C7430 (Computer engineering)",
  conference =   "Proceedings of the 1995 IEEE 14th Annual International
                 Phoenix Conference on Computers and Communications",
  corpsource =   "Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA",
  journalabr =   "Conf Proc Int Phoenix Conf on Comput Commun",
  keywords =     "Buffer storage; Computer software; Computer systems
                 programming; conversion; Conversion routines; Data
                 communication systems; data conversion; Data
                 processing; data structures; Decoding; distributed
                 computing; Distributed computing; Encoding (symbols);
                 heterogeneous environments; Heterogeneous environments;
                 Local area networks; machines; Message passing
                 performance; Data conversion; message-; Message-passing
                 performance; parallel machines; Parallel processing
                 systems; parallel virtual machine; Parallel virtual
                 machine; Parallel virtual machine (PVM); passing
                 performance; PVM; Receiver makes it right (RMR) data
                 conversion; routines; virtual",
  meetingaddress = "Scottsdale, AZ, USA",
  meetingdate =  "Mar 28--31 1995",
  meetingdate2 = "03/28--31/95",
  thesaurus =    "Data conversion; Data structures; Parallel machines;
                 Virtual machines",
  treatment =    "P Practical",
}

@Article{Zhu:1995:RTC,
  author = {Miaoliang Zhu and Chunming Wu and Youjun Zhang and Yi
    Jin and Jie Li},
  title = {A real-time and concurrent intelligent robotic system
    based on multi-agent architecture},
  journal = j-HIGH-TECH-LETT,
  volume = {5},
  number = {10},
  pages = {20--24},
  month = oct,
  year = {1995},
  CODEN = {GTONE8},
  ISSN = {1002-0470},
  bibdate = {Sun Dec 22 10:19:23 MST 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Artificial Intelligence Inst., Zhejiang Univ.,
    Hangzhou, China},
  classification = {C1340D (Discrete control systems); C3390C (Mobile
    robots); C4220 (Automata theory); C5220P (Parallel
    architecture); C6150N (Distributed systems software);
    C6170 (Expert systems); C7420 (Control engineering
    computing); C7430 (Computer engineering)},
  fjournal = {High Technology Letters},
  keywords = {Automata; Autonomous mobile robots; Concurrent
    intelligent robotic system; Discrete event-finite state
    transformation model; Intelligent architecture;
    Multi-agent architecture; Multi-computer coherence
    environment; Parallel virtual machine; Pipeline
    scheduler; PVM; Real-time Multi-Agent System; RMAS;
    ROBIX; Simulation},
  language = {Chinese},
  pubcountry = {China},
  thesaurus = {Cooperative systems; Discrete event systems; Finite
    automata; Intelligent control; Mobile robots; Parallel
    processing; Pipeline processing; Real-time systems;
    Scheduling; Virtual machines},
}

@InProceedings{Zhuang:1995:PRS,
  author = {Xinglai Zhuang and Jianping Zhu},
  title = {Parallelizing a reservoir simulator using {MPI}},
  crossref = {IEEE:1995:PSP},
  pages = {165--174},
  year = {1995},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {NSF Eng. Res. Center for Comput. Field Simulations,
    Mississippi State Univ., MS, USA},
  classification = {C5440 (Multiprocessing systems); C6150N (Distributed
    systems software); C7340 (Geophysics computing); C7490
    (Computing in other engineering fields)},
  conftitle = {Proceedings Scalable Parallel Libraries Conference},
  corpsource = {NSF Eng. Res. Center for Comput. Field Simulations,
    Mississippi State Univ., MS, USA},
  keywords = {customized communication library; Customized
    communication library; customized communication
    subroutines; Customized communication subroutines;
    digital simulation; geophysics computing; IBM SP1/SP2;
    Intel; Intel iPSC/860; message passing; Message Passing
    Interface; MPI; NX communication library; oil
    technology; parallel architecture; Parallel
    architecture; parallel code performance; Parallel code
    performance; parallel code portability; Parallel code
    portability; parallel computers; Parallel computers;
    parallel programming; performance; Performance;
    reservoir simulator; Reservoir simulator; scalability;
    Scalability; software libraries; standards;
    subroutines; workstation clusters; Workstation
    clusters},
  sponsororg = {Mississippi State Univ.; NSF},
  thesaurus = {Digital simulation; Geophysics computing; Message
    passing; Oil technology; Parallel programming; Software
    libraries; Standards; Subroutines},
  treatment = {A Application; P Practical},
}

@InProceedings{Alt:1996:PIA,
  author =       "R. Alt and J. L. Lamotte",
  title =        "Parallel integration across time of initial value
                 problems using {PVM}",
  crossref =     "Bode:1996:PVM",
  pages =        "323--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4130 (Interpolation and function approximation);
                 C4170 (Differential equations); C6150N (Distributed
                 systems software); C4240P (Parallel programming and
                 algorithm theory); C7310 (Mathematics computing)",
  corpsource =   "MASI and Institut Blaise Pascal, Paris, France",
  keywords =     "approximation theory; collocation; Connection Machine
                 CM5; differential; differential equations; distributed
                 architectures; divided differences; equation; initial
                 value; initial value problems; linear system;
                 mathematics computing; method; nonlinear system;
                 parallel; parallel algorithm; parallel algorithms;
                 parallel integration; parallel machines; Picard
                 iterations; polynomial approximation; problems; virtual
                 machine",
  pubcountry =   "Germany",
  treatment =    "T Theoretical or Mathematical",
}

@Article{Anglano:1996:PMB,
  author =       "C. Anglano and L. Portinale",
  title =        "Parallel Model-Based Diagnosis Using {PVM}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1156",
  pages =        "331--334",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6150N (Distributed
                 systems software); C1160 (Combinatorial mathematics);
                 C7440 (Civil and mechanical engineering computing)",
  corpsource =   "Dipartimento di Inf., Universit{\`a} di Torino, Italy",
  fjournal =     "Lecture Notes in Computer Science",
  keywords =     "automobiles; car fault diagnosis; computing; fault
                 diagnosis; identification; mechanical engineering;
                 message passing; methods; MIMD message passing program;
                 parallel; parallel backward reachability; parallel
                 machines; parallel model-based diagnosis; parallel
                 programs; parallel virtual machine; Petri net model;
                 Petri nets; programming; reachability analysis; space;
                 state; state-space; virtual machines",
  pubcountry =   "Germany",
  treatment =    "A Application; P Practical",
}

@Article{Anonymous:1996:BRMh,
  author =       "Anonymous",
  title =        "Book Review: {{\booktitle{MPI: the complete
                 reference}}: By Marc Snir, Steve Otto, Steven
                 Huss-Lederman, David Walker, and Jack Dongarra. MIT
                 Press, Cambridge, MA. (1996). 336 pages. \$27.50}",
  journal =      j-COMPUT-MATH-APPL,
  volume =       "31",
  number =       "11",
  pages =        "140--140",
  month =        jun,
  year =         "1996",
  CODEN =        "CMAPDK",
  ISSN =         "0898-1221 (print), 1873-7668 (electronic)",
  ISSN-L =       "0898-1221",
  bibdate =      "Wed Mar 1 21:48:23 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computmathappl1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/0898122196873494",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers and Mathematics with Applications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/08981221",
}

@Misc{Anonymous:1996:IPP,
  author = {Anonymous},
  title = {An Introduction to {PVM} Programming},
  howpublished = {World-Wide Web},
  year = {1996},
  bibdate = {Tue Jan 16 08:17:36 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.epm.ornl.gov/pvm/intro.html},
}

@Misc{Anonymous:1996:PPA,
  author = {Anonymous},
  title = {Porting {PVM} Applications to the {Intel Paragon}},
  howpublished = {World-Wide Web},
  year = {1996},
  bibdate = {Tue Jan 16 08:25:19 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.ccs.ornl.gov/news/guide/xps_pvm.html},
}

@Misc{Anonymous:1996:RP,
  author = {Anonymous},
  title = {Research Program},
  howpublished = {World-Wide Web},
  year = {1996},
  bibdate = {Tue Jan 16 08:26:39 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.epm.ornl.gov/networking/},
}

@InProceedings{Arbenz:1996:MDS,
  author = {P. Arbenz and M. Billeter and P. G{\"u}ntert and P.
    Luginb{\"u}hl and M. Taufer and U. {von Matt}},
  title = {Molecular dynamics simulations on {Cray} clusters
    using the {SCIDDLE-PVM} environment},
  crossref = {Bode:1996:PVM},
  pages = {142--??},
  year = {1996},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {A0260 (Numerical approximation and analysis); A0270
    (Computational techniques); A0320 (Classical mechanics
    of discrete systems: general mathematical aspects);
    A6120J (Computer simulation of static and dynamic
    liquid behaviour); A8715H (Biomolecular dynamics,
    molecular probes, molecular pattern recognition);
    C6110P (Parallel programming); C6150N (Distributed
    systems software); C7320 (Physics and chemistry
    computing); C7330 (Biology and medical computing)},
  corpsource = {Inst. of Sci. Comput., Swiss Federal Inst. of
    Technol., Zurich, Switzerland},
  keywords = {acids; asynchronous remote procedure calls; atom
    trajectory computation; biochemistry; biology
    computing; classical mechanics; client-server;
    communication; computer simulation; computing; Cray
    clusters; Cray computers; digital simulation;
    distributed algorithms; energy minimization;
    environment; minimisation; molecular biophysics;
    molecular dynamics method; molecular dynamics
    simulations; Newtonian equations of motion; nucleic;
    OPAL; paradigm; parallelization; physics; primitive;
    proteins; SCIDDLE-PVM; software package; virtual
    machines},
  pubcountry = {Germany},
  treatment = {P Practical},
}

@InProceedings{Arbenz:1996:SRP,
  author = {P. Arbenz and W. Gander and H. P. L{\"u}thi and U.
    {von Matt}},
  title = {{Sciddle} 4.0, or, remote procedure calls in {PVM}},
  crossref = {Liddell:1996:HPC},
  pages = {820--??},
  year = {1996},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6150N (Distributed systems software)},
  corpsource = {Inst. of Sci. Comput., Swiss Federal Inst. of
    Technol., Zurich, Switzerland},
  keywords = {client process; client-server systems; data transfers;
    explicit; large data sets; message passing; overhead;
    parallel processing; parallelism; processes; remote
    procedure calls; Sciddle 4.0; server; tree structure},
  pubcountry = {Germany},
  treatment = {P Practical},
}

@Article{Attiya:1996:ERS,
  author = {H. Attiya},
  title = {Efficient and Robust Sharing of Memory in
    Message-Passing Systems},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1151},
  pages = {56--??},
  month = {????},
  year = {1996},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Bachem:1996:STH,
  author =       "A. Bachem and W. Hochst{\"a}ttler and M. Malich",
  title =        "The Simulated Trading Heuristic for Solving Vehicle
                 Routing Problems",
  journal =      j-DISCRETE-APPL-MATH,
  volume =       "65",
  number =       "1--3",
  institution =  "Mathematisches Institut, Universit{\"a}t zu K{\"o}ln",
  address =      "Weyertal 86-90, 50931 K{\"o}ln, Germany",
  pages =        "47--72",
  month =        "????",
  year =         "1996",
  CODEN =        "DAMADU",
  ISSN =         "0166-218X (print), 1872-6771 (electronic)",
  ISSN-L =       "0166-218X",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 Techreports/ZPR.Koeln.bib",
  annote =       "We present an improvement heuristic for vehicle
                 routing problems. The heuristic finds complex customer
                 interchanges to improve an initial solution. Our
                 approach is modular, thus it is easily adjusted to
                 different side constraints such as time windows,
                 backhauls and a heterogeneous vehicle fleet. The
                 algorithm is well suited for parallelization. We report
                 on a parallel implementation of the Simulated Trading
                 heuristic on a cluster of workstations using PVM. The
                 computational results obtained with sequential and
                 parallel Simulated Trading show that our approach is
                 competitive compared to all heuristics known to the
                 authors by now.",
  crindex =      "120k,29,zpr93-139.ps.gz",
  fjournal =     "Discrete Applied Mathematics",
  xxnote =       "Check final page number??",
}

@Article{Bader:1996:PPA,
  author =       "David A. Bader and David R. Helman and Joseph
                 J{\'a}J{\'a}",
  title =        "Practical parallel algorithms for personalized
                 communication and integer sorting",
  journal =      j-ACM-J-EXP-ALGORITHMICS,
  volume =       "1",
  pages =        "3:1--3:??",
  month =        "????",
  year =         "1996",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/235141.235148",
  ISSN =         "1084-6654",
  bibdate =      "Mon Oct 6 16:01:58 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "A fundamental challenge for parallel computing is to
                 obtain high-level, architecture independent, algorithms
                 which efficiently execute on general-purpose parallel
                 machines. With the emergence of message passing
                 standards such as MPI, it has become easier to design
                 efficient and portable parallel algorithms by making
                 use of these communication primitives. While existing
                 primitives allow an assortment of collective
                 communication routines, they do not handle an important
                 communication event when most or all processors have
                 non-uniformly sized personalized messages to exchange
                 with each other. We focus in this paper on the
                 h-relation personalized communication whose efficient
                 implementation will allow high performance
                 implementations of a large class of algorithms. While
                 most previous h-relation algorithms use randomization,
                 this paper presents a new deterministic approach for
                 h-relation personalized communication with
                 asymptotically optimal complexity for $ h > p^2 $. As
                 an application, we present an efficient algorithm for
                 stable integer sorting. The algorithms presented in
                 this paper have been coded in Split-C and run on a
                 variety of platforms, including the Thinking Machines
                 CM-5, IBM SP-1 and SP-2, Cray Research T3D, Meiko
                 Scientific CS-2, and the Intel Paragon. Our
                 experimental results are consistent with the
                 theoretical analysis and illustrate the scalability and
                 efficiency of our algorithms across different
                 platforms. In fact, they seem to outperform all similar
                 algorithms known to the authors on these platforms.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Journal of Experimental Algorithmics",
}

@InProceedings{Barak:1996:PPM,
  author =       "A. Barak and A. Braverman and I. Gilderman and O.
                 Laden",
  title =        "Performance of {PVM} with the {MOSIX} preemptive
                 process migration scheme",
  crossref =     "IEEE:1996:PSI",
  pages =        "38--45",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5620L (Local area networks); C6110P (Parallel
                 programming); C6115 (Programming support); C6150J
                 (Operating systems); C6150N (Distributed systems
                 software)",
  corpsource =   "Inst. of Comput. Sci., Hebrew Univ., Jerusalem,
                 Israel",
  keywords =     "allocation; assignments; communication bound
                 benchmarks; computing; CPU benchmarks; high
                 performance; idle workstation use; load-balancing;
                 local area networks; MOSIX multicomputer operating
                 system; MOSIX preemptive process migration scheme;
                 multi-tasking applications; multiprogramming; network
                 operating; operating systems (computers); parallel
                 algorithms; parallel computing; parallel programming;
                 process migration; process migration algorithms;
                 programming environments; PVM performance; resource;
                 software performance evaluation; static process
                 assignment; system utilization; systems; task;
                 transparent; UNIX; Unix; workstation networks;
                 workstations",
  sponsororg =   "IEEE Comput. Soc., Israel Chapter",
  treatment =    "P Practical",
}

@InProceedings{Beguelin:1996:TMD,
  author = {A. Beguelin and V. Sunderam},
  title = {Tools for monitoring, debugging, and programming in
    {PVM}},
  crossref = {Bode:1996:PVM},
  pages = {7--13},
  year = {1996},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110P (Parallel programming); C6115 (Programming
    support); C6140D (High level languages); C6150G
    (Diagnostic, testing, debugging and evaluating
    systems)},
  corpsource = {Carnegie Mellon Univ., Pittsburgh, PA, USA},
  keywords = {authoring languages; buffered tracing; data
    visualisation; debugging tools; graphical console; Java
    language; JavaPVM; JPVM; languages; object-oriented;
    ParaGraph visualization tool; parallel programming;
    Parallel Virtual Machine; PGPVM; PIOUS; program
    debugging; program monitoring tools; program tracing;
    programming; PVaniM; PVM; PVMRPC; remote procedure
    style; sampling; software; system monitoring; TCL;
    techniques; tkPVM; tools; virtual machines; XPVM},
  pubcountry = {Germany},
  treatment = {P Practical},
}

@Article{Bernaschi:1996:RHP,
  author = {Massimo Bernaschi},
  title = {The requirements of a high performance implementation
    of {PVM}},
  journal = j-FUT-GEN-COMP-SYS,
  volume = {12},
  number = {1},
  pages = {3--11},
  month = may,
  year = {1996},
  CODEN = {FGSEVI},
  ISSN = {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L = {0167-739X},
  bibdate = {Fri Jul 15 09:06:07 MDT 2005},
  bibsource = {ftp://ftp.ira.uka.de/bibliography/Parallel/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.sciencedirect.com/science/journal/0167739X},
  acknowledgement = ack-nhfb,
  classification = {C5220P (Parallel architecture); C5440
    (Multiprocessing systems); C5470 (Performance
    evaluation and testing); C6150N (Distributed systems
    software); C7430 (Computer engineering)},
  corpsource = {IBM Eur. Center for Sci. and Eng. Comput., Rome,
    Italy},
  fjournal = {Future Generation Computer Systems},
  journal-URL = {http://www.sciencedirect.com/science/journal/0167739X},
  keywords = {AIX operating system; de facto standard; distributed
    computing; high performance implementation; IBM
    parallel; machine; message; message passing; parallel
    machines; parallel virtual; passing; performance
    evaluation; POWER 2 architecture; programming
    interface; PVM; PVMe; run-time; SP2; system; system
    support; virtual machines},
  pubcountry = {Netherlands},
  remark = {Resource Management in Distributed Systems},
  treatment = {P Practical},
}

@InProceedings{Bhandarkar:1996:MPM,
  author = {M. A. Bhandarkar and L. V. Kale},
  title = {{MICE}: a prototype {MPI} implementation in {Converse}
    environment},
  crossref = {IEEE:1996:PSM},
  pages = {26--31},
  year = {1996},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110P (Parallel programming); C6115 (Programming
    support); C6150E (General utility programs); C6150N
    (Distributed systems software)},
  conftitle = {Proceedings. Second MPI Developer's Conference},
  corpsource = {Dept. of Comput. Sci., Illinois Univ., Urbana, IL,
    USA},
  keywords = {Abstract Device Interface; application program
    interfaces; communication; computations; Converse
    interoperable parallel programming environment; message
    managers; message passing; MICE; MPI modules; MPICH;
    multi-threaded MPI programs; open systems; parallel
    programming; programming environments; prototype MPI
    implementation; public-domain MPI implementation; PVM
    interoperation; thread objects; utility programs},
  sponsororg = {IEEE Comput. Soc. Tech. Committee on Distributed
    Process},
  treatment = {P Practical},
}

@InProceedings{Blaszczyk:1996:EPI,
  author =       "A. Blaszczyk and C. Trinitis",
  title =        "Experience with {PVM} in an industrial environment",
  crossref =     "Bode:1996:PVM",
  pages =        "174--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "B5110 (Electrostatics); B8300 (Power apparatus and
                 electric machines); C6110P (Parallel programming);
                 C6150N (Distributed systems software); C6155 (Computer
                 communications software); C7410 (Electrical engineering
                 computing); C7430 (Computer engineering)",
  corpsource =   "Asea Brown Boveri AG, Heidelberg, Germany",
  keywords =     "3D; Asea Brown Boveri; CAD; cluster; code; computer
                 communications software; configuration; ease of use;
                 efficiency; electric fields; electrical engineering
                 computing; heterogeneous workstation clusters;
                 high-voltage engineering; high-voltage equipment;
                 industrial environment; multiprocessor machines;
                 parallel code; parallel programming; Parallel Virtual
                 Machine; parallelization; PVM communication software;
                 reliability; simulation; virtual machines",
  pubcountry =   "Germany",
  treatment =    "A Application",
}

@InProceedings{Blum:1996:PIP,
  author =       "J. M. Blum and T. M. Warschko and W. F. Tichy",
  title =        "{PSPVM}: implementing {PVM} on a high-speed
                 interconnect for workstation clusters",
  crossref =     "Bode:1996:PVM",
  pages =        "235--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C5620L (Local area
                 networks); C6150N (Distributed systems software); C6180
                 (User interfaces)",
  corpsource =   "Dept. of Inf., Karlsruhe Univ., Germany",
  keywords =     "25 mus; application speed-up; code compatibility;
                 exchange; latency; local area networks; message;
                 message passing; message transmission; multiprocessing;
                 object-; parallel machines; ParaStation high-speed
                 interconnect; ParaStation user interface; PSPVM; PVM
                 package; systems; throughput; user interfaces; user
                 level communication; user-level socket emulation;
                 workstation clusters; workstations",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@InProceedings{Bonnet:1996:UPW,
  author =       "C. Bonnet",
  title =        "Using {PVM} in wireless network environments",
  crossref =     "Bode:1996:PVM",
  pages =        "296--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "B6210L (Computer communications); C5470 (Performance
                 evaluation and testing); C5620L (Local area networks);
                 C5670 (Network performance)",
  corpsource =   "Inst. Eurecom, Sophia Antipolis, France",
  keywords =     "message passing model; networked environments;
                 parallel machines; parallel virtual machine;
                 performance evaluation; PVM; virtual machines; wireless
                 LAN; wireless local area network; WLAN",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@InProceedings{Bouchard:1996:FCS,
  author =       "V. Bouchard and P. Cinquin and L. Desbat",
  title =        "First {Compton} scatter correction in {SPECT} using
                 {PVM}",
  crossref =     "Grangeat:1996:PTI",
  pages =        "109--111",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A8710 (General, theoretical, and mathematical
                 biophysics); A8760K (Nuclear medicine, emission
                 tomography); A8770E (Patient diagnostic methods and
                 instrumentation); B6140C (Optical information, image
                 and video signal processing); B7510B (Radiation and
                 radioactivity applications in biomedicine); C4240P
                 (Parallel programming and algorithm theory); C5260B
                 (Computer vision and image processing techniques);
                 C7330 (Biology and medical computing)",
  corpsource =   "Fac. de Med., TIMC-IMAG, La Tronche, France",
  keywords =     "3D algorithms; Compton effect; computed tomography;
                 diagnostic imaging; first Compton interaction;
                 gamma-ray scattering; Klein-Nishina formula; medical;
                 medical image; modeling; nuclear medicine; parallel
                 algorithms; parallel virtual machine; physical;
                 processing; registered scanner reconstruction; single
                 photon emission; SPECT Compton scatter correction",
  pubcountry =   "France",
  treatment =    "T Theoretical or Mathematical",
}

@InProceedings{Brightwell:1996:DIM,
  author =       "R. Brightwell and L. Shuler",
  title =        "Design and implementation of {MPI} on {Puma} portals",
  crossref =     "IEEE:1996:PSM",
  pages =        "18--25",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C6110B (Software engineering
                 techniques); C6150E (General utility programs); C6150J
                 (Operating systems); C6150N (Distributed systems
                 software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Massively Parallel Comput. Res. Lab., Sandia Nat.
                 Labs., Albuquerque, NM, USA",
  keywords =     "application program interfaces; Argonne National
                 Laboratory/Mississippi State University Message Passing
                 Interface standard implementation; high performance
                 message passing environment; Intel Paragon; Intel
                 TeraFLOPS machine; massively parallel computers;
                 message passing; MPI; MPI collective communication; MPI
                 point-to-point communications; MPI-2 one-sided
                 communications; network operating systems; operating
                 systems (computers); parallel architectures; parallel
                 machines; Puma operating system; Puma portals; software
                 portability; SUNMOS; utility programs",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Bubak:1996:MPP,
  author =       "M. Bubak and W. Funika and J. Moscinski",
  title =        "Monitoring of performance of {PVM} applications on
                 virtual network computer",
  crossref =     "Wasniewski:1996:APC",
  pages =        "147--156",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C6110P (Parallel
                 programming); C6150G (Diagnostic, testing, debugging
                 and evaluating systems); C6150N (Distributed systems
                 software)",
  corpsource =   "Inst. of Comput. Sci., AGH, Krakow, Poland",
  keywords =     "computer; data visualisation; metaformat; metrics;
                 monitoring; Pablo-based tool; parallel machines;
                 parallel programming; parallel programs; performance
                 monitoring; PVM applications; SDDF; software
                 performance evaluation; software tools; system;
                 Tape/PVM; toolkit; virtual machines; virtual network;
                 visualization",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@InProceedings{Bubak:1996:PBP,
  author =       "M. Bubak and W. Funika and J. Moscinski and D. Tasak",
  title =        "Pablo-based performance monitoring tool for {PVM}
                 applications",
  crossref =     "Dongarra:1996:APC",
  pages =        "69--78",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6150G (Diagnostic,
                 testing, debugging and evaluating systems)",
  corpsource =   "Inst. of Comput. Sci., AGH, Krakow, Poland",
  keywords =     "3-D molecular dynamics program; conjugate gradient
                 benchmark; Pablo environment; ParaGraph functions;
                 parallel programming; performance monitoring; PVM
                 applications; software performance evaluation; system
                 monitoring; trace file; XPVM",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@InProceedings{Bubak:1996:PPM,
  author =       "M. Bubak and W. Funika and J. Moscinski and D. Tasak",
  title =        "{Pablo-Based} Performance Monitoring Tool for {PVM}
                 Applications",
  crossref =     "Dongarra:1996:APC",
  pages =        "69--78",
  year =         "1996",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Cavenaghi:1996:UPS,
  author =       "M. A. Cavenaghi and R. Spolon and J. E. M.
                 Perea-Martins and S. G. Domingues and A. {Garcia
                 Neto}",
  title =        "Using {PVM} in the simulation of a hybrid dataflow
                 architecture",
  crossref =     "Bode:1996:PVM",
  pages =        "343--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5220P (Parallel architecture); C5440 (Multiprocessing
                 systems); C6185 (Simulation techniques); C7430
                 (Computer engineering)",
  corpsource =   "Dept. of Comput. Sci., Sao Paulo State Univ., Brazil",
  keywords =     "data flow computing; digital simulation; hybrid
                 dataflow architecture; interconnection network;
                 machines; message passing; message passing environment;
                 multiplexing; multiprocessor system; optical; optical
                 interconnections; parallel architectures; parallel
                 execution; sequential execution; simulator; system;
                 uniprocessor; virtual; wavelength division; wavelength
                 division multiplexing; WDM techniques",
  pubcountry =   "Germany",
  treatment =    "A Application; P Practical",
}

@Article{Charny:1996:MPV,
  author =       "B. Charny",
  title =        "Matrix partitioning on a virtual shared memory
                 parallel machine",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "7",
  number =       "4",
  pages =        "343--355",
  month =        apr,
  year =         "1996",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/71.494629",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4140 (Linear algebra); C4240P (Parallel programming
                 and algorithm theory); C5220P (Parallel architecture)",
  corpsource =   "Audre Inc., San Diego, CA, USA",
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
  keywords =     "contention-free partitionings; load-balanced;
                 machines; matrix; matrix decomposition; memory
                 contention; parallel; parallel algorithms; parallel
                 machine; partitioning; performance issues; shared
                 memory systems; virtual shared memory; virtual
                 storage",
  treatment =    "T Theoretical or Mathematical",
}

@Article{Chengqing:1996:WIP,
  author =       "Ye Chengqing and Cui Zhenqian",
  title =        "The ways of improving parallel computing efficiency in
                 {PVM}",
  journal =      j-MINI-MICRO-SYSTEMS,
  volume =       "17",
  number =       "4",
  pages =        "12--16",
  month =        apr,
  year =         "1996",
  CODEN =        "XWJXEH",
  ISSN =         "1000-1220",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C6185 (Simulation
                 techniques); C7430 (Computer engineering); C6150J
                 (Operating systems)",
  corpsource =   "State Key Lab. of CAD/CG, Zhejiang Univ., Hangzhou,
                 China",
  fjournal =     "Mini-Micro Systems",
  keywords =     "balancing algorithm; communication overhead; digital
                 simulation; distributed system environment; hosts;
                 load; local area network; message exchange; network
                 partitioning; parallel; parallel computing efficiency;
                 parallel machines; PVM; resource allocation; strategy;
                 virtual machine; virtual machines",
  language =     "Chinese",
  pubcountry =   "China",
  treatment =    "P Practical",
}

@Article{Ciampolini:1996:EPM,
  author =       "A. Ciampolini and C. Stefanelli",
  title =        "Extending {PVM} to a massively parallel architecture",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "12",
  number =       "1",
  pages =        "13--23",
  month =        may,
  year =         "1996",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Fri Jul 15 09:06:07 MDT 2005",
  bibsource =    "ftp://ftp.ira.uka.de/bibliography/Parallel/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X",
  acknowledgement = ack-nhfb,
  classification = "C1250 (Pattern recognition); C5220P (Parallel
                 architecture); C5260B (Computer vision and image
                 processing techniques); C5440 (Multiprocessing
                 systems); C6115 (Programming support); C7430 (Computer
                 engineering)",
  corpsource =   "Dipartimento di Elettronica, Inf. e Sistemistica,
                 Bologna Univ., Italy",
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
  keywords =     "applications; architecture; computational vision
                 application; computer vision; fine-grained parallel;
                 heterogeneous computing; machines; massively parallel
                 architecture; Meiko Computing Surface; multicomputer;
                 parallel; parallel architectures; parallel machines;
                 programming environment; programming environments;
                 transputer technology; Unix workstations; virtual
                 machines",
  pubcountry =   "Netherlands",
  remark =       "Resource Management in Distributed Systems",
  treatment =    "A Application; P Practical",
}

@InProceedings{Clematis:1996:CEP,
  author =       "A. Clematis and V. Gianuzzi",
  title =        "{CPVM} --- extending {PVM} for consistent
                 checkpointing",
  crossref =     "IEEE:1996:PFE",
  pages =        "67--76",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6150N (Distributed systems software)",
  corpsource =   "Istituto per la Matematica Applicata, CNR, Genova,
                 Italy",
  keywords =     "concurrency control; consistent checkpointing; CPVM;
                 deadlocks; fault-tolerance; global checkpoint-restart
                 algorithms; job-swapping; migration; nonblocking;
                 parallel programming; Parallel Virtual Machine; PVM;
                 software; software fault; software libraries; software
                 library; software portability; software tools;
                 termination; tolerance",
  treatment =    "P Practical",
}

@InProceedings{Clemencon:1996:THM,
  author =       "C. Clemencon and K. M. Decker and V. R. Deshpande and
                 A. Endo and J. Fritscher and P. A. R. Lorenzo and N.
                 Masuda and A. Muller and R. Ruhl and W. Sawyer and B.
                 J. N. Wylie and F. Zimmermann",
  title =        "Tools-supported {HPF} and {MPI} parallelization of the
                 {NAS} parallel benchmarks",
  crossref =     "IEEE:1996:FSS",
  pages =        "309--318",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6140D (High level languages); C6150C
                 (Compilers, interpreters and other processors); C6150G
                 (Diagnostic, testing, debugging and evaluating
                 systems)",
  conftitle =    "Proceedings of 6th Symposium on the Frontiers of
                 Massively Parallel Computation (Frontiers '96)",
  corpsource =   "Centro Svizzero di Calcolo Sci., Manno, Switzerland",
  keywords =     "Annai tool; code development time; communication
                 libraries; compilers; distributed memory systems;
                 FORTRAN; High Performance Fortran; high-level language;
                 message passing; Message Passing Interface; NAS
                 parallel benchmarks; NEC Cenju-3 distributed-memory
                 parallel processor; parallel benchmark kernels;
                 parallel languages; parallel programming; performance;
                 portable parallel applications; program compilers;
                 program debugging; scalability; scientific
                 applications; sequential languages; software libraries;
                 software performance evaluation; software tools",
  sponsororg =   "IEEE Comput. Soc.; NASA Goddard Space Flight Center;
                 URSA/CESDIS",
  treatment =    "P Practical",
}

@InProceedings{Clement:1996:NPM,
  author =       "Mark J. Clement and Michael R. Steed and Phyllis E.
                 Crandall",
  title =        "Network Performance Modeling for {PVM} Clusters",
  crossref =     "ACM:1996:SCP",
  pages =        "??--??",
  year =         "1996",
  bibdate =      "Mon Mar 23 12:31:18 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.supercomp.org/sc96/proceedings/SC96PROC/CLEMENT/INDEX.HTM",
  acknowledgement = ack-nhfb,
}

@Article{Conforti:1996:PIA,
  author =       "D. Conforti and L. {de Luca} and L. Grandinetti and R.
                 Musmanno",
  title =        "A parallel implementation of automatic differentiation
                 for partially separable functions using {PVM}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "22",
  number =       "5",
  pages =        "643--656",
  day =          "8",
  month =        aug,
  year =         "1996",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Aug 6 10:14:59 MDT 1999",
  bibsource =    "Compendex database;
                 http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1996&volume=22&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1996&volume=22&issue=5&aid=1065",
  acknowledgement = ack-nhfb,
  classification = "B0290M (Numerical integration and differentiation);
                 C4160 (Numerical integration and differentiation);
                 C6110P (Parallel programming); C6150N (Distributed
                 systems software)",
  corpsource =   "Dipartimento di Elettronica, Inf. e Sistemistica,
                 Calabria Univ., Italy",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  keywords =     "automatic differentiation; differentiation;
                 distributed memory; finite-difference approximation;
                 multiprocessor system; parallel algorithms; parallel
                 implementation; partially separable functions; PVM;
                 substantial speed-up",
  pubcountry =   "Netherlands",
  treatment =    "T Theoretical or Mathematical",
}

@InProceedings{Corbett:1996:OMP,
  author =       "P. Corbett and D. Feitelson and S. Fineberg and Yarsun
                 Hsu and B. Nitzberg and J.-P. Prost and M. Snir and B.
                 Traversat and Parkson Wong",
  title =        "Overview of the {MPI-IO} parallel {I/O} interface",
  crossref =     "Jain:1996:IOP",
  pages =        "127--146",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6120 (File
                 organisation); C6150N (Distributed systems software);
                 C6180 (User interfaces)",
  corpsource =   "IBM Thomas J. Watson Res. Center, Yorktown Heights,
                 NY, USA",
  keywords =     "asynchronous I/O operations; collective interface;
                 data structures; file data partitioning; global data
                 structures; high-level interface; message passing;
                 MPI-IO parallel I/O interface; parallel file systems;
                 parallel machine; parallel programming; portable
                 message passing parallel programs; process memories;
                 storage devices; user interfaces",
  treatment =    "A Application; P Practical",
}

@InProceedings{Cotronis:1996:ECP,
  author =       "J. Y. Cotronis and E. Floros and N. Papazis",
  title =        "Efficient composition of {PVM} programs",
  crossref =     "Liddell:1996:HPC",
  pages =        "919--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
                 C6110P (Parallel programming)",
  corpsource =   "Dept. of Inf., Athens Univ., Greece",
  keywords =     "communication; Distribution of Maximum; parallel
                 programming; process algebra; PVM; PVM programs;
                 terminal process; topologies; tree process
                 communication",
  pubcountry =   "Germany",
  treatment =    "T Theoretical or Mathematical",
}

@InProceedings{Coulaud:1996:EIP,
  author =       "O. Coulaud and E. Dillon",
  title =        "Early implementation of {Para++} with {MPI-2}",
  crossref =     "IEEE:1996:PSM",
  pages =        "95--101",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6150E (General utility programs); C6150N
                 (Distributed systems software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Inst. Nat. de Recherche en Inf. et Autom.,
                 Villers-les-Nancy, France",
  keywords =     "application program interfaces; C language; dynamic
                 process chapter; dynamic process management; early
                 implementation; inter-communicator operations; internal
                 implementation; LAM 6.0; message passing; Message
                 Passing Interface; MPI-2; Para++ 2.0; PVM; software
                 libraries",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Dantas:1996:ILB,
  author =       "M. A. R. Dantas and E. J. Zaluska",
  title =        "Improving load balancing in an {MPI} environment with
                 resource management",
  crossref =     "Liddell:1996:HPC",
  pages =        "959--960",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5620L (Local area networks); C6110P (Parallel
                 programming); C6115 (Programming support); C6150N
                 (Distributed systems software)",
  conftitle =    "High-Performance Computing and Networking.
                 International Conference and Exhibition HPCN Europe
                 1996",
  corpsource =   "Dept. of Electron. and Comput. Sci., Southampton
                 Univ., UK",
  keywords =     "load balancing; local area networks; message passing;
                 Message Passing Interface; MPI environment; parallel
                 programming; process migration; programming
                 environments; resource allocation; resource management
                 facility; workstation clusters; workstations",
  treatment =    "P Practical",
}

@InProceedings{Demaine:1996:FCC,
  author =       "E. Demaine",
  title =        "First class communication in {MPI}",
  crossref =     "IEEE:1996:PSM",
  pages =        "189--194",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6140D (High level languages); C6150N
                 (Distributed systems software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Dept. of Comput. Sci., Waterloo Univ., Ont., Canada",
  keywords =     "application program interfaces; C; channel creation;
                 communication events; Concurrent ML;
                 concurrent-programming languages; dynamic process
                 creation; Fortran; higher-order concurrency; message
                 passing; Message Passing Interface; message-passing;
                 MPI; Occam; parallel languages; parallel programming;
                 run-time; software libraries; software standards;
                 standard; Standard ML; static model",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Deshpande:1996:MIBa,
  author =       "V. Deshpande and W. Sawyer and D. W. Walker",
  title =        "An {MPI} implementation of the {BLACS}",
  crossref =     "IEEE:1996:PSM",
  pages =        "195--198",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4140 (Linear algebra); C5220P (Parallel
                 architecture); C6110B (Software engineering
                 techniques); C6115 (Programming support); C6150E
                 (General utility programs); C6150N (Distributed systems
                 software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Swiss Center for Sci. Comput., Manno, Switzerland",
  keywords =     "application program interfaces; Basic Linear Algebra
                 Communication Subprograms; BLACS; libraries; matrix
                 algebra; message passing; MPI BLACS implementation; MPI
                 functionality; MPI libraries; parallel architectures;
                 performance; software libraries; software performance
                 evaluation; utility programs",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Deshpande:1996:MIBb,
  author =       "V. Deshpande and W. Sawyer",
  title =        "An {MPI} implementation of the {BLACS}",
  crossref =     "IEEE:1996:ICH",
  pages =        "463--468",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4140 (Linear algebra); C5440 (Multiprocessing
                 systems); C6150N (Distributed systems software); C7310
                 (Mathematics computing)",
  conftitle =    "Proceedings of 3rd International Conference on High
                 Performance Computing (HiPC)",
  corpsource =   "Software Technol. Group, Swiss Center for Sci.
                 Comput., Manno, Switzerland",
  keywords =     "Basic Linear Algebra Communication Subprograms;
                 benchmark; BLACS; factorization; linear algebra;
                 mathematics computing; message passing; Message Passing
                 Interface; MPI implementation; parallel architectures;
                 performance; ScaLAPACK library; software libraries;
                 software packages; software performance evaluation",
  sponsororg =   "IEEE Comput. Soc.; IEEE Comput. Soc. Tech. Committee
                 on Parallel Process.; ACM SIGARCH",
  treatment =    "P Practical",
}

@InProceedings{Dinda:1996:PIA,
  author =       "P. A. Dinda and D. R. O'Hallaron",
  title =        "The performance impact of address relation caching",
  crossref =     "Szymanski:1996:LCR",
  pages =        "213--226",
  year =         "1996",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  classification = "C6110P (Parallel programming); C6150N (Distributed
                 systems software)",
  keywords =     "Address computation; Address relation caching; Cache;
                 Critical path; Data transfer; Deposit model
                 communication; Distributed programming; End-to-end
                 latency; Fine grain analytic model; Memory bandwidth;
                 Message passing; Parallel programming; Performance
                 impact",
  thesaurus =    "Cache storage; Distributed processing; Message
                 passing; Parallel programming",
}

@InProceedings{DiNucci:1996:CDS,
  author =       "D. C. DiNucci",
  title =        "Cooperative Data Sharing: a layered approach to an
                 architecture-independent {Message-Passing Interface}",
  crossref =     "IEEE:1996:PSM",
  pages =        "58--65",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5620L (Local area networks); C6150E (General
                 utility programs); C6150J (Operating systems); C6150N
                 (Distributed systems software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "NASA Ames Res. Center, Moffett Field, CA, USA",
  keywords =     "application development; application program
                 interfaces; architecture-independent message-passing
                 interface; CDS1; CDS2; communication semantics;
                 contiguous data; Cooperative Data Sharing System; local
                 area networks; low-level portable interface; message
                 passing; Message Passing Kernel project; MPI; network
                 operating systems; one-sided communication; operating
                 system kernels; queues; semantics; SGI Power Challenge
                 Array; Solaris; Sun workstation network; utility
                 programs; workstations",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@Article{Djordjevic:1996:ICI,
  author =       "G. L. Djordjevic and M. K. Stojcev",
  title =        "An interprocessor communication interface for message
                 passing via shared memory modules --- design and
                 performances",
  journal =      j-COMP-ART-INTELL,
  volume =       "15",
  number =       "1",
  pages =        "1--34",
  month =        "????",
  year =         "1996",
  CODEN =        "CARIDY",
  ISSN =         "0232-0274",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5150 (Other circuits for digital computers); C5250
                 (Microcomputer techniques); C5430 (Microcomputers);
                 C5440 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C5610S (System buses)",
  corpsource =   "Fac. of Electron. Eng., Nish, Yugoslavia",
  fjournal =     "Computers and Artificial Intelligence =
                 Vychislitel'nye mashiny i iskusstvennyi intellekt",
  keywords =     "communication bandwidth; communication module;
                 communication throughput; configuration flexibility;
                 data transfer; fully connected n-side pyramid;
                 heterogeneous processors; host computer accelerator;
                 interprocessor communication interface; local memory;
                 message latency; message passing; microcomputers;
                 multi-microcomputer system; multiprocessor
                 interconnection networks; performance evaluation;
                 shared memory bus; shared memory modules; shared memory
                 systems; simulation; single board computers; storage
                 management chips; system buses; system efficiency;
                 system operation; system topology; two-side accessible
                 memory chips",
  treatment =    "P Practical",
}

@Article{Dong:1996:SPL,
  author = {Li Dong and Li Xiaoming and Fang Binxing},
  title = {The study on the parallel library based on {MPI}},
  journal = j-MINI-MICRO-SYSTEMS,
  volume = {17},
  number = {12},
  pages = {17--19},
  year = {1996},
  CODEN = {XWJXEH},
  ISSN = {1000-1220},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110P (Parallel programming); C6115 (Programming
    support)},
  corpsource = {Harbin Inst. of Technol., China},
  fjournal = {Mini-Micro Systems},
  keywords = {MPI; parallel library; parallel programming; parallel
    programming environments; software libraries; workstation network},
  language = {Chinese},
  treatment = {P Practical},
}

@Article{Dongarra:1996:MPS,
  author = {Jack J. Dongarra and Steve W. Otto and Marc Snir and David
    Walker},
  title = {A message passing standard for {MPP} and workstations},
  journal = j-CACM,
  volume = {39},
  number = {7},
  pages = {84--90},
  month = jul,
  year = {1996},
  CODEN = {CACMA2},
  ISSN = {0001-0782 (print), 1557-7317 (electronic)},
  ISSN-L = {0001-0782},
  bibdate = {Mon Aug 26 07:42:43 MDT 1996},
  bibsource = {Compendex database; http://www.acm.org/pubs/toc/;
    http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.acm.org/pubs/toc/Abstracts/cacm/234000.html},
  abstract = {The Message Passing Interface (MPI) is a portable
    message-passing standard that facilitates development of parallel
    applications and libraries. MPI has been developed over a 12-month
    period in 1993 to 1994 of intensive meetings involving more than 80
    people from approximately 40 organizations, mainly from the U.S. and
    Europe. Programming in MPI is straightforward and similar to
    programming with other message-passing interfaces.},
  acknowledgement = ack-nhfb,
  affiliation = {Univ of Tennessee},
  affiliationaddress = {Knoxville, TN, USA},
  classification = {716.1; 722.2; 722.3; 722.4; 723.1; 902.2},
  fjournal = {Communications of the ACM},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J79},
  journalabr = {Commun ACM},
  keywords = {algorithms; Application programming interface; C
    (programming language); Communication library routines; Computer
    networks; Computer software; Computer systems programming; Computer
    workstations; Concurrency control; Concurrent programs; Data
    communication systems; design; FORTRAN (programming language);
    Interfaces (computer); languages; Massively parallel processing;
    Message passing interface; Message passing programs; Message passing
    standard; Networks of workstations; Parallel processing systems;
    Point to point communications; Program compilers; standardization;
    Standards; Subroutines},
  subject = {{\bf D.4.4}: Software, OPERATING SYSTEMS, Communications
    Management, Message sending. {\bf D.2.7}: Software, SOFTWARE
    ENGINEERING, Distribution and Maintenance, Portability. {\bf D.2.0}:
    Software, SOFTWARE ENGINEERING, General, Standards. {\bf D.2.2}:
    Software, SOFTWARE ENGINEERING, Tools and Techniques, Software
    libraries. {\bf D.3.2}: Software, PROGRAMMING LANGUAGES, Language
    Classifications, Concurrent, distributed, and parallel languages.
    {\bf D.1.3}: Software, PROGRAMMING TECHNIQUES, Concurrent
    Programming, Parallel programming.},
}

@InProceedings{Dongarra:1996:SRP,
  author = {J. J. Dongarra and T. Hey and E. Strohmaier},
  title = {Selected results from the {PARKBENCH} benchmark},
  crossref = {Bouge:1996:EPP},
  volume = {2},
  pages = {251--254},
  year = {1996},
  bibdate = {Sat Mar 22 15:39:54 MST 1997},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classcodes = {C5220P (Parallel architecture); C5470 (Performance
    evaluation and testing); C6150G (Diagnostic, testing, debugging and
    evaluating systems)},
  conflocation = {Lyon, France; 26-29 Aug. 1996},
  conftitle = {Proceedings of European Conference on Parallel Processing
    EURO-PAR '96},
  corpsource = {Dept. of Comput. Sci., Tennessee Univ., Knoxville, TN,
    USA},
  keywords = {computer testing; evaluation; hierarchical; MPI; parallel
    architectures; parallel benchmarks; PARKBENCH benchmark;
    performance; PVM; suite},
  treatment = {P Practical},
}

@InProceedings{Ebner:1996:TFP,
  author =       "R. Ebner and A. Pfaffinger",
  title =        "Transformation of functional programs into data flow
                 graphs implemented with {PVM}",
  crossref =     "Bode:1996:PVM",
  pages =        "251--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4190 (Other numerical methods); C4210L (Formal
                 languages and computational linguistics); C5620L (Local
                 area networks); C6110P (Parallel programming); C6115
                 (Programming support); C6120 (File organisation);
                 C6140D (High level languages); C6150C (Compilers,
                 interpreters and other processors); C6150N (Distributed
                 systems software)",
  corpsource =   "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  keywords =     "algorithms; automatic coarse-grain program; C
                 procedure generation; communication; compiler;
                 compilers; computational linguistics; data flow; data
                 flow graphs; data structures; distributed tree-like
                 data structures; dynamic data; FASAN; FASAN schedulers;
                 function node evaluation; functional; functional
                 language; functional program transformation; functional
                 programming; inherent parallelism; languages; local
                 area networks; maximal; numerical analysis; parallel
                 programming; parallelising; parallelization; processor
                 scheduling; PVM library; recursive numerical;
                 semantics; software libraries; stream flow semantics;
                 structure; tree; workstation clusters; workstations;
                 wrapper streams",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@InProceedings{Fabero:1996:DLB,
  author = {J. C. Fabero and I. Martin and A. Bautista and S. Molina},
  title = {Dynamic load balancing in a heterogeneous environment under
    {PVM}},
  crossref = {IEEE:1996:PFE},
  pages = {414--419},
  year = {1996},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C4240P (Parallel programming and algorithm theory);
    C6110P (Parallel programming); C6115 (Programming support); C6150N
    (Distributed systems software)},
  corpsource = {Dept. de Inf. y Autom., Univ. Complutense de Madrid,
    Spain},
  keywords = {allocation; computational load; computational
    requirements; computer aided software engineering; dynamic load
    balancing; heterogeneous environment; heterogeneous workstations
    net; parallel algorithms; processor scheduling; programming
    environments; resource; virtual storage},
  treatment = {P Practical},
}

@Article{Fagg:1996:PIP,
  author = {Graham Fagg and Jack Dongarra},
  title = {{PVMPI}: An Integration of {PVM} and {MPI} Systems},
  journal = {Calculateurs Parall{\`e}les},
  volume = {8},
  number = {2},
  pages = {151--166},
  year = {1996},
  CODEN = {????},
  ISSN = {1260-3198},
  bibdate = {Tue Feb 26 10:10:44 2002},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.netlib.org/utk/papers/pvmpi/paper.html;
    http://www.netlib.org/utk/papers/pvmpi/pvmpi.ps;
    http://www.netlib.org/utk/people/JackDongarra/pdf/pvmpi.pdf},
  acknowledgement = ack-nhfb,
}

@InProceedings{Fagg:1996:TGR,
  author = {G. E. Fagg and K. S. London and J. J. Dongarra},
  title = {Taskers and general resource managers: {PVM} supporting {DCE}
    process management},
  crossref = {Bode:1996:PVM},
  pages = {180--??},
  year = {1996},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6150E (General utility programs); C6150N
    (Distributed systems software); C7430 (Computer engineering)},
  corpsource = {Dept. of Comput. Sci., Tennessee Univ., Knoxville, TN,
    USA},
  keywords = {allocation schemes; application program interfaces; DCE
    process; distributed algorithms; distributed computing environments;
    dynamic meta-computing environments; general resource managers;
    management; Message; message passing; MPI; MPIRUN systems;
    operations; Parallel Virtual Machine; Passing Interface; processor
    scheduling; PVM 3.4 release; PVM internal; PVMPI project; resource
    allocation; schedulers; standardised plug-in; taskers;
    user-controlled flexibility; virtual machines},
  pubcountry = {Germany},
  treatment = {P Practical},
}

@InProceedings{Fang:1996:SPP,
  author = {N. Fang and H. Burkhart},
  title = {Structured parallel programming using {MPI}},
  crossref = {Liddell:1996:HPC},
  pages = {840--847},
  year = {1996},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110P (Parallel programming); C6115 (Programming
    support)},
  conftitle = {High-Performance Computing and Networking. International
    Conference and Exhibition HPCN Europe 1996},
  corpsource = {Dept. of Inf., Basel Univ., Switzerland},
  keywords = {higher abstractions; higher-level functions; message
    passing; message passing interface; message-passing programs;
    parallel programming; portability; programmer-oriented abstractions;
    programming environment; programming environments; structured
    parallel programming; system-oriented level},
  treatment = {P Practical},
}

@InProceedings{Fineberg:1996:PPI,
  author =       "S. A. Fineberg and P. Wong and B. Nitzberg and C.
                 Kuszmaul",
  title =        "{PMPIO} --- a portable implementation of {MPI-IO}",
  crossref =     "IEEE:1996:FSS",
  pages =        "188--195",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C6150J (Operating systems);
                 C6150N (Distributed systems software)",
  conftitle =    "Proceedings of 6th Symposium on the Frontiers of
                 Massively Parallel Computation (Frontiers '96)",
  corpsource =   "Numerical Aerodynamic Simulation, NASA Ames Res.
                 Center, Moffett Field, CA, USA",
  keywords =     "Cray J90; IBM SP-2; input-output programs; Intel
                 Paragon; message passing; message passing interface;
                 MPI-IO; parallel programming; PMPIO; portable I/O
                 interface; portable implementation; portable parallel
                 Input/Output interface; portable parallel programming;
                 SGI; software engineering; software portability; Sun
                 shared memory workstations",
  sponsororg =   "IEEE Comput. Soc.; NASA Goddard Space Flight Center;
                 URSA/CESDIS",
  treatment =    "P Practical",
}

@InProceedings{Foster:1996:CDT,
  author =       "I. T. Foster and D. R. {Kohr, Jr.} and R. Krishnaiyer
                 and A. Choudhary",
  title =        "Communicating data-parallel tasks: an {MPI} library
                 for {HPF}",
  crossref =     "IEEE:1996:ICH",
  pages =        "433--438",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6140D (High level
                 languages)",
  conftitle =    "Proceedings of 3rd International Conference on High
                 Performance Computing (HiPC)",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  keywords =     "data-parallel tasks; FORTRAN; High Performance
                 Fortran; HPF; HPF compiler; MPI library; multiblock
                 application; multidisciplinary simulations; parallel
                 programming; performance; pipeline computations;
                 software performance evaluation; synthetic
                 communication benchmark; task parallelism",
  sponsororg =   "IEEE Comput. Soc.; IEEE Comput. Soc. Tech. Committee
                 on Parallel Process.; ACM SIGARCH",
  treatment =    "P Practical",
}

@InProceedings{Foster:1996:DSB,
  author = {Ian Foster and David R. {Kohr, Jr.} and Rakesh Krishnaiyer
    and Alok Choudhary},
  title = {Double Standards: Bringing Task Parallelism to {HPF} Via the
    Message Passing Interface},
  crossref = {ACM:1996:SCP},
  pages = {??--??},
  year = {1996},
  bibdate = {Mon Mar 23 12:31:18 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.supercomp.org/sc96/proceedings/SC96PROC/FOSTER2/INDEX.HTM},
  acknowledgement = ack-nhfb,
}

@InProceedings{Foster:1996:GCM,
  author =       "I. Foster and C. Kesselman and M. Snir",
  title =        "Generalized communicators in the {Message Passing
                 Interface}",
  crossref =     "IEEE:1996:PSM",
  pages =        "42--49",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6110P
                 (Parallel programming); C6150E (General utility
                 programs); C6150N (Distributed systems software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  keywords =     "application program interfaces; collective
                 communication operations; dynamic endpoint creation;
                 dynamically created threads; endpoint transfer;
                 generalized communicator construct; generalized MPI
                 communicator concept; message passing; Message Passing
                 Interface; multiple communication endpoints; multiple
                 threads; object-oriented programming; object-oriented
                 applications; parallel programming; utility programs",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Foster:1996:MCL,
  author =       "I. T. Foster and D. R. {Kohr, Jr.} and R.
                 Krishnaiyer",
  title =        "{MPI} as a coordination layer for communicating {HPF}
                 tasks",
  crossref =     "IEEE:1996:PSM",
  pages =        "68--78",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C6115 (Programming support);
                 C6140D (High level languages); C6150E (General utility
                 programs); C6150N (Distributed systems software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  keywords =     "application kernel; application program interfaces;
                 binding; communication interface semantics;
                 communications microbenchmark; coordination library
                 calls; data parallelism; data-parallel languages;
                 distributed array; execution model; explicit message
                 passing; FORTRAN; High Performance Fortran task
                 communication; high-level operations; libraries;
                 library; message passing; Message Passing Interface;
                 MPI coordination layer; parallel languages; parallel
                 program development; parallel programming; performance
                 evaluation; prototype HPF/MPI library; sequential
                 languages; software libraries; software performance
                 evaluation; task parallelism; utility programs",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Foster:1996:MIW,
  author =       "I. Foster and J. Geisler and S. Tuecke",
  title =        "{MPI} on the {I-WAY}: a wide-area, multimethod
                 implementation of the {Message Passing Interface}",
  crossref =     "IEEE:1996:PSM",
  pages =        "10--17",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5620W (Other computer networks); C6110B (Software
                 engineering techniques); C6115 (Programming support);
                 C6130S (Data security); C6150E (General utility
                 programs); C6150N (Distributed systems software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Argonne Nat. Lab., IL, USA",
  keywords =     "application program interfaces; authentication;
                 automatic configuration mechanisms; communication
                 mechanisms; geographically distributed computing
                 resources; geographically distributed database
                 resources; geographically distributed graphics
                 resources; geographically distributed networking;
                 heterogeneous systems; high-speed wide-area networks;
                 I-WAY distributed-computing experiment; message
                 authentication; message passing; Message Passing
                 Interface; MPICH; Nexus multithreaded runtime system;
                 parallel programming; portable high-performance
                 programming model; process creation; programming
                 environments; software environment; software libraries;
                 utility programs; wide area networks",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Geist:1996:APP,
  author = {G. A. Geist},
  title = {Advanced programming in {PVM}},
  crossref = {Bode:1996:PVM},
  pages = {1--??},
  year = {1996},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110P (Parallel programming); C6115 (Programming
    support); C6150G (Diagnostic, testing, debugging and evaluating
    systems); C6150N (Distributed systems software)},
  corpsource = {Oak Ridge Nat. Lab., TN, USA},
  keywords = {advanced programming; application performance;
    applications; CUMULVS; distributed computing applications; fault
    tolerance; interactive; JavaPVM; message passing; parallel
    computing; parallel programming; Parallel Virtual Machine;
    performance evaluation; plug-ins; program debugging; PVM; software;
    software fault tolerance; software packages; TkPVM; virtual
    machines},
  pubcountry = {Germany},
  treatment = {P Practical},
}

@InProceedings{Geist:1996:MEM,
  author = {A. Geist and W. Gropp and S. Huss-Lederman and A. Lumsdaine
    and E. Lusk and W. Saphir and T. Skjellum and M. Snir},
  title = {{MPI-2}: extending the {Message-Passing Interface}},
  crossref = {Bouge:1996:EPP},
  pages = {128--135},
  year = {1996},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5220P (Parallel architecture); C5610 (Computer
    interfaces)},
  conftitle = {Proceedings of European Conference on Parallel Processing
    EURO-PAR '96},
  corpsource = {Oak Ridge Nat. Lab., TN, USA},
  keywords = {collective operations; computer interfaces; dynamic
    process management; extensions; external interfaces; language
    binding; message passing; Message Passing Interface; MPI; MPI-2;
    MPI-2 document; one-sided operations; real-time computing;
    standards},
  treatment = {P Practical},
}

@TechReport{Geist:1996:VDP,
  author = {G. A. Geist and James Kohn and Philip Papadopoulos},
  title = {Visualization, Debugging, and Performance in {PVM}},
  institution = inst-ORNL,
  address = inst-ORNL:adr,
  pages = {11},
  year = {1996},
  bibdate = {Tue Jan 16 08:22:10 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.epm.ornl.gov/~geist/CapeCod.ps},
}

@Article{Gennart:1996:CAG,
  author = {B. A. Gennart and J. {Tarraga Gimenez} and R. D. Hersch},
  title = {Computer-Assisted Generation of {PVM\slash C++} Programs
    Using {CAP}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1156},
  pages = {259--269},
  month = {????},
  year = {1996},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110F (Formal methods); C6110P (Parallel
    programming); C6115 (Programming support); C6140D (High level
    languages); C6150N (Distributed systems software)},
  corpsource = {Ecole Polytech. Federale de Lausanne, Switzerland},
  fjournal = {Lecture Notes in Computer Science},
  keywords = {algorithm parallelization; algorithms; automatic
    programming; C language; C++; CAP; communication library;
    computation description; Computer-Aided Parallelization;
    computer-assisted; computer-assisted C++ program generation; data
    transfer requirements; formal specification; language extension;
    machine; message exchange; message passing; MPMD program;
    object-oriented languages; ordering; parallel; parallel program
    writing; parallel programming; performance; processors; PVM program
    generation; sequential code; sequential operation; sequential
    operations; specification; synchronisation; synchronization; thread
    execution; thread mapping; threads},
  pubcountry = {Germany},
  treatment = {P Practical},
}

@InProceedings{Ghosh:1996:ELM,
  author = {K. Ghosh and S. Breit},
  title = {Evaluating the Limits of Message Passing via the Shared
    Attraction Memory on {CC-COMA} Machines: Experiences with {TCGMSG}
    and {PVM}},
  crossref = {ACM:1996:FCP},
  pages = {173--180},
  year = {1996},
  bibdate = {Wed Mar 18 12:33:18 MST 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  annote = {Also known as ICS'96. Held as part of the Federated
    computing research conference (FCRC'96)},
  keywords = {ACM; architecture; computer; FCRC; ICS; SIGARCH;
    supercomputing},
}

@InProceedings{Gold:1996:UAL,
  author =       "C. Gold and T. Schnekenburger",
  title =        "Using the {ALDY} load distribution system for {PVM}
                 applications",
  crossref =     "Bode:1996:PVM",
  pages =        "278--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6150N (Distributed systems software)",
  corpsource =   "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  keywords =     "ALDY adaptive load distribution system; ALDY function
                 library; libraries; load distribution strategies;
                 parallel application programming; parallel
                 applications; parallel program processes; parallel
                 programming; PVM applications; resource allocation;
                 software",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@InProceedings{Govindan:1996:OMP,
  author = {V. Govindan and Y. Park and X. Li and S. Crear and O.
    Johnson},
  title = {An overview of a {MPI} profiling environment for the {NEC
    Cenju-3}},
  crossref = {IEEE:1996:PSM},
  pages = {185--188},
  year = {1996},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6115 (Programming support); C6120 (File
    organisation); C6150G (Diagnostic, testing, debugging and evaluating
    systems); C6150J (Operating systems); C6150N (Distributed systems
    software)},
  conftitle = {Proceedings. Second MPI Developer's Conference},
  corpsource = {High Performance Comput. Center, Houston Univ., TX, USA},
  keywords = {application program interface; application program
    interfaces; data visualisation; dynamic trace buffer management;
    message passing; Message Passing Interface; MPI applications; MPI
    profiling environment; MPP research prototype; NEC Cenju-3; NSF
    Grand Challenge Application Group; operating system; operating
    systems (computers); parallel machines; program diagnostics;
    software libraries; storage management; user-driven visualization;
    virtual memory; virtual storage; visualization tool},
  sponsororg = {IEEE Comput. Soc. Tech. Committee on Distributed Process},
  treatment = {P Practical},
}

@Article{Gropp:1996:HPM,
  author = {W. Gropp and E. Lusk},
  title = {A high-performance {MPI} implementation on a shared-memory
    vector supercomputer},
  journal = j-PARALLEL-COMPUTING,
  volume = {22},
  number = {11},
  pages = {1513--??},
  month = {????},
  year = {1996},
  CODEN = {PACOEJ},
  ISSN = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L = {0167-8191},
  bibdate = {Wed Mar 18 12:33:29 MST 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Parallel Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Gropp:1996:HPP,
  author = {William Gropp and Ewing Lusk and Nathan Doss and Anthony
      Skjellum},
  title = {High-performance, portable implementation of the {MPI}
      {Message Passing Interface Standard}},
  journal = j-PARALLEL-COMPUTING,
  volume = {22},
  number = {6},
  pages = {789--828},
  day = {20},
  month = sep,
  year = {1996},
  CODEN = {PACOEJ},
  ISSN = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L = {0167-8191},
  bibdate = {Fri Aug 6 10:15:01 MDT 1999},
  bibsource = {Compendex database;
      http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1996&volume=22&issue=6;
      http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1996&volume=22&issue=6&aid=1075},
  acknowledgement = ack-nhfb,
  affiliation = {Argonne Natl Lab},
  affiliationaddress = {Argonne, IL, USA},
  classification = {722.2; 722.4; 723; 723.1; 723.2; 902.2;
      C6110B (Software engineering techniques);
      C6110P (Parallel programming);
      C6115 (Programming support);
      C6150N (Distributed systems software)},
  corpsource = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL, USA},
  fjournal = {Parallel Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
  journalabr = {Parallel Comput},
  keywords = {applications; Computer programming;
      Computer software portability; Data communication systems;
      design goal; distribution; environments; free;
      future developments; high-performance portable implementation;
      Interfaces (computer); library writers; message passing;
      Message passing interface; MPI message; MPI-2; MPICH;
      parallel computer vendors; Parallel processing systems;
      parallel programming; Parallel programming environment;
      passing interface standard;
      portable parallel programming environment; programming;
      project management; software libraries;
      software performance evaluation; software portability;
      software standards; software tools; specialists; specification;
      standard library; Standards},
  treatment = {P Practical},
}

@InProceedings{Hachler:1996:IAC,
  author =       "G. H{\"a}chler and H. Burkhart",
  title =        "Implementing the {ALWAN} communication and data
                 distribution library using {PVM}",
  crossref =     "Bode:1996:PVM",
  pages =        "243--250",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C6115 (Programming support);
                 C6140D (High level languages); C6150N (Distributed
                 systems software)",
  corpsource =   "Dept. of Inf., Basel Univ., Switzerland",
  keywords =     "ALWAN communication and data distribution; code
                 generation; CRAY T3D; environment; IBM SP2; INTEL
                 PARAGON; language programming; library; measurements;
                 message passing; mixed-; parallel application
                 programmability; parallel coordination language;
                 parallel languages; parallel programming; performance;
                 performance evaluation; performance portability;
                 programming environments; PVM; reusability; software;
                 software component reuse; software libraries; software
                 portability",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@Article{Haechler:1996:IAC,
  author =       "G. Haechler and H. Burkhart",
  title =        "Implementing the {ALWAN} Communication and Data
                 Distribution Library Using {PVM}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1156",
  pages =        "243--250",
  month =        "????",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Heckathorn:1996:SSP,
  author =       "H. Heckathorn and B. Popp and W. Smith and D. Conklin
                 and D. A. Newman and F. Wieland",
  title =        "{SSGM}: from serial to parallel processing using
                 {PVM}",
  journal =      j-PROC-SPIE,
  volume =       "2741",
  pages =        "267--277",
  month =        "????",
  year =         "1996",
  CODEN =        "PSISDG",
  ISSN =         "0277-786X (print), 1996-756X (electronic)",
  ISSN-L =       "0277-786X",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C3240K (Image sensors); C6185 (Simulation techniques);
                 C3360L (Aerospace control); C3375 (Military control
                 systems); C4260 (Computational geometry); C5220P
                 (Parallel architecture); C5260B (Computer vision and
                 image processing techniques); C5440 (Multiprocessing
                 systems); C6130B (Graphics techniques); C6150N
                 (Distributed systems software); C6160S (Spatial and
                 pictorial databases); C7460 (Aerospace engineering
                 computing)",
  conflocation = "Orlando, FL, USA; 9-11 April 1996",
  conftitle =    "Technologies for Synthetic Environments:
                 Hardware-in-the-Loop Testing",
  corpsource =   "Div. of Space Sci., Naval Res. Lab., Washington, DC,
                 USA",
  fjournal =     "Proceedings of the SPIE --- The International Society
                 for Optical Engineering",
  keywords =     "aerospace computing; aerospace simulation;
                 computational; computational speed requirements; data
                 visualisation; databases; geometry; guidance;
                 hardware-in-; heterogeneous computers; high-fidelity
                 real-time distributed simulation; high-fidelity scene
                 generation; image; infrared imaging; IR sensor testing;
                 latency; message; message passing system; military
                 computing; military systems; missile; missile defence
                 simulation; model; optical tracking; optimistic;
                 optimistic computing; parallel; parallel machines;
                 parallel processing; parallel virtual machine
                 programming environment; passing; physics-based
                 distributed simulation; physics-based phenomenology
                 models; problems; processing; programming environments;
                 protocols; radar; radar imaging; realistic images;
                 rendering (computer graphics); signatures;
                 surveillance; synchronization; synergistic; synthetic
                 scene generation; target RCS; target tracking;
                 technologies; the-loop simulation; tracking; virtual
                 machines; visual; visualisation",
  sponsororg =   "SPIE",
  treatment =    "P Practical",
}

@InProceedings{Hempel:1996:APT,
  author = {R. Hempel and F. Zimmermann},
  title = {On the automatic {PARMACS-to-MPI} transformation in
      application programs},
  crossref = {Liddell:1996:HPC},
  pages = {1033--1034},
  year = {1996},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5610 (Computer interfaces);
      C6150E (General utility programs);
      C6150N (Distributed systems software);
      C6155 (Computer communications software)},
  conftitle = {High-Performance Computing and Networking. International
      Conference and Exhibition HPCN Europe 1996},
  corpsource = {German Nat. Res. Center for Inf. Technol., St. Augustin,
      Germany},
  keywords = {application program; application program interfaces;
      computer interfaces; message passing; message passing interface;
      PARMACS; translation tool},
  treatment = {P Practical},
}

@InProceedings{Hempel:1996:SMM,
  author = {R. Hempel},
  title = {The status of the {MPI} message-passing standard and its
      relation to {PVM}},
  crossref = {Bode:1996:PVM},
  pages = {14--21},
  year = {1996},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110B (Software engineering techniques);
      C6110P (Parallel programming);
      C6150N (Distributed systems software)},
  conftitle = {Parallel Virtual Machine --- EuroPVM '96. Third European
      PVM Conference. Proceedings},
  corpsource = {Computations and Commun. Res. Labs., NEC Europe Ltd.,
      Sankt Augustin, Germany},
  keywords = {application program interfaces; de-facto standard; domain;
      HPFF; Interface Forum; message passing; Message-Passing;
      Message-Passing Interface Forum; MPI message-passing standard;
      MPI-1; MPI-2; parallel; parallel computing; parallel programming;
      Parallel Virtual Machine; PARMACS; portability interfaces;
      programming; public; public domain; PVM; software packages;
      software portability; software standards; virtual machines},
  pubcountry = {Germany},
  treatment = {P Practical},
}

@InProceedings{Hong:1996:RDM,
  author = {Chul-Eui Hong and Bum-Sik Lee and Gi-Won On and Dong-Hae
      Chi},
  title = {Replay for debugging {MPI} parallel programs},
  crossref = {IEEE:1996:PSM},
  pages = {156--160},
  year = {1996},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110P (Parallel programming);
      C6150G (Diagnostic, testing, debugging and evaluating systems);
      C6150N (Distributed systems software)},
  conftitle = {Proceedings. Second MPI Developer's Conference},
  corpsource = {Comput. Div., Electron. and Telecommun. Res. Inst.,
      Taejeon, South Korea},
  keywords = {application program interfaces; bitonic-merge sort;
      blocking message passing events; communication errors;
      cyclic debugging; execution replay algorithm;
      hazards and race conditions; lexical analyzer;
      logical time stamping algorithm; merging; message passing;
      message race conditions; MPI parallel program debugging;
      MPI standard; nonblocking message passing events;
      nondeterministic characteristics; parallel programming;
      program debugging; reference execution; reproducible behavior;
      software libraries; sorting},
  sponsororg = {IEEE Comput. Soc. Tech. Committee on Distributed
      Process},
  treatment = {P Practical},
}

@Article{Huckle:1996:PIS,
  author =       "T. Huckle",
  title =        "{PVM}-Implementation of Sparse Approximate Inverse
                 Preconditioners for Solving Large Sparse Linear
                 Equations",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1156",
  pages =        "166--173",
  month =        "????",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4130 (Interpolation and function approximation);
                 C4140 (Linear algebra); C6110P (Parallel programming);
                 C7310 (Mathematics computing)",
  corpsource =   "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  fjournal =     "Lecture Notes in Computer Science",
  keywords =     "access; algorithms; black-box solver; compressed
                 sparse column format; computing; fast; Gram--Schmidt
                 process; Householder matrices; iterative methods;
                 iterative solution; large sparse linear equations;
                 least squares approximations; least-; master-slave;
                 mathematics; matrix columns; matrix inversion; matrix
                 multiplication; model; nonsymmetric ill-conditioned
                 matrix; normal equations; parallel; preconditioned
                 conjugate gradient algorithm; preconditioners; PVM
                 implementation; QR-decomposition; sparse approximate
                 inverse; sparse matrices; squares problem; submatrices;
                 unstructured; virtual machines",
  pubcountry =   "Germany",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@MastersThesis{Jones:1996:LLM,
  author = {Chris R. Jones},
  title = {Low latency {MPI} for {Meiko CS/2} and {ATM} clusters},
  type = {Thesis (M.A.)},
  school = {Department of Computer Science, University of California,
      Santa Barbara},
  address = {Santa Barbara, CA, USA},
  year = {1996},
  LCCN = {QA76.27.C2 S25},
  bibdate = {Fri Feb 04 17:35:04 2000},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ju:1996:SPT,
  author = {Jiubin Ju and Yong Wang},
  title = {Scheduling {PVM} Tasks},
  journal = j-OPER-SYS-REV,
  volume = {30},
  number = {3},
  pages = {22--31},
  month = jul,
  year = {1996},
  CODEN = {OSRED8},
  ISSN = {0163-5980 (print), 1943-586X (electronic)},
  ISSN-L = {0163-5980},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C4240P (Parallel programming and algorithm theory);
      C6110P (Parallel programming);
      C6150N (Distributed systems software)},
  corpsource = {Dept. of Comput. Sci., Jilin Univ., Changchun, China},
  fjournal = {Operating Systems Review},
  keywords = {dynamically produced subtasks; environment;
      idle workstations; job; parallel programming; pool tasks;
      processor scheduling; PVM task scheduling; resource utilization;
      response time; workstation cluster},
  treatment = {P Practical},
}

@InProceedings{Juhasz:1996:PIP,
  author =       "Z. Juh{\'a}sz and D. Crookes",
  title =        "A {PVM} implementation of a portable parallel image
                 processing library",
  crossref =     "Bode:1996:PVM",
  pages =        "188--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "B6140C (Optical information, image and video signal
                 processing); B6150C (Communication switching); B6210L
                 (Computer communications); C5260B (Computer vision and
                 image processing techniques); C5620L (Local area
                 networks); C6110B (Software engineering techniques);
                 C6110P (Parallel programming); C6115 (Programming
                 support); C6150N (Distributed systems software)",
  corpsource =   "Dept. of Inf. Syst., Veszpr{\'e}m Univ., Hungary",
  keywords =     "abstract communications layer; asynchronous transfer
                 mode; ATM network-based workstation clusters;
                 communication; Ethernet; extensibility; high-level
                 transparent; image processing; image processing
                 application development; layered; libraries; local
                 area; message passing; message-passing environment;
                 networks; parallel image processing library; parallel
                 programming; Parallel Virtual Machine; parallelism;
                 performance; portable; programming model; PVM
                 implementation; software; software model; software
                 portability; technologies; virtual machines",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@InProceedings{Kafura:1996:CCC,
  author = {D. Kafura and L. Huang},
  title = {Collective communication and communicators in {mpi++}},
  crossref = {IEEE:1996:PSM},
  pages = {79--86},
  year = {1996},
  bibdate = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6120 (File organisation);
      C6140D (High level languages);
      C6150E (General utility programs);
      C6150N (Distributed systems software)},
  conftitle = {Proceedings. Second MPI Developer's Conference},
  corpsource = {Dept. of Comput. Sci., Virginia Polytech. Inst. and
      State Univ., Blacksburg, VA, USA},
  keywords = {abstract data types; application program interfaces;
      attribute caching; C language; C++ language binding;
      cache storage; class hierarchy; collective communication;
      collective communicators; collective service; contexts;
      data structures; groups; Intel Paragon; message passing; MPI;
      mpi++; mpi++ program; object-oriented languages;
      parallel algorithm; Sun Sparc workstation; utility programs},
  sponsororg = {IEEE Comput. Soc. Tech. Committee on Distributed
      Process},
  treatment = {P Practical},
}

@InProceedings{Kale:1996:PMD,
  author =       "R. P. Kale and M. E. Fleharty and P. M. Alsing",
  title =        "Parallel molecular dynamics visualization using {MPI}
                 with {MPE} graphics",
  crossref =     "IEEE:1996:PSM",
  pages =        "104--110",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A6120J (Computer simulation of static and dynamic
                 liquid behaviour); C6110P (Parallel programming); C6115
                 (Programming support); C6130B (Graphics techniques);
                 C6150E (General utility programs); C7320 (Physics and
                 chemistry computing)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Dept. of Chem. and Nucl. Eng., New Mexico Univ.,
                 Albuquerque, NM, USA",
  keywords =     "application program interfaces; atomic interactions;
                 boundary-value problems; data visualisation; digital
                 simulation; force decomposition; graphics rendering;
                 IBM SP1; IBM SP2; infinitely replicated confined
                 region; irregular geometries; load balancing; message
                 passing; Message Passing Interface; molecular dynamics
                 method; MPE graphics; MPI Extensions; OpenGL graphics
                 library; parallel molecular dynamics visualization;
                 parallel programming; periodic boundary conditions;
                 physics computing; portable algorithm; real-time 3D
                 object manipulation; real-time systems; rendering
                 (computer graphics); SGI Onyx high-end graphics
                 computer; sockets; software libraries; software
                 portability; workstation clusters; X-Windows calls",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Katkere:1996:VWI,
  author =       "A. Katkere and J. Schlenzig and R. Jain",
  title =        "{VRML-based WWW} interface to {MPI} Video",
  crossref =     "ACM:1996:SVR",
  pages =        "25--31, 137",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6130B (Graphics techniques); C6130M (Multimedia);
                 C6140D (High level languages); C7210 (Information
                 services and centres)",
  conftitle =    "Proceedings of 1995 VRML Workshop",
  corpsource =   "Visual Comput. Lab., California Univ., San Diego, La
                 Jolla, CA, USA",
  keywords =     "hypermedia; hypermedia markups; interaction metaphor;
                 interactive television; interactive video; Internet;
                 motion information; MPI Video; multiple perspective
                 video streams; on-the-fly updating; page description
                 languages; query processing; simulation languages;
                 standard; three dimensional objects; three dimensional
                 scenes; video data; virtual reality; Virtual Reality
                 Modeling Language; VRML; VRML specification; World Wide
                 Web interface; WWW interface",
  sponsororg =   "San Diego Supercomput. Center; ACM",
  treatment =    "P Practical",
}

@InProceedings{Kermarrec:1996:PDS,
  author = {Y. Kermarrec and L. Pautet},
  title = {Programming Distributed Systems with Both {Ada} 95 and
      {PVM}},
  crossref = {Toussaint:1996:AES},
  pages = {206--216},
  year = {1996},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5440 (Multiprocessing systems);
      C6110P (Parallel programming);
      C6140D (High level languages);
      C7430 (Computer engineering)},
  corpsource = {ENST de Bretagne, Brest, France},
  keywords = {Ada; Ada 95; annex; communication architecture;
      distributed; distributed system; distributed systems programming;
      facilities; features; GNAT; low level; parallel;
      parallel languages; parallel machines; Parallel Virtual Machine;
      programming; PVM; virtual machines},
  pubcountry = {Germany},
  treatment = {P Practical},
}

@InProceedings{Kohl:1996:PTF,
  author = {J. A. Kohl and G. A. Geist},
  title = {The {PVM} 3.4 Tracing Facility and {XPVM} 1.1},
  crossref = {El-Rewini:1996:PTN},
  volume = {1},
  pages = {290--299},
  year = {1996},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5440 (Multiprocessing systems);
      C6110P (Parallel programming);
      C6150G (Diagnostic, testing, debugging and evaluating systems);
      C6150N (Distributed systems software)},
  corpsource = {Div. of Comput. Sci. and Math., Oak Ridge Nat. Lab., TN,
      USA},
  keywords = {buffering; diagnostics; evaluation; event mask; format;
      graphical user interfaces; heterogeneous environment; library;
      mechanism; message passing; on-the-fly adjustment; parallel;
      parallel programming; Parallel Virtual Machine;
      performance tuning; program; program compilers;
      program debugging; program execution histories;
      program monitoring; programming; PVM 3.4; PVM library; run-time;
      self-defining data; shared-memory multiprocessors;
      software libraries; software performance; trace;
      trace event definition; trace events; tracing facility;
      tracing tool; user-defined custom; virtual machines;
      workstation clusters; XPVM 1.1},
  sponsororg = {Univ. Hawaii; Univ. Hawaii College of Bus. Adm},
  treatment = {P Practical},
}

@InProceedings{Kormicki:1996:PLS,
  author =       "M. Kormicki and A. Mahmood and B. S. Carlson",
  title =        "Parallel logic simulation on a network of workstations
                 using {PVM}",
  crossref =     "IEEE:1996:EIS",
  pages =        "2--9",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "B1130B (Computer-aided circuit analysis and design);
                 B1265B (Logic circuits); C5210B (Computer-aided logic
                 design); C5440 (Multiprocessing systems); C7410D
                 (Electronic engineering computing)",
  corpsource =   "Washington State Univ., Richland, WA, USA",
  keywords =     "activity level; ATM; balance; CAD; circuit analysis
                 computing; combinational circuits; driven logic
                 simulation algorithm; Ethernet; gate evaluations; high
                 performance; ISCAS; ISCAS combinational benchmark
                 circuits; load; logic; logic testing; network of
                 workstations; output event-; parallel logic simulation;
                 parallel machines; parallel virtual machine;
                 performance; PVM; random partitioning; semi-optimistic
                 scheme; sequential benchmark circuits; sequential
                 circuits; switched; virtual machines",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Comput.
                 Architecture; IEEE Comput. Soc. Tech. Committee on
                 Distributed Process.; IEEE Comput. Soc. Dallas
                 Chapter",
  treatment =    "A Application; P Practical",
}

@InProceedings{Kotsis:1996:EEP,
  author = {G. Kotsis and F. Sukup},
  title = {Efficiency Evaluation of {PVM 2.X}, {PVM 3.X}, {P4},
      {EXPRESS} and {LINDA} on a Workstation Cluster Using the {NAS}
      Parallel Benchmarks},
  crossref = {Zaky:1996:PDT},
  pages = {149--171},
  year = {1996},
  bibdate = {Wed Aug 14 09:02:28 MDT 1996},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Krantz:1996:RFP,
  author = {A. T. Krantz and A. Zadroga and S. E. Chodrow and V. S.
      Sunderam},
  title = {An {RPC} facility for {PVM}},
  crossref = {Liddell:1996:HPC},
  pages = {798--??},
  year = {1996},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6150N (Distributed systems software)},
  corpsource = {Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
      GA, USA},
  keywords = {adaptive parallelism; client-server;
      client-server systems; computing; concurrent computing;
      distributed applications; failure resilience;
      heterogeneous environments; message passing; message-;
      message-passing systems; parallel processing;
      parallel virtual machine; passing paradigm;
      processor scheduling; remote procedure call;
      remote procedure calls; user-transparent load balancing},
  pubcountry = {Germany},
  treatment = {T Theoretical or Mathematical},
}

@InProceedings{Krone:1996:ICF,
  author = {O. Krone and M. Aguilar and B. Hirsbrunner and V.
      Sunderam},
  title = {Integrating Coordination Features in {PVM}},
  crossref = {Ciancarini:1996:CLM},
  pages = {432--435},
  year = {1996},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C4240P (Parallel programming and algorithm theory);
      C6110P (Parallel programming);
      C6150N (Distributed systems software)},
  corpsource = {Inst. d'Inf., Fribourg Univ., Switzerland},
  keywords = {client/server; coordination; extended coordination;
      features; generative communication; message passing;
      parallel programming; parallel systems; programming; PVM},
  pubcountry = {Germany},
  treatment = {T Theoretical or Mathematical},
}

@Article{Lawton:1996:BHP,
  author =       "J. V. Lawton and J. J. Brosnan and M. P. Doyle and
                 {\'O} Riord{\'a}in, S. D. and T. G. Reddin",
  title =        "Building a high-performance message-passing system for
                 {MEMORY CHANNEL} clusters",
  journal =      j-DEC-TECH-J,
  volume =       "8",
  number =       "2",
  pages =        "96--116",
  month =        oct,
  year =         "1996",
  CODEN =        "DTJOEL",
  ISSN =         "0898-901X",
  bibdate =      "Thu Mar 20 18:15:43 MST 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.digital.com:80/DTJM08/DTJM08P8.PS",
  abstract =     "The new MEMORY CHANNEL for PCI cluster interconnect
                 technology developed by Digital (based on technology
                 from Encore Computer Corporation) dramatically reduces
                 the overhead involved in intermachine communication.
                 Digital has designed a software system, the TruCluster
                 MEMORY CHANNEL Software version 1.4 product, that
                 provides fast user-level access to the MEMORY CHANNEL
                 network and can be used to implement a form of
                 distributed shared memory. Using this product, Digital
                 has built a low-level message-passing system that
                 reduces the communications latency in a MEMORY CHANNEL
                 cluster to less than 10 microseconds. This system can,
                 in turn, be used to easily build the communications
                 libraries that programmers use to parallelize
                 scientific codes. Digital has demonstrated the
                 successful use of this message-passing system by
                 developing implementations of two of the most popular
                 of these libraries, Parallel Virtual Machine (PVM) and
                 Message Passing Interface (MPI).",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C6120 (File
                 organisation); C6150N (Distributed systems software)",
  fjournal =     "Digital Technical Journal",
  keywords =     "access; clusters; communications latency;
                 communications libraries; Computer Corporation;
                 distributed shared memory; Encore; high-performance
                 message-passing system; intermachine communication;
                 Machine; MEMORY CHANNEL; message passing; Message
                 Passing Interface; Parallel Virtual; PCI cluster
                 interconnect technology; scientific codes; software;
                 storage management; system; TruCluster MEMORY CHANNEL
                 Software; user-level",
  treatment =    "P Practical",
}

@Article{Lee:1996:TSF,
  author =       {Bu-Sung Lee and A. Heng and W. Cai and Tai-Ann Tan},
  title =        {Task scheduling facility for {PVM}},
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       {6},
  number =       {4},
  pages =        {563--574},
  month =        dec,
  year =         {1996},
  CODEN =        {PPLTEE},
  ISSN =         {0129-6264 (print), 1793-642X (electronic)},
  bibdate =      {Tue Oct 21 18:27:39 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5440 (Multiprocessing systems); C6110B (Software
                 engineering techniques); C6150N (Distributed systems
                 software)},
  corpsource =   {Sch. of Appl. Sci., Nanyang Technol. Univ.,
                 Singapore},
  fjournal =     {Parallel Processing Letters},
  journal-URL =  {http://www.worldscientific.com/loi/ppl},
  keywords =     {centralized task scheduler; client server system;
                 client-server systems; design issue; heterogeneous
                 computer systems; library routines; load balancing;
                 parallel machines; Parallel Virtual Machine; PVM;
                 resource allocation; round-robin task allocation
                 scheme; scheduling; software libraries; software
                 portability; task scheduling; virtual machines; virtual
                 metacomputer; workstations},
  pubcountry =   {Singapore},
  treatment =    {P Practical},
}

@InProceedings{Liang:1996:AEO,
  author =       "Wen-Yew Liang and Chun-Ta King and Feipei Lai",
  title =        "{Adsmith}: an efficient object-based distributed
                 shared memory system on {PVM}",
  crossref =     "Li:1996:SIS",
  pages =        "??--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C6110J (Object-oriented
                 programming)",
  corpsource =   "Dept. of Comput. Sci. and Inf. Eng., Nat. Taiwan
                 Univ., Taipei, Taiwan",
  keywords =     "accesses; Adsmith; atomic operations; communication
                 subsystem; consistency; distributed memory systems;
                 distributed shared memory system; load/store-like
                 memory accesses; memory; memory systems; nonblocking;
                 object-oriented programming; parallel architectures;
                 performance; performance evaluation; PVM; release
                 memory; shared; shared objects",
  sponsororg =   "Chinese Nat. Res. Center for Intelligent Comput.
                 Syst.; IEEE Comput. Soc.; IEEE Comput. Soc. Tech.
                 Committee on Parallel Process.; Steering Committee of
                 the Chinese Nat. Hi-Tech Programme; Inf. Process. Soc.
                 Japan; Chinese Comput. Federation; IEICE Inf. and Syst.
                 Soc",
  treatment =    "P Practical",
}

@InProceedings{Liu:1996:BMP,
  author =       {L. T. Liu and D. E. Culler and C. Yoshikawa},
  title =        {Benchmarking message passing performance using {MPI}},
  crossref =     {Reeves:1996:PIC},
  volume =       {1},
  pages =        {101--110},
  year =         {1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C6150N (Distributed systems
                 software)},
  conftitle =    {Proceedings of 25th International Conference on
                 Parallel Processing},
  corpsource =   {Comput. Sci. Div., Berkeley Univ., CA, USA},
  keywords =     {benchmarks; IBM SP2; Intel Paragon; message passing;
                 message passing performance; microbenchmarks; MPI;
                 parallel machines; performance evaluation; SGI Power
                 Challenge},
  sponsororg =   {Int. Assoc. Comput. and Commun.; Pennsylvania State
                 Univ},
  treatment =    {P Practical},
}

@InProceedings{Loos:1996:MPS,
  author =       {T. Loos and R. Bramley},
  title =        {{MPI} performance on the {SGI Power Challenge}},
  crossref =     {IEEE:1996:PSM},
  pages =        {203--206},
  year =         {1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5440 (Multiprocessing systems); C6150E (General
                 utility programs); C6150N (Distributed systems
                 software)},
  conftitle =    {Proceedings. Second MPI Developer's Conference},
  keywords =     {application program interfaces; collective
                 inter-processor communication; communications
                 efficiency; communications overhead; communications
                 tests; cost function; double precision arrays; graph
                 partitioning algorithm; memory copying; memory
                 performance curves; memory tests; message passing; MPI
                 performance; MPI performance curves; MPI standard;
                 parallel algorithms; parallel computers; performance
                 evaluation; point-to-point inter-processor
                 communication; primitives; second level cache; SGI
                 Power Challenge; shared memory multiprocessor; shared
                 memory systems; software performance evaluation;
                 synchronisation; synchronization; total message sizes;
                 utility programs},
  sponsororg =   {IEEE Comput. Soc. Tech. Committee on Distributed
                 Process},
  treatment =    {P Practical},
}

@InProceedings{Lu:1996:PIF,
  author =       "E. J.-L. Lu and D. I. Okunbor",
  title =        "Parallel implementation of {3D FMA} using {MPI}",
  crossref =     "IEEE:1996:PSM",
  pages =        "119--124",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A0270 (Computational techniques); A0320 (Classical
                 mechanics of discrete systems: general mathematical
                 aspects); A0545 (Theory and models of chaotic systems);
                 A9510C (Celestial mechanics); A9575P (Mathematical and
                 computer techniques in astronomy); C4240C
                 (Computational complexity); C4240P (Parallel
                 programming and algorithm theory); C6110P (Parallel
                 programming); C6150E (General utility programs); C6150N
                 (Distributed systems software); C7320 (Physics and
                 chemistry computing); C7330 (Biology and medical
                 computing); C7350 (Astronomy and astrophysics
                 computing)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Dept. of Comput. Sci., Missouri Univ., Rolla, MO,
                 USA",
  keywords =     "3D fast multipole algorithm; application program
                 interfaces; astronomy computing; astrophysics;
                 biochemistry; biology computing; biomolecular dynamics;
                 biophysics; chaos; chaotic characteristics; chemistry
                 computing; communication back-end; communication
                 overhead; computational complexity; digital simulation;
                 galactic system; load balancing; long-range force
                 calculation; message passing; Message Passing
                 Interface; MPI; N-body problems; N-body systems
                 simulation; parallel algorithms; parallel
                 implementation; partitioning technique; physics
                 computing; portable scalable parallel library; resource
                 allocation; time complexity",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@InProceedings{Manis:1996:EPT,
  author =       {G. Manis and C. Voliotis and P. Tsanakas and G.
                 Papakonstantinou},
  title =        {Enhancing {PVM} with threads in distributed
                 programming},
  crossref =     {Liddell:1996:HPC},
  pages =        {1013--??},
  year =         {1996},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110P (Parallel programming); C6150N (Distributed
                 systems software)},
  corpsource =   {Athens Nat. Tech. Univ., Greece},
  keywords =     {distributed programming; environment; Orchid; parallel
                 programming; platform; portable features; PVM;
                 software; software portability; thread-oriented PVM;
                 threads},
  pubcountry =   {Germany},
  treatment =    {G General Review; P Practical},
}

@InProceedings{Markus:1996:PEM,
  author =       {S. Markus and S. B. Kim and K. Pantazopoulos and A. L.
                 Ocken and E. N. Houstis and P. Wu and S. Weerawarana
                 and D. Maharry},
  title =        {Performance evaluation of {MPI} implementations and
                 {MPI} based {Parallel ELLPACK} solvers},
  crossref =     {IEEE:1996:PSM},
  pages =        {162--169},
  year =         {1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C4140 (Linear algebra); C4170 (Differential
                 equations); C4185 (Finite element analysis); C6150N
                 (Distributed systems software); C7310 (Mathematics
                 computing)},
  conftitle =    {Proceedings. Second MPI Developer's Conference},
  corpsource =   {Dept. of Comput. Sci., Purdue Univ., West Lafayette,
                 IN, USA},
  keywords =     {application program interfaces; distributed memory
                 architectures; domain decomposition; elliptic boundary
                 value problems; elliptic equations; finite difference
                 methods; finite element mesh generation; iterative
                 solvers; ITPACK library; mathematics computing; mesh
                 generation; mesh partitioning; message passing; message
                 passing communication libraries; MIMD; MPI; Parallel
                 ELLPACK; parallel mesh generator; partial differential
                 equations; problem solving environment; PVM; second
                 order elliptic partial differential equations; software
                 libraries; software performance evaluation; sparse
                 algebraic equations; sparse matrices; workstation
                 clusters},
  sponsororg =   {IEEE Comput. Soc. Tech. Committee on Distributed
                 Process},
  treatment =    {P Practical; T Theoretical or Mathematical},
}

@InProceedings{Martin:1996:WTW,
  author =       "D. E. Martin and T. J. McBrayer and P. A. Wilsey",
  editor =       "H. El-Rewini and B. D. Shriver",
  booktitle =    "{Proceedings of the Twenty-Ninth Hawaii International
                 Conference on System Sciences}",
  title =        "{WARPED}: a time warp simulation kernel for analysis
                 and application development",
  volume =       "1",
  publisher =    "IEEE Computer Society Press",
  address =      "Los Alamitos, CA, USA",
  pages =        "5--??",
  year =         "1996",
  ISBN =         "0-8186-7324-9",
  ISBN-13 =      "978-0-8186-7324-5",
  LCCN =         "????",
  bibdate =      "Sun Apr 13 12:29:32 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "WARPED is a publicly-available time warp simulation
                 kernel for experimentation and application development.
                 The kernel defines a standard interface to the
                 application developer and is designed to provide a
                 highly configurable environment for the integration of
                 time warp optimizations. It is written in C++, uses the
                 MPI (Message Passing Interface) standard and shared
                 memory for communication, and executes on a variety of
                 platforms including a network of SUN workstations, a
                 SUN SMP workstation, the IBM SP1/SP2 multiprocessors,
                 the Intel Paragon and IBM-compatible PCs running Linux.
                 WARPED is distributed with several applications and
                 includes a sequential kernel implementation for
                 comparative analysis. The kernel supports LP (logical
                 process) clustering, various time warp algorithms and
                 several optimizations that dynamically adjust
                 simulation parameters.",
  acknowledgement = ack-nhfb,
  classcodes =   "C6185 (Simulation techniques); C6115 (Programming
                 support); C6150N (Distributed systems software); C6110P
                 (Parallel programming)",
  conflocation = "Wailea, HI, USA; 3--6 Jan. 1996",
  conftitle =    "Proceedings of HICSS-29: 29th Hawaii International
                 Conference on System Sciences",
  corpsource =   "Dept. of ECECS, Cincinnati, OH, USA",
  keywords =     "analysis; application; application program interfaces;
                 applications; C++; comparative; configurable
                 environment; development; development systems; discrete
                 event simulation; dynamic simulation; IBM compatible;
                 IBM SP1/SP2 multiprocessors; Intel Paragon; Linux;
                 logical process clustering; memory systems; message
                 passing; Message Passing Interface; microcomputer; MPI
                 standard; optimisation; optimizations; parallel
                 algorithms; parameter adjustment; PCs; sequential
                 kernel implementation; shared; shared memory; SUN SMP
                 workstation; SUN workstation network; synchronisation;
                 time warp; time warp simulation; time warp simulation
                 kernel; WARPED",
  sponsororg =   "Univ. Hawaii; Univ. Hawaii College of Bus. Adm",
  treatment =    "P Practical",
}

@InProceedings{McCandless:1996:OOM,
  author =       "B. C. McCandless and J. M. Squyres and A. Lumsdaine",
  title =        "Object Oriented {MPI} ({OOMPI}): a class library for
                 the {Message Passing Interface}",
  crossref =     "IEEE:1996:PSM",
  pages =        "87--94",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110F (Formal methods); C6110J (Object-oriented
                 programming); C6110P (Parallel programming); C6115
                 (Programming support); C6140D (High level languages);
                 C6150E (General utility programs)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Dept. of Comput. Sci. and Eng., Notre Dame Univ., IN,
                 USA",
  keywords =     "application program interfaces; C language; C++
                 bindings; C++ class library; formal specification;
                 generic specification; message passing; Message Passing
                 Interface; object-oriented class library;
                 object-oriented languages; Object-Oriented MPI;
                 object-oriented programming; OOMPI; parallel
                 programming; program description language; software
                 libraries",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{McDonald:1996:NNP,
  author =       {K. McDonald},
  title =        {The {NAG Numerical PVM Library}},
  crossref =     {Dongarra:1996:APC},
  pages =        {419--428},
  year =         {1996},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C4100 (Numerical analysis); C4240P (Parallel
                 programming and algorithm theory); C5220P (Parallel
                 architecture); C6110P (Parallel programming); C6115
                 (Programming support); C6150N (Distributed systems
                 software)},
  corpsource =   {Numerical Algorithms Group Ltd., Oxford, UK},
  keywords =     {analysis; distributed memory systems;
                 distributed-memory; efficient software; general-purpose
                 numerical library; machines; message passing; NAG
                 Fortran 77 Library; NAG Numerical PVM Library;
                 numerical; Numerical Algorithms Group; parallel
                 algorithms; parallel programming; parallel software;
                 portable; public-domain message-passing; ScaLAPACK
                 project; scalar computers; shared-memory computers;
                 software; software libraries; software portability;
                 state-of-the-art; systems; vector computers},
  pubcountry =   {Germany},
  treatment =    {P Practical},
}

@InProceedings{McMahon:1996:EEE,
  author =       {T. P. McMahon and A. Skjellum},
  title =        {{eMPI\slash eMPICH}: embedding {MPI}},
  crossref =     {IEEE:1996:PSM},
  pages =        {180--184},
  year =         {1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110B (Software engineering techniques); C6115
                 (Programming support); C6150N (Distributed systems
                 software)},
  conftitle =    {Proceedings. Second MPI Developer's Conference},
  corpsource =   {Dept. of Comput. Sci., Mississippi State Univ., MS,
                 USA},
  keywords =     {application program interface; application program
                 interfaces; bottom-up design; design paradigms;
                 embeddable libraries; embeddable MPI versions; eMPI;
                 eMPICH; memory-constrained systems; message passing;
                 real-time systems; software libraries; systems
                 analysis; top-down design},
  sponsororg =   {IEEE Comput. Soc. Tech. Committee on Distributed
                 Process},
  treatment =    {P Practical},
}

@InProceedings{Menden:1996:PPP,
  author =       "J. Menden and G. Stellner",
  title =        "Proving properties of {PVM} applications --- a case
                 study with {CoCheck}",
  crossref =     "Bode:1996:PVM",
  pages =        "134--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
                 C6110F (Formal methods); C6110P (Parallel programming);
                 C6150G (Diagnostic, testing, debugging and evaluating
                 systems); C6150N (Distributed systems software); C7430
                 (Computer engineering)",
  corpsource =   "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  keywords =     "case study; checkpoint; CoCheck; creation; distributed
                 algorithms; formal method; machines; parallel
                 applications; parallel programming; Parallel Virtual
                 Machine; program; programming theory; properties;
                 proving; PVM applications; systems software;
                 verification; virtual; workstation clusters",
  pubcountry =   "Germany",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@Article{Miei:1996:IER,
  author =       {T. Miei and N. Takahashi},
  title =        {Implementation and evaluation of a replay-based
                 debugger for {PVM} programs},
  journal =      j-TRANS-INFO-PROCESSING-SOC-JAPAN,
  volume =       {37},
  number =       {7},
  pages =        {1308--1319},
  month =        jul,
  year =         {1996},
  CODEN =        {JSGRD5},
  ISSN =         {0387-5806},
  ISSN-L =       {0387-5806},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110P (Parallel programming); C6115 (Programming
                 support); C6150G (Diagnostic, testing, debugging and
                 evaluating systems); C6150N (Distributed systems
                 software)},
  fjournal =     {Transactions of the Information Processing Society of
                 Japan},
  keywords =     {code; dbxR; demand-driven replay method; dynamic
                 execution sequences; message passing; message passing
                 communications; message passing library;
                 nondeterministic execution behavior; parallel
                 programming; parallel programs; program debugging;
                 program debugging evaluation; PVM programs;
                 replay-based debugger; shared memory systems;
                 shared-memory parallel programs; software performance
                 evaluation; static source},
  language =     {Japanese},
  pubcountry =   {Japan},
  treatment =    {P Practical},
}

@Article{Miguel:1996:APN,
  author =       {Jose Miguel and Agustin Arruabarrena and Ramon Beivide
                 and Jose Angel Gregorio},
  title =        {Assessing the performance of the new {IBM SP2}
                 communication subsystem},
  journal =      j-IEEE-PAR-DIST-TECH,
  volume =       {4},
  number =       {4},
  pages =        {12--22},
  month =        {Winter},
  year =         {1996},
  CODEN =        {IPDTEX},
  DOI =          {https://doi.org/10.1109/88.544433},
  ISSN =         {1063-6552 (print), 1558-1861 (electronic)},
  ISSN-L =       {1063-6552},
  bibdate =      {Fri Apr 11 07:24:28 MDT 1997},
  bibsource =    {Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Universidad del Pais Vasco},
  affiliationaddress = {Spain},
  classification = {716.1; 721.1; 722.2; 722.4; 723; 912.3; C5440
                 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C6150N (Distributed systems
                 software)},
  corpsource =   {Dept. of Comput. Archit. and Technol., Univ. del Pais
                 Vasco, San Sebastian, Spain},
  fjournal =     {IEEE parallel and distributed technology: systems and
                 applications},
  journalabr =   {IEEE Parallel Distrib Technol},
  keywords =     {Bandwidth; basic; collective communication;
                 Communication channels (information theory);
                 communication tests; Computer software; Computer
                 testing; computers; distributed memory systems;
                 evaluation; execution; execution time; Execution times;
                 fault tolerant computing; high performance switch; IBM;
                 IBM SP2 communication subsystem; Interconnection
                 networks; interface adapters; Interfaces (computer);
                 latency; Memory latency; message; message passing;
                 Message passing interface (MPI); Microprocessor chips;
                 MPI message passing library; parallel applications;
                 parallel computer; parallel machines; Parallel
                 processing systems; Parallel virtual machine (pvm);
                 performance; performance assessment; performance
                 indicators; real applications; reliability; SP2;
                 Switching; Synchronization; Systems analysis; tests;
                 throughput; times},
  treatment =    {P Practical},
}

@InProceedings{Mo:1996:IOP,
  author =       {J. Mo and F. Romelfanger and R. J. Hanisch and D.
                 Redding and S. Sirlin and A. Boden},
  title =        {Implementation of an optical prescription retrieval
                 code using {PVM} (parallel virtual machine) in a mixed
                 architecture network},
  crossref =     {Jacoby:1996:ADA},
  pages =        {100--103},
  year =         {1996},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {A9575P (Mathematical and computer techniques in
                 astronomy); C5440 (Multiprocessing systems); C6150N
                 (Distributed systems software); C7350 (Astronomy and
                 astrophysics computing); C7430 (Computer engineering)},
  corpsource =   {Space Telescope Sci. Inst., Baltimore, MD, USA},
  keywords =     {astronomy computing; machine; mixed architecture
                 network; optical prescription retrieval code; parallel;
                 parallel computing application; parallel machines;
                 parallel virtual; performance comparisons; programming;
                 PVM software system; virtual machines},
  treatment =    {X Experimental},
}

@InProceedings{Muller:1996:CDI,
  author =       "A. M{\"u}ller and R. R{\"u}hl",
  title =        "Communication-buffers for data-parallel, irregular
                 computations",
  crossref =     "Szymanski:1996:LCR",
  pages =        "295--298",
  year =         "1996",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Centro Svizzero di Calcolo Sci., Eidgen{\"o}ssische
                 Tech. Hochschule, Manno, Switzerland",
  classification = "C5440 (Multiprocessing systems); C6110P (Parallel
                 programming); C6120 (File organisation); C6140D (High
                 level languages); C6150C (Compilers, interpreters and
                 other processors)",
  keywords =     "Buffer organizations; Common user interface;
                 Communication buffers; Compiler generated parallel
                 programs; Critical code segments; Data parallel
                 irregular computations; Data-parallel irregular
                 computations; Distributed data; Distributed memory
                 parallel processors; Distributed programs; High
                 Performance Fortran compiler; HPF extensions; Low level
                 machine interface; Message Passing Interface;
                 Parallelization Support Tool; Performance monitor;
                 Portable integrated tool environment Annai; PST HPF
                 extensions; Run time preprocessing; Source level
                 debugger; Unstructured computations",
  thesaurus =    "Buffer storage; Distributed memory systems; FORTRAN;
                 Parallel languages; Parallel programming; Parallelising
                 compilers",
}

@Article{Nagel:1996:VVA,
  author =       "W. E. Nagel and A. Arnold and M. Weber and H. C. Hoppe
                 and K. Solchenbach",
  title =        "{VAMPIR}: Visualization and Analysis of {MPI}
                 Resources",
  journal =      j-SUPERCOMPUTER,
  volume =       "12",
  number =       "1",
  pages =        "69--80",
  month =        jan,
  year =         "1996",
  CODEN =        "SPCOEL",
  ISSN =         "0168-7875",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Central Inst. for Appl. Math., Res. Centre J{\"u}lich,
                 Germany",
  classification = "C6110P (Parallel programming); C6110S (Software
                 metrics); C6150N (Distributed systems software); C6155
                 (Computer communications software)",
  corpsource =   "Central Inst. for Appl. Math., Res. Centre J{\"u}lich,
                 Germany",
  fjournal =     "Supercomputer",
  keywords =     "activity chart; Activity chart; analysis; Analysis;
                 animation mode; Animation mode; computer interfaces;
                 flexible filter operation; Flexible filter operation;
                 information display reduction; Information display
                 reduction; message passing; message passing interface;
                 Message passing interface; message passing standard;
                 Message passing standard; MPI; MPI resource; parallel
                 programming; Parallel programming; PARvis; performance
                 bottleneck location; Performance bottleneck location;
                 software performance analysis; Software performance
                 analysis; software performance evaluation; state
                 diagram; State diagram; statistics; Statistics;
                 time-line displays; Time-line displays; tracing;
                 Tracing; VAMPIR; visualization; Visualization; zooming;
                 Zooming",
  pubcountry =   "Netherlands",
  thesaurus =    "Computer interfaces; Message passing; Parallel
                 programming; Software performance evaluation",
  treatment =    "P Practical",
}

@InProceedings{NicCanna:1996:LGS,
  author =       {C. {Nic Canna} and C. J. Bean},
  title =        {Larger grids and shorter wall-clock times on a
                 parallel virtual machine ({PVM}) --- an example using a
                 finite difference wave simulation algorithm},
  crossref =     {Abrahart:1996:GIC},
  volume =       {2},
  pages =        {2--??},
  year =         {1996},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C4170 (Differential equations); C6150N (Distributed
                 systems software); C6185 (Simulation techniques); C7340
                 (Geophysics computing); C7430 (Computer engineering)},
  corpsource =   {Dept. of Geol., Univ. Coll. Dublin, Ireland},
  keywords =     {acoustic wave equation; acoustic waves; algorithm;
                 array sizes; computer modelling; digital simulation;
                 Earth; finite difference; finite difference solution;
                 finite difference wave simulation; geologically
                 realistic; geophysics computing; machines; message
                 passing; methods; parallel; parallel virtual machine;
                 PVM message passing library; sciences; seismic wave;
                 seismology; simulation method; virtual machines;
                 virtually parallel machine; wall clock times; wave
                 equations},
  pubcountry =   {UK},
  treatment =    {P Practical},
}

@InProceedings{Oberhuber:1996:MNP,
  author =       "M. Oberhuber",
  title =        "Managing nondeterminism in {PVM} programs",
  crossref =     "Bode:1996:PVM",
  pages =        "347--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
                 C6110P (Parallel programming); C6115 (Programming
                 support); C6150G (Diagnostic, testing, debugging and
                 evaluating systems)",
  corpsource =   "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  keywords =     "interprocess communication; nondeterminism; parallel
                 machines; parallel programming; parallel programs;
                 program debugging; program testing; PVM programs;
                 TOOLSET environment; virtual machines",
  pubcountry =   "Germany",
  treatment =    "T Theoretical or Mathematical",
}

@InProceedings{Ogawa:1996:OOM,
  author =       {Hirotaka Ogawa and Satoshi Matsuoka},
  title =        {{OMPI}: Optimizing {MPI} Programs Using Partial
                 Evaluation},
  crossref =     {ACM:1996:SCP},
  pages =        {??--??},
  year =         {1996},
  bibdate =      {Mon Mar 23 12:31:18 1998},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.supercomp.org/sc96/proceedings/SC96PROC/OGAWA/INDEX.HTM},
  acknowledgement = ack-nhfb,
}

@InProceedings{Papakostas:1996:PPP,
  author =       "N. Papakostas and G. Papakonstantinou and P.
                 Tsanakas",
  title =        "{PPARDB\slash PVM}: a portable {PVM} based parallel
                 database management system",
  crossref =     "Boszormenyi:1996:PCT",
  pages =        "219--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6160B
                 (Distributed databases); C6110P (Parallel programming);
                 C6160D (Relational databases); C7430 (Computer
                 engineering)",
  corpsource =   "Dept. of Electr. and Comput. Eng., Nat. Tech. Univ. of
                 Athens, Greece",
  keywords =     "architecture; computational model; crowd; database
                 processing elements; databases; distributed databases;
                 heterogeneous workstation; horizontal; layered;
                 multicasting; network; one master/multiple slaves;
                 operating system dependencies; operator parallelism;
                 parallel database management system; parallel
                 programming; partitioning; portable; portable PVM based
                 parallel database management; PPARDB/PVM; process
                 synchronisation; relation tuples; relational;
                 relational parallel database management system;
                 relationship; scientific programming; shared nothing;
                 software portability; system; tasks; transputer
                 network; virtual machines; virtual parallel computer;
                 workstations",
  pubcountry =   "Germany",
  remark =       "Appears to be the same paper as entry
                 Papakostas:1996:PSP (same authors, title, and year);
                 the starting page is taken from that entry and should
                 be confirmed against the proceedings.",
  treatment =    "P Practical",
}

@Article{Papakostas:1996:PSP,
  author =       "N. Papakostas and G. Papakonstantinou and P.
                 Tsanakas",
  title =        "{PPARDB\slash PVM}: a Portable {PVM} Based Parallel
                 Database Management System",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1127",
  pages =        "219--??",
  month =        "????",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Papakostas:1996:UPI,
  author =       "N. Papakostas and G. Papakonstantinou and P.
                 Tsanakas",
  title =        "Using {PVM} to implement {PPARDB\slash PVM}, a
                 portable parallel database management system",
  crossref =     "Bode:1996:PVM",
  pages =        "108--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming); C7430 (Computer engineering);
                 C6160B (Distributed databases)",
  corpsource =   "Dept. of Electr. Eng., Nat. Tech. Univ. of Athens,
                 Greece",
  keywords =     "distributed databases; heterogeneous workstation
                 network; operator parallelism; parallel programming;
                 parallel systems; portability; portable communication
                 primitives; portable parallel database management;
                 PPARDB/PVM; PVM; separate database node; shared nothing
                 architecture; software; system; virtual machines;
                 workstations",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@Article{Pernice:1996:RPP,
  author =       {Michael Pernice},
  title =        {Review of ``{PVM: Parallel Virtual Machine. A User's
                 Guide and Tutorial for Networked Parallel
                 Computing}''},
  journal =      j-IEEE-PAR-DIST-TECH,
  volume =       {4},
  number =       {1},
  pages =        {84--84},
  month =        {Spring},
  year =         {1996},
  CODEN =        {IPDTEX},
  DOI =          {https://doi.org/10.1109/M-PDT.1996.481711},
  ISSN =         {1063-6552 (print), 1558-1861 (electronic)},
  ISSN-L =       {1063-6552},
  bibdate =      {Tue Jan 23 16:38:43 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://dlib.computer.org/pd/books/pd1996/pdf/p1084.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE parallel and distributed technology: systems and
                 applications},
}

@Article{Pokorny:1996:CMP,
  author =       {S. Pokorny},
  title =        {A Comparison of Message-Passing Parallelization to
                 Shared-Memory Parallelization},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1156},
  pages =        {22--??},
  month =        {????},
  year =         {1996},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Pruyne:1996:ICP,
  author =       {Jim Pruyne and Miron Livny},
  title =        {Interfacing {Condor} and {PVM} to harness the cycles
                 of workstation clusters},
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       {12},
  number =       {1},
  pages =        {67--85},
  month =        may,
  year =         {1996},
  CODEN =        {FGSEVI},
  ISSN =         {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L =       {0167-739X},
  bibdate =      {Fri Jul 15 09:06:07 MDT 2005},
  bibsource =    {ftp://ftp.ira.uka.de/bibliography/Parallel/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X},
  acknowledgement = ack-nhfb,
  classification = {C5620 (Computer networks and techniques); C6110P
                 (Parallel programming); C6150J (Operating systems);
                 C6150N (Distributed systems software)},
  corpsource =   {Dept. of Comput. Sci., Wisconsin Univ., Madison, WI,
                 USA},
  fjournal =     {Future Generation Computer Systems},
  journal-URL =  {http://www.sciencedirect.com/science/journal/0167739X},
  keywords =     {allocation; Condor; network operating systems;
                 parallel; parallel programming; processing; PVM;
                 resource; resource management; resource management
                 system; workstation clusters},
  pubcountry =   {Netherlands},
  remark =       {Resource Management in Distributed Systems},
  treatment =    {P Practical},
}

@Article{Qaddouri:1996:CPC,
  author =       {A. Qaddouri and R. Roy and M. Mayrand and B. Goulard},
  title =        {Collision Probability Calculation and Multigroup Flux
                 Solvers Using {PVM}},
  journal =      j-NUCL-SCI-ENG,
  volume =       {123},
  number =       {3},
  pages =        {392--402},
  month =        jul,
  year =         {1996},
  CODEN =        {NSENAO},
  ISSN =         {0029-5639},
  ISSN-L =       {0029-5639},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {A0260 (Numerical approximation and analysis); A2820H
                 (Neutron diffusion); A2841C (Computer codes for fission
                 reactor theory and design)},
  conflocation = {Portland, OR, USA; 30 April-4 May 1995},
  conftitle =    {International Conference on Mathematics and
                 Computations, Reactor Physics, and Environmental
                 Analyses},
  corpsource =   {Inst. de Genie Nucl., Ecole Polytech. de Montreal,
                 Que., Canada},
  fjournal =     {Nuclear Science and Engineering},
  keywords =     {collision probability; cyclic; IBM SP2; iterative
                 methods; iterative process; linearized; multigroup flux
                 solvers; multigroup transport equation; neutron flux;
                 neutron transport theory; nuclear engineering
                 computing; PVM library; run times; SPARC 1000;
                 time-independent transport equation; tracking; two-step
                 energy/space},
  sponsororg =   {ANS; Eur. Nucl. Soc.; Atomic Energy Soc. Japan},
  treatment =    {T Theoretical or Mathematical},
}

@InProceedings{Ragg:1996:PEN,
  author =       "T. Ragg",
  title =        "Parallelization of an evolutionary neural network
                 optimizer based on {PVM}",
  crossref =     "Bode:1996:PVM",
  pages =        "351--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C1180 (Optimisation techniques); C4240P (Parallel
                 programming and algorithm theory); C5290 (Neural
                 computing techniques); C6150J (Operating systems);
                 C6150N (Distributed systems software)",
  corpsource =   "Institut f{\"u}r Logik, Karlsruhe Univ., Germany",
  keywords =     "allocation; batch processing (computers); batch
                 program; dynamic load balancing; ENZO; evolutionary
                 neural network optimizer; genetic algorithms; machine
                 load; nets; neural; parallel algorithms;
                 parallelization; pattern recognition; PVM; resource;
                 workstation-cluster",
  pubcountry =   "Germany",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@InProceedings{Reimann:1996:CBT,
  author =       {D. A. Reimann and V. Chaudhary and M. J. Flynn and I.
                 K. Sethi},
  title =        {Cone beam tomography using {MPI} on heterogeneous
                 workstation clusters},
  crossref =     {IEEE:1996:PSM},
  pages =        {142--148},
  year =         {1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5260B (Computer vision and image processing
                 techniques); C5440 (Multiprocessing systems); C5620L
                 (Local area networks); C6150N (Distributed systems
                 software); C7410H (Computerised instrumentation)},
  conftitle =    {Proceedings. Second MPI Developer's Conference},
  keywords =     {application program interfaces; asynchronous
                 communication; asynchronous MPI; backprojection;
                 computerised tomography; cone beam tomography;
                 heterogeneous workstation clusters; image
                 reconstruction; load balancing; local area networks;
                 memory requirements; message passing; Message Passing
                 Interface; MPI; parallel methods; processing time;
                 processor utilization; projection views; resource
                 allocation; software libraries},
  sponsororg =   {IEEE Comput. Soc. Tech. Committee on Distributed
                 Process},
  treatment =    {P Practical; T Theoretical or Mathematical},
}

@InProceedings{Robinson:1996:TMI,
  author =       {J. Robinson and S. H. Russ and B. Flachs and B.
                 Heckel},
  title =        {A task migration implementation of the
                 {Message-Passing Interface}},
  crossref =     {IEEE:1996:PFI},
  pages =        {61--68},
  year =         {1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110P (Parallel programming); C6150E (General
                 utility programs); C6150N (Distributed systems
                 software)},
  conftitle =    {Proceedings of 5th IEEE International Symposium on
                 High Performance Distributed Computing},
  corpsource =   {NSF Eng. Res. Center for Comput. Field Simulation,
                 Mississippi State Univ., MS, USA},
  keywords =     {application program interfaces; Hector; heterogeneous
                 computing task allocator; heterogeneous platforms;
                 message passing; Message-Passing Interface; MPI
                 specification; parallel processing applications;
                 parallel program performance improvement; parallel
                 programming; software performance evaluation; task
                 migration implementation; workstation networks},
  sponsororg =   {IEEE Comput. Soc. Tech. Committee on Distributed
                 Process.; Northeast Parallel Architectures Center; New
                 York State Center for Adv. Technol. Comput.
                 Applications and Software Eng. (CASE Center) at
                 Syracuse Univ.; Rome Lab},
  treatment =    {P Practical},
}

@InProceedings{Roda:1996:PEI,
  author =       {J. Roda and J. Herrera and J. Gonzalez and C.
                 Rodriguez and F. Almeida and D. Gonzalez},
  title =        {Practical experiments to improve {PVM} algorithms},
  crossref =     {Bode:1996:PVM},
  pages =        {30--??},
  year =         {1996},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6150N (Distributed systems software); C6155
                 (Computer communications software); C7430 (Computer
                 engineering)},
  corpsource =   {Univ. de La Laguna, Spain},
  keywords =     {broadcasting strategies; computer communications
                 software; intensive communication experiments; LAN;
                 local area networks; measurement; network parameter;
                 parallel processing; parallel virtual machine; PVM
                 algorithms; pvm.mcast; pvm.send; virtual machines},
  pubcountry =   {Germany},
  treatment =    {X Experimental},
}

@InProceedings{Russ:1996:HAT,
  author =       "S. H. Russ and B. Flachs and J. Robinson and B.
                 Heckel",
  title =        "Hector: automated task allocation for {MPI}",
  crossref =     "IEEE:1996:PII",
  pages =        "344--348",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C5620L (Local area
                 networks); C6110P (Parallel programming); C6150N
                 (Distributed systems software)",
  conftitle =    "Proceedings of International Conference on Parallel
                 Processing",
  corpsource =   "Mississippi State Univ., MS, USA",
  keywords =     "automated task allocation; automatic run-time
                 performance optimization; C; computational resources;
                 FORTRAN; Hector; local area networks; message passing;
                 message-passing; MPI; multiprocessing systems;
                 parallel processing; parallel programming; resource
                 allocation; software performance evaluation; task
                 migration; workstation networks",
  sponsororg =   "IEEE Comput. Tech. Committee on Parallel Process.; ACM
                 SIGARCH",
  treatment =    "P Practical",
}

@InProceedings{Santana:1996:PVM,
  author =       "M. S. Santana and P. S. Souza and R. C. Santana and S.
                 S. Souza",
  title =        "{Parallel Virtual Machine} for {Windows95}",
  crossref =     "Bode:1996:PVM",
  pages =        "288--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5620L (Local area networks); C6110P (Parallel
                 programming); C6150J (Operating systems); C6150N
                 (Distributed systems software)",
  corpsource =   "Inst. de Ciencias Math., Sao Carlos, Brazil",
  keywords =     "(computers); communication; local area networks;
                 message; message passing; network; operating systems;
                 parallel applications; parallel programming; Parallel
                 Virtual Machine for Windows 95; passing environment;
                 personal computers; PVM-W95; speedups; Windows 95
                 operating system; workstations",
  pubcountry =   "Germany",
  treatment =    "P Practical",
  xxauthor =     "M. S. Santana and R. C. Santana and P. S. Souza and S.
                 S. Souza",
}

@Article{Schuele:1996:PLA,
  author =       "J. Schuele",
  title =        "Parallel {Lanczos} Algorithm on a {CRAY-T3D} Combining
                 {PVM} and {SHMEM} Routines",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1156",
  pages =        "158--165",
  month =        "????",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
  remark =       "Appears to be the same paper as entry Schule:1996:PLA
                 (same title, year, and starting page); the ending page
                 is taken from that entry.",
}

@InProceedings{Schule:1996:PLA,
  author =       {J. Schule},
  title =        {Parallel {Lanczos} algorithm on a {CRAY-T3D} combining
                 {PVM} and {SHMEM} routines},
  crossref =     {Bode:1996:PVM},
  pages =        {158--165},
  year =         {1996},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {A7130 (Metal-insulator transitions); A7155J
                 (Localization in disordered structures); C4140 (Linear
                 algebra); C6110P (Parallel programming); C7320 (Physics
                 and chemistry computing)},
  corpsource =   {Inst. for Sci. Comput., Braunschweig, Germany},
  keywords =     {Anderson model; Anderson-; arithmetic; coarse-grain;
                 Cray computers; CRAY-; cubic tungsten bronzes; data
                 distribution; dynamic group; effectiveness;
                 eigenfunctions; eigenvalues and; eigenvector
                 calculation; fine-grain; finite precision; Fock
                 approach; Hamiltonian matrix; HF calculations; Hubbard
                 model; iterations; iterative; Krylov sequence; load
                 imbalance; machines; master-slave paradigm; matrix
                 algebra; membership; metal-insulator transition;
                 methods; Mott-Hubbard model; NaWO/sub 3/; parallel
                 algorithms; parallel Lanczos algorithm; parallelisation
                 strategies; parallelism; physics computing; PVM;
                 resource allocation; rounding errors; roundoff errors;
                 SHMEM routines; sodium compounds; software development
                 costs; stochastic distributions; subroutines; T3D;
                 tridiagonalisation; unrestricted Hartree-; virtual},
  pubcountry =   {Germany},
  treatment =    {P Practical},
}

@InProceedings{Sener:1996:DPP,
  author =       {C. Sener and Y. Paker and A. Kiper},
  title =        {Data-parallel programming on {Helios}, parallel
                 environment and {PVM}},
  crossref =     {Yetongnon:1996:PII},
  volume =       {1},
  pages =        {2--??},
  year =         {1996},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110P (Parallel programming); C6115 (Programming
                 support); C6120 (File organisation); C6150N
                 (Distributed systems software)},
  corpsource =   {Dept. of Comput. Eng., Middle East Tech. Univ.,
                 Ankara, Turkey},
  keywords =     {C language interface; column-sums; computational
                 complexity; data; data flow computing; data-parallel;
                 data-parallel programming; environments; evaluation;
                 Helios operating system; IBM SP/2 system; image
                 processing; matrix; message passing; network; operating
                 systems; parallel environment; parallel programming;
                 Parallel Virtual Machine; performance; portability;
                 programming; programming tool; PVM; resource
                 allocation; software performance; software portability;
                 software tools; speed-up curves; structures;
                 transputers; type; virtual machines},
  sponsororg =   {ISCA; IEEE Comput. Soc.; IEEE Tech. Committee on
                 Operating Syst.; et al},
  treatment =    {P Practical},
}

@Article{Shyu:1996:ILQ,
  author =       {Shyong Jian Shyu and H. K.-C. Chang and K.-C. Chou},
  title =        {Implementation of a linear quadtree coding scheme on
                 the parallel virtual machine},
  journal =      j-INT-J-HIGH-SPEED-COMPUTING,
  volume =       {8},
  number =       {1},
  pages =        {65--79},
  month =        mar,
  year =         {1996},
  CODEN =        {IHSCEZ},
  ISSN =         {0129-0533},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {B6140C (Optical information, image and video signal
                 processing); C1250 (Pattern recognition); C5260B
                 (Computer vision and image processing techniques);
                 C5440 (Multiprocessing systems); C6120 (File
                 organisation); C7430 (Computer engineering)},
  corpsource =   {Dept. of Inf. Manage., Ming Chuan Univ., Taipei,
                 Taiwan},
  fjournal =     {International Journal of High Speed Computing},
  keywords =     {data; encoding; high; image encoding problem; image
                 manipulations; image processing; image processing
                 problems; linear quadtree coding; master-slave
                 paradigm; parallel machines; parallel virtual machine;
                 performance computing; quadtrees; structure; virtual
                 machines},
  pubcountry =   {Singapore},
  treatment =    {A Application; P Practical},
}

@InProceedings{Silva:1996:IDS,
  author =       {L. M. Silva and J. G. Silva and S. Chapple},
  title =        {Implementing distributed shared memory on top of
                 {MPI}: the {DSMPI} library},
  crossref =     {IEEE:1996:PFE},
  pages =        {50--57},
  year =         {1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5440 (Multiprocessing systems); C6110P (Parallel
                 programming); C6115 (Programming support); C6150N
                 (Distributed systems software)},
  conftitle =    {Proceedings of 4th Euromicro Workshop on Parallel and
                 Distributed Processing},
  corpsource =   {Coimbra Univ., Portugal},
  keywords =     {coherence protocols; consistency; Cray T3D;
                 distributed memory machines; distributed memory
                 systems; distributed shared memory; DSMPI library;
                 message passing; MPI; parallel library; parallel
                 programming; performance; programming interface;
                 scalability; shared memory systems; software libraries;
                 software performance evaluation; software portability;
                 workstation network},
  treatment =    {P Practical},
}

@InProceedings{Sitsky:1996:IMU,
  author =       "D. Sitsky and P. Mackerras and A. Tridgell and D.
                 Walsh",
  title =        "Implementing {MPI} under {AP\slash Linux}",
  crossref =     "IEEE:1996:PSM",
  pages =        "32--39",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "A preliminary MPI library has been implemented for the
                 Fujitsu AP1000+ multicomputer running the AP/Linux
                 operating system. Under this environment, parallel
                 programs may be dedicated to a fixed partition, or a
                 number of parallel programs may share a partition.
                 Therefore, the MPI library has been constructed so that
                 messaging operations can be driven by polling and/or
                 interrupt techniques. It has been found that polling
                 works well when a single parallel program is running on
                 a given partition, and that interrupt-driven
                 communication makes far better use of the machine when
                 multiple parallel programs are executing. Gang
                 scheduling of multiple parallel programs which use
                 polling was found to be relatively ineffective.",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C6110P (Parallel
                 programming); C6140D (High level languages); C6150E
                 (General utility programs); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C6150J
                 (Operating systems); C6115 (Programming support); C6150N
                 (Distributed systems software)",
  conflocation = "Notre Dame, IN, USA; 1-2 July 1996",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "CAP Res. Program, Australian Nat. Univ., Canberra,
                 ACT, Australia",
  keywords =     "AP/Linux; AP/Linux operating system; application
                 program interfaces; communication; debugging;
                 distributed memory systems; extensions; Fujitsu AP1000+
                 multicomputer; gang scheduling; interrupt techniques;
                 interrupt-driven; interrupt-driven communication;
                 interrupts; language issues; libraries; message
                 passing; Message Passing Interface standard; messaging
                 operations; MPI applications; MPI implementations; MPI
                 library; multiple parallel programs; operating system;
                 operating systems (computers); parallel languages;
                 parallel programming; parallel programs; partition;
                 performance evaluation; performance portable parallel
                 programming; polling; processor scheduling; program
                 debugging; software; software libraries; software
                 performance evaluation; software standards; utility
                 programs",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Sitsky:1996:MLW,
  author =       {D. Sitsky and E. Hayashi},
  title =        {An {MPI} library which uses polling, interrupts and
                 remote copying for the {Fujitsu AP1000+}},
  crossref =     {Li:1996:PSI},
  pages =        {43--49},
  year =         {1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5220P (Parallel architecture); C5470 (Performance
                 evaluation and testing); C6110B (Software engineering
                 techniques); C6115 (Programming support); C6150N
                 (Distributed systems software)},
  conftitle =    {Proceedings Second International Symposium on Parallel
                 Architectures, Algorithms, and Networks (I-SPAN'96)},
  corpsource =   {Dept. of Comput. Sci., Australian Nat. Univ.,
                 Canberra, ACT, Australia},
  keywords =     {Fujitsu AP1000+; interrupts; message passing; MPI
                 library; parallel architectures; performance;
                 performance evaluation; polling; remote copying;
                 software libraries; software portability},
  sponsororg =   {Chinese Nat. Res. Center for Intelligent Comput.
                 Syst.; IEEE Comput. Soc.; IEEE Comput. Soc. Tech.
                 Committee on Parallel Process.; Steering Committee of
                 the Chinese Nat. Hi-Tech Programme; Inf. Process. Soc.
                 Japan; Chinese Comput. Federation; IEICE Inf. and Syst.
                 Soc},
  treatment =    {P Practical},
}

@InProceedings{Sivaraman:1996:AAD,
  author =       {H. Sivaraman and C. S. Raghavendra},
  title =        {{ADDT}: Automatic Data Distribution Tool for Porting
                 Programs to {PVM}},
  crossref =     {El-Rewini:1996:PTN},
  volume =       {1},
  pages =        {557--564},
  year =         {1996},
  bibdate =      {Wed Apr 16 06:39:19 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5440 (Multiprocessing systems); C6110P (Parallel
                 programming); C6140D (High level languages); C6150C
                 (Compilers, interpreters and other processors); C6150N
                 (Distributed systems software)},
  corpsource =   {School of Electr. Eng. and Comput. Sci., Washington
                 State Univ., Pullman, WA, USA},
  keywords =     {ADDT; automatic data distribution tool; communication
                 latency; communication statements; data access; data
                 parallel languages; data partitioning; distributed
                 memory; distributed memory systems; distribution
                 blocks; environment; FORTRAN; Fortran; High
                 Performance; HPF; interpreters; languages; linear
                 optimization problem; nonlocal; optimisation; parallel;
                 parallel programming; program; program compilers;
                 program porting; PVM; shared memory parallel program;
                 shared memory systems; software portability},
  sponsororg =   {Univ. Hawaii; Univ. Hawaii College of Bus. Adm},
  treatment =    {P Practical},
}

@InProceedings{Skjellum:1996:TTM,
  author =       {A. Skjellum and B. Protopopov and S. Hebert},
  title =        {A thread taxonomy for {MPI}},
  crossref =     {IEEE:1996:PSM},
  pages =        {50--57},
  year =         {1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6110B (Software engineering techniques); C6110F
                 (Formal methods); C6150E (General utility programs);
                 C6150J (Operating systems); C6150N (Distributed systems
                 software)},
  conftitle =    {Proceedings. Second MPI Developer's Conference},
  corpsource =   {Dept. of Comput. Sci., Mississippi State Univ., MS,
                 USA},
  keywords =     {API extensions; application program interfaces;
                 Channel Device; computational unit; fine-grain
                 concurrency; formal specification; message passing;
                 minimal portable thread management; MPI; MPICH;
                 multi-threaded thread-safe ADI; non-thread-safe MPI
                 call semantics; resource container; software
                 portability; synchronisation; synchronization
                 mechanisms; thread models; thread safety; thread
                 taxonomy; user-level mechanism; utility programs;
                 Windows NT version},
  sponsororg =   {IEEE Comput. Soc. Tech. Committee on Distributed
                 Process},
  treatment =    {P Practical},
}

@inproceedings{Smith:1996:UWC,
  author          = {N. P. G. Smith and C. Christopoulos},
  title           = {Utilising workstation clusters with {PVM} for the
                     solution of large {TLM} problems},
  crossref        = {Silvester:1996:SEE},
  pages           = {3--11},
  year            = {1996},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {B1130B (Computer-aided circuit analysis and design);
                     B5240 (Transmission line theory); C6110P (Parallel
                     programming); C7410 (Electrical engineering
                     computing)},
  corpsource      = {Numerical Modelling Group, Nottingham Univ., UK},
  keywords        = {data; electrical engineering computing; load
                     balancing; parallel computing; parallel programming;
                     Parallel Virtual Machine; partitioning; PVM;
                     transmission line matrix methods; Transmission Line
                     Modelling; virtual machines; workstation clusters},
  pubcountry      = {UK},
  sponsororg      = {IEE; Univ. Florence},
  treatment       = {P Practical},
}

@book{Snir:1996:MCR,
  author          = {Marc Snir and Steve W. Otto and Steven Huss-Lederman
                     and David W. Walker and Jack Dongarra},
  title           = {{MPI}: the complete reference},
  publisher       = pub-MIT,
  address         = pub-MIT:adr,
  pages           = {xii + 336},
  year            = {1996},
  ISBN            = {0-262-69184-1},
  ISBN-13         = {978-0-262-69184-0},
  LCCN            = {QA76.642.M65 1996},
  bibdate         = {Fri Jan 31 07:16:14 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  price           = {US\$27.50},
  acknowledgement = ack-nhfb,
}

@InProceedings{Soch:1996:PCG,
  author =       "M. {\v{S}}och and J. Trdli{\v{c}}ka and P. Tvrd{\'\i}k",
  title =        "{PVM}, computational geometry, and parallel computing
                 course",
  crossref =     "Bode:1996:PVM",
  pages =        "38--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C0220 (Computing education and training); C4240P
                 (Parallel programming and algorithm theory); C4260
                 (Computational geometry); C5440 (Multiprocessing
                 systems); C6110P (Parallel programming); C6130B
                 (Graphics techniques); C7310 (Mathematics computing);
                 C7810C (Computer-aided instruction)",
  corpsource =   "Dept. of Comput. Sci. and Eng., Czech Tech. Univ.,
                 Prague, Czech Republic",
  keywords =     "computational geometry; computer science education;
                 courseware; distributed memory machines; distributed
                 memory systems; educational courses; laboratories;
                 mathematics computing; non-trivial parallel algorithms;
                 parallel algorithms; parallel computing course;
                 parallel programming; programming tool; projects;
                 student PVM; students; SUN; teaching; term projects;
                 upper level undergraduate; workstation lab",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@inproceedings{Squyres:1996:CBP,
  author          = {J. M. Squyres and A. Lumsdaine and R. L. Stevenson},
  title           = {A cluster-based parallel image processing toolkit},
  crossref        = {Grinstein:1996:VDE},
  volume          = {2421},
  pages           = {228--239},
  year            = {1996},
  CODEN           = {PSISDG},
  ISSN            = {0277-786X (print), 1996-756X (electronic)},
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Lab. for Sci. Comput., Notre Dame Univ., IN, USA},
  classification  = {B6140C (Optical information, image and video signal
                     processing); C5260B (Computer vision and image
                     processing techniques); C6110B (Software engineering
                     techniques); C6115 (Programming support); C6150N
                     (Distributed systems software)},
  keywords        = {Cluster-based computing; Data I/O; Data locality; Load
                     balancing; Message passing interface; MPI standard;
                     Network technologies; Parallel computing resource;
                     Parallel image processing software library; Parallel
                     image processing toolkit; Sequential image processing;
                     Specialized massively parallel computing hardware;
                     Visualization; Workstation clusters},
  thesaurus       = {Computer networks; Image processing; Message passing;
                     Parallel processing; Software libraries; Software
                     tools; Workstations},
}

@article{Stathopoulos:1996:PIM,
  author          = {Andreas Stathopoulos and Anders B. Ynnerman and
                     Charlotte {Froese Fischer}},
  title           = {A {PVM} Implementation of the {MCHF} Atomic Structure
                     Package},
  journal         = j-IJSAHPC,
  volume          = {10},
  number          = {1},
  pages           = {41--61},
  month           = {Spring},
  year            = {1996},
  CODEN           = {IJSCFG},
  ISSN            = {1078-3482},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C5440 (Multiprocessing systems); C6150N (Distributed
                     systems software); C7310 (Mathematics computing); C7320
                     (Physics and chemistry computing); C7400 (Engineering
                     computing)},
  corpsource      = {Dept. of Comput. Sci., Vanderbilt Univ., Nashville,
                     TN, USA},
  fjournal        = {International Journal of Supercomputer Applications
                     and High Performance Computing},
  keywords        = {algorithmic; atomic data; atomic structure;
                     calculations; computational demands; computing; CPU;
                     disk space; engineering; engineering computing;
                     evaluation; high-end workstation cluster; IBM SP2
                     multiprocessor; improvements; mathematics; MCHF atomic;
                     Multiconfiguration Hartree--Fock package; parallel
                     computers; parallel machines; parallel programming;
                     Parallel Virtual Machine; physics; prime memory;
                     problem size; PVM implementation; PVM programming;
                     science; software packages; software performance;
                     speed; structure package; user interfaces;
                     user-friendly interface},
  treatment       = {A Application; P Practical},
}

@inproceedings{Steed:1996:PPP,
  author          = {M. R. Steed and M. J. Clement},
  title           = {Performance prediction of {PVM} programs},
  crossref        = {IEEE:1996:PII},
  pages           = {803--807},
  year            = {1996},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C5620L (Local area networks); C6110P (Parallel
                     programming); C6150G (Diagnostic, testing, debugging
                     and evaluating systems); C6150N (Distributed systems
                     software)},
  corpsource      = {Dept. of Comput. Sci., Brigham Young Univ., Provo, UT,
                     USA},
  keywords        = {analysis; APACHE; Automated; clusters; debugging;
                     evaluation; local area networks; message passing
                     library; Parallel; parallel; parallel computing;
                     parallel programming; performance; performance tools;
                     program; program debugging; programming; PVM
                     Application Characterization Environment; PVM programs;
                     scalable parallel applications; software libraries;
                     software performance; software performance prediction;
                     software reusability; software tools; Virtual Machine;
                     workstation},
  sponsororg      = {IEEE Comput. Tech. Committee on Parallel Process.; ACM
                     SIGARCH},
  treatment       = {P Practical},
}

@InProceedings{Stellner:1996:CCP,
  author =       "G. Stellner",
  title =        "{CoCheck}: checkpointing and process migration for
                 {MPI}",
  crossref =     "IEEE:1996:PII",
  pages =        "526--531",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C5620L (Local area
                 networks); C6150N (Distributed systems software)",
  conftitle =    "Proceedings of International Conference on Parallel
                 Processing",
  corpsource =   "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  keywords =     "checkpointing; CoCheck; consistency; LAN; local area
                 networks; message passing; message passing library;
                 MPI; parallel applications; parallel machines; process
                 migration; resource allocation; single process
                 checkpointer; software libraries; workstation
                 networks",
  sponsororg =   "IEEE Comput. Tech. Committee on Parallel Process.; ACM
                 SIGARCH",
  treatment =    "P Practical",
}

@inproceedings{Stone:1996:RNF,
  author          = {J. Stone and M. Underwood},
  title           = {Rendering of numerical flow simulations using {MPI}},
  crossref        = {IEEE:1996:PSM},
  pages           = {138--141},
  year            = {1996},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C5220P (Parallel architecture); C6130B (Graphics
                     techniques); C6150N (Distributed systems software);
                     C6185 (Simulation techniques); C7320 (Physics and
                     chemistry computing)},
  conftitle       = {Proceedings. Second MPI Developer's Conference},
  corpsource      = {Dept. of Comput. Sci., Missouri Univ., Rolla, MO,
                     USA},
  keywords        = {application program interfaces; computational fluid
                     dynamics code; data visualisation; dedicated graphics
                     workstations; digital simulation; disks; fluid
                     dynamics; IBM SP2; in-place rendering; Intel iPSC/860;
                     Intel Paragon; message passing; Message Passing
                     Interface; MPI; networked graphics workstations;
                     numerical flow simulations; parallel architectures;
                     physics computing; ray tracing; ray tracing library;
                     rendering (computer graphics); run-time visualization;
                     software libraries; workstation networks},
  sponsororg      = {IEEE Comput. Soc. Tech. Committee on Distributed
                     Process},
  treatment       = {A Application; P Practical},
}

@InProceedings{Strietzel:1996:PTS,
  author =       "M. Strietzel",
  title =        "Parallel turbulence simulation based on {MPI}",
  crossref =     "Liddell:1996:HPC",
  pages =        "283--289",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A0260 (Numerical approximation and analysis); A0270
                 (Computational techniques); A4710 (General fluid
                 dynamics theory, simulation and other computational
                 methods); A4725 (Turbulent flows, convection, and heat
                 transfer); C4240P (Parallel programming and algorithm
                 theory); C6110P (Parallel programming); C7320 (Physics
                 and chemistry computing)",
  conftitle =    "High-Performance Computing and Networking.
                 International Conference and Exhibition HPCN Europe
                 1996",
  corpsource =   "Zentrum f{\"u}r Paralleles Rechnen, K{\"o}ln Univ.,
                 Germany",
  keywords =     "direct numerical simulation; divide and conquer
                 method; divide and conquer methods; domain
                 decomposition; flow simulation; large-eddy simulation;
                 message passing; message passing platform; MPI;
                 numerical analysis; parallel algorithms; parallel
                 turbulence simulation; parallelization strategy;
                 physics computing; Poisson equation; three dimensional
                 incompressible Navier--Stokes equation; turbulence;
                 turbulent fluids",
  treatment =    "T Theoretical or Mathematical",
}

@article{Subramaniam:1996:CLU,
  author          = {Krishnan R. Subramaniam and Suraj C. Kothari and Don
                     Heller},
  title           = {A Communication Library Using Active Messages to
                     Improve Performance of {PVM}},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {39},
  number          = {2},
  pages           = {146--152},
  day             = {15},
  month           = dec,
  year            = {1996},
  CODEN           = {JPDCER},
  DOI             = {https://doi.org/10.1006/jpdc.1996.0162},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  bibdate         = {Thu Mar 9 09:19:01 MST 2000},
  bibsource       = {http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0162/production;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0162/production/pdf},
  acknowledgement = ack-nhfb,
  classification  = {C6110B (Software engineering techniques); C6150N
                     (Distributed systems software)},
  corpsource      = {Dept. of Comput. Sci., Iowa State Univ., Ames, IA,
                     USA},
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
  keywords        = {active messages; communication; communication library;
                     controlled; message passing; parallel processing;
                     polling; primitives; signal driven message
                     notification; software libraries; software performance
                     evaluation; virtual machines},
  treatment       = {P Practical},
}

@inproceedings{Sunderam:1996:PSS,
  author          = {V. Sunderam},
  title           = {The {PVM} system: status, trends, and directions},
  crossref        = {Bode:1996:PVM},
  pages           = {68--??},
  year            = {1996},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C6110P (Parallel programming); C6150N (Distributed
                     systems software); C7430 (Computer engineering)},
  corpsource      = {Dept. of Math. and Comput. Sci., Emory Univ., Atlanta,
                     GA, USA},
  keywords        = {API; application program interfaces; environments;
                     future directions; heterogeneous; heterogeneous
                     concurrent computing; high performance computing;
                     network computing; parallel programming; Parallel
                     Virtual Machine; programming model; PVM system; robust
                     portable implementations; software framework; virtual
                     machines},
  pubcountry      = {Germany},
  treatment       = {P Practical},
}

@Article{Suttner:1996:SPB,
  author =       "C. B. Suttner",
  title =        "{SPTHEO} --- a {PVM-based} parallel theorem prover",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1156",
  pages =        "116--125",
  month =        "????",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C1160 (Combinatorial mathematics); C1180
                 (Optimisation techniques); C4210 (Formal logic); C6110P
                 (Parallel programming); C7430 (Computer engineering)",
  corpsource =   "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  fjournal =     "Lecture Notes in Computer Science",
  keywords =     "communication aspects; environments; high latency;
                 message passing; parallel programming; parallel search;
                 parallelization; portable implementation; proof; PVM
                 based parallel theorem prover; PVM message passing
                 system; search problems; SETHEO; SPS model; SPTHEO;
                 system; theorem proving; virtual machines; workstation
                 networks",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@inproceedings{Theodoropoulos:1996:ESP,
  author          = {P. Theodoropoulos and G. Manis and P. Tsanakas and G.
                     Papakonstantinou},
  title           = {Extending synchronization {PVM} mechanisms},
  crossref        = {Bode:1996:PVM},
  pages           = {315--??},
  year            = {1996},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C6110P (Parallel programming); C6150J (Operating
                     systems); C6150N (Distributed systems software); C7430
                     (Computer engineering)},
  corpsource      = {Dept. of Electr. Eng., Nat. Tech. Univ. of Athens,
                     Greece},
  keywords        = {barriers; global semaphores; message; message passing;
                     operating system; operating systems (computers); Orchid
                     platform; parallel; parallel virtual machine; passing;
                     primitives; programming; synchronisation;
                     synchronization; virtual machines},
  pubcountry      = {Germany},
  treatment       = {P Practical},
}

@InProceedings{Touhafi:1996:DPC,
  author =       "A. Touhafi and W. Brissinck and E. F. Dirkx",
  title =        "Development of {PVM} code for a low latency switch
                 based interconnect",
  crossref =     "Bode:1996:PVM",
  pages =        "229--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C6110P (Parallel
                 programming); C6150J (Operating systems); C6150N
                 (Distributed systems software); C7430 (Computer
                 engineering)",
  corpsource =   "V.U.B. TW-INFO, Brussels, Belgium",
  keywords =     "communication; device driver; device drivers;
                 distributed parallel computing; Ethernet; fast switch
                 based network; intensive applications; interchanged
                 messages; interconnection networks; low latency switch
                 based interconnect; message; message passing; message
                 passing tool; multiprocessor; parallel programming;
                 passing; PVM code; PVM routines; small end-to-end
                 latency; virtual machines",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@Article{Twerda:1996:PIT,
  author =       "A. Twerda and A. P. {Van den Berg} and A. J. {Van der
                 Steen}",
  title =        "Parallel implementation of time dependent
                 {Rayleigh--B{\'e}nard} convection",
  journal =      j-SUPERCOMPUTER,
  volume =       "12",
  number =       "2",
  pages =        "36--47",
  month =        mar,
  year =         "1996",
  CODEN =        "SPCOEL",
  ISSN =         "0168-7875",
  bibdate =      "Wed Mar 18 08:37:01 MST 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Geophys., Utrecht Univ., Netherlands",
  classification = "A4720 (Hydrodynamic stability and instability);
                 C5440 (Multiprocessing systems); C7320 (Physics and
                 chemistry computing)",
  corpsource =   "Dept. of Geophys., Utrecht Univ., Netherlands",
  fjournal =     "Supercomputer",
  keywords =     "Cray T3D; distributed memory parallel systems; message
                 passing; message passing model; parallel
                 implementation; parallel models; parallel processing;
                 physics computing; PVM; Rayleigh--B{\'e}nard
                 instability; scalability; time dependent
                 Rayleigh--B{\'e}nard convection",
  pubcountry =   "Netherlands",
  treatment =    "A Application; P Practical",
}

@InProceedings{Uhl:1996:PIC,
  author =       "A. Uhl and J. H{\"a}mmerle",
  title =        "Parallel image compression on a workstation cluster
                 using {PVM}",
  crossref =     "Bode:1996:PVM",
  pages =        "301--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "B6120B (Codes); B6140C (Optical information, image
                 and video signal processing); C1250 (Pattern
                 recognition); C4240P (Parallel programming and
                 algorithm theory); C5440 (Multiprocessing systems)",
  corpsource =   "Dept. of Comput. Sci. and Syst. Anal., Salzburg Univ.,
                 Austria",
  keywords =     "data compression; FDDI; fractal image compression;
                 image coding; interconnected workstation-cluster;
                 parallel algorithms; parallel image compression;
                 parallel machines; parallel meta-algorithm; PVM;
                 virtual machines; workstation cluster",
  pubcountry =   "Germany",
  treatment =    "A Application; P Practical",
}

@InProceedings{Wagner:1996:GSG,
  author =       "T. Wagner and C. Kueblbeck and C. Schittko",
  title =        "Genetic selection and generation of textural features
                 with {PVM}",
  crossref =     "Bode:1996:PVM",
  pages =        "305--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "B0260 (Optimisation techniques); B6140C (Optical
                 information, image and video signal processing); C1180
                 (Optimisation techniques); C4240P (Parallel programming
                 and algorithm theory); C5260B (Computer vision and
                 image processing techniques); C7330 (Biology and
                 medical computing); C7430 (Computer engineering)",
  corpsource =   "Fraunhofer-Institut f{\"u}r Integrierte Schaltungen,
                 Erlangen, Germany",
  keywords =     "algorithms; cell identification; Gallops PVM package;
                 genetic algorithms; genetic selection; medical image
                 processing; medical imaging; parallel; PVM; quality
                 control; surface inspection; systems; textural features
                 generation; textural image features; textured images;
                 tumor; virtual machines",
  pubcountry =   "Germany",
  treatment =    "A Application; P Practical",
}

@Article{Wagner:1996:PMM,
  author =       "J. C. Wagner and A. Haghighat",
  title =        "Parallel {MCNP Monte Carlo} transport calculations
                 with {MPI}",
  journal =      j-TRANS-AM-NUCL-SOC,
  volume =       "75",
  number =       "??",
  pages =        "338--339",
  month =        "????",
  year =         "1996",
  CODEN =        "TANSAO",
  ISSN =         "0003-018X",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A0250 (Probability theory, stochastic processes, and
                 statistics); A0270 (Computational techniques); A2820H
                 (Neutron diffusion); A2841C (Computer codes for fission
                 reactor theory and design); C1140G (Monte Carlo
                 methods); C4240P (Parallel programming and algorithm
                 theory); C6110P (Parallel programming); C7470 (Nuclear
                 engineering computing)",
  conftitle =    "American Nuclear Society and the European Nuclear
                 Society 1996 International Conference on the Global
                 Benefits of Nuclear Technology and the Embedded Topical
                 Meetings. Low-Level Radiation Health Effects, DD and
                 R: Worldwide Experience- DD and R, What Does it Mean,
                 and International Nuclear Policy Issues (papers in
                 summary form only received)",
  corpsource =   "Pennsylvania State Univ., University Park, PA, USA",
  fjournal =     "Transactions of the American Nuclear Society",
  keywords =     "fission reactor kinetics; high-speed communication
                 switches; message passing; message-passing interface;
                 message-passing library package; message-passing
                 software package; Monte Carlo methods; MPI; neutron
                 transport theory; nuclear engineering computing;
                 parallel algorithms; parallel MCNP Monte Carlo
                 transport calculations; parallel Monte Carlo; parallel
                 virtual machine; workstation clusters",
  treatment =    "P Practical",
}

@techreport{Walker:1996:MFA,
  author          = {David W. Walker},
  title           = {{MPI}: from Fundamentals to Applications},
  institution     = inst-ORNL,
  address         = inst-ORNL:adr,
  year            = {1996},
  bibdate         = {Tue Jan 16 08:29:47 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.epm.ornl.gov/~walker/mpi/SLIDES/mpi-tutorial.html},
}

@misc{Walker:1996:MP,
  author          = {David W. Walker},
  title           = {{MPI2} Proposals},
  howpublished    = {World-Wide Web},
  year            = {1996},
  bibdate         = {Tue Jan 16 08:33:57 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.epm.ornl.gov/~walker/mpi/mpi2-proposals.html},
}

@Article{Walker:1996:MSM,
  author =       "D. W. Walker and J. J. Dongarra",
  title =        "{MPI}: a standard Message Passing Interface",
  journal =      j-SUPERCOMPUTER,
  volume =       "12",
  number =       "1",
  pages =        "56--68",
  month =        jan,
  year =         "1996",
  CODEN =        "SPCOEL",
  ISSN =         "0168-7875",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA",
  classification = "C5640 (Protocols); C6150N (Distributed systems
                 software); C6155 (Computer communications software)",
  corpsource =   "Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA",
  fjournal =     "Supercomputer",
  keywords =     "collective communication routine; computer interface;
                 computer interfaces; computer standard; distributed
                 processing; Europe; massively parallel computer;
                 message passing; message passing interface; MPI;
                 network interface; network interfaces; parallel
                 processing; point-to-point; protocols; software
                 standards; standards; United States; workstation
                 network",
  pubcountry =   "Netherlands",
  thesaurus =    "Computer interfaces; Distributed processing; Message
                 passing; Network interfaces; Parallel processing;
                 Protocols; Software standards; Standards",
  treatment =    "P Practical",
}

@article{Walker:1996:RBC,
  author          = {D. W. Walker and S. W. Otto},
  title           = {Redistribution of block-cyclic data distributions
                     using {MPI}},
  journal         = j-CPE,
  volume          = {8},
  number          = {9},
  pages           = {707--728},
  month           = nov,
  year            = {1996},
  CODEN           = {CPEXEI},
  ISSN            = {1040-3108},
  ISSN-L          = {1040-3108},
  bibdate         = {Tue Sep 7 06:06:27 MDT 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  URL             = {http://www3.interscience.wiley.com/cgi-bin/abstract?ID=23305},
  acknowledgement = ack-nhfb,
  classification  = {C4240P (Parallel programming and algorithm theory);
                     C6140D (High level languages); C6150N (Distributed
                     systems software)},
  corpsource      = {Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA},
  fjournal        = {Concurrency, practice and experience},
  keywords        = {block-cyclic data distributions; block-cyclic fashion;
                     broadcast; collective communication operations;
                     FORTRAN; High Performance Fortran; IBM SP-1; Intel
                     Paragon; load balancing properties; message passing;
                     message passing algorithms; MPI; MPI-ALLTOALL; parallel
                     algorithms; processor scheduling; redistribution
                     operation; resource allocation},
  treatment       = {T Theoretical or Mathematical},
}

@InProceedings{Wedemeijer:1996:PSA,
  author =       "H. Wedemeijer and H. L. H. Cox and D. J. Verschuur and
                 I. L. Ritsema",
  title =        "Parallelisation of seismic algorithms using {PVM} and
                 {FORGE}",
  crossref =     "Liddell:1996:HPC",
  pages =        "352--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "A9130 (Seismology); A9130R (Controlled source
                 seismology); A9365 (Data and information acquisition,
                 processing, storage and dissemination in geophysics);
                 A9385 (Instrumentation and techniques for geophysical,
                 hydrospheric and lower atmosphere research); C5260B
                 (Computer vision and image processing techniques);
                 C6110P (Parallel programming); C7340 (Geophysics
                 computing)",
  corpsource =   "TNO Inst. of Appl. Geosci., Delft, Netherlands",
  keywords =     "algorithms; Earth subsurface; explosion seismology;
                 FORGE; geophysical prospecting; geophysical signal
                 processing; geophysical techniques; geophysics
                 computing; imaging; implementation; measurement
                 technique; method; optimisation; parallel; parallel
                 programming; parallelisation; prospecting; PVM; seismic
                 algorithm; seismic reflection profiling; seismology;
                 signal processing",
  pubcountry =   "Germany",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@Article{Wilson:1996:SMS,
  author =       "G. C. Wilson and T. H. Wood and J. L. Zyskind and J.
                 W. Sulhoff and J. E. Johnson and T. Tanbun-Ek and P. A.
                 Morton",
  title =        "{SBS} and {MPI} suppression in analogue systems with
                 integrated electroabsorption modulator\slash {DFB}
                 laser transmitters",
  journal =      j-ELECT-LETTERS,
  volume =       "32",
  number =       "16",
  pages =        "1502--1504",
  month =        "????",
  year =         "1996",
  CODEN =        "ELLEAK",
  ISSN =         "0013-5194 (print), 1350-911X (electronic)",
  ISSN-L =       "0013-5194",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "B4150 (Electro-optical devices); B4360 (Laser
                 applications); B6260 (Optical links and equipment);
                 B6430D (CATV and wired systems)",
  corpsource =   "Lucent Technol., Holmdel, NJ, USA",
  fjournal =     "Electronics Letters",
  keywords =     "1.7 GHz; analogue systems; cable television; CATV
                 systems; chirp modulation; DFB laser transmitters;
                 distributed feedback lasers; dithering; electro-optical
                 modulation; electroabsorption; electroabsorption
                 modulator; excess noise; integrated modulator/laser
                 transmitters; interference suppression; laser beam
                 applications; laser bias current; low-chirp modulation;
                 MPI suppression; multipath interference; narrow
                 linewidth sources; optical fibre communication; optical
                 noise; optical transmitters; SBS suppression;
                 stimulated Brillouin scattering",
  treatment =    "X Experimental",
}

@Article{Wismueller:1996:SBV,
  author =       "R. Wismueller",
  title =        "State Based Visualization of {PVM} Applications",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1156",
  pages =        "91--??",
  month =        "????",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Wismueller:1996:TSI,
  author =       "R. Wismueller and T. Ludwig",
  title =        "The Tool-Set --- An Integrated Tool Environment for
                 {PVM}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1067",
  pages =        "1029--??",
  month =        "????",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Wismuller:1996:SBV,
  author =       "R. Wism{\"u}ller",
  title =        "State based visualization of {PVM} applications",
  crossref =     "Bode:1996:PVM",
  pages =        "91--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6110V (Visual
                 programming); C6130B (Graphics techniques); C6150G
                 (Diagnostic, testing, debugging and evaluating
                 systems); C7430 (Computer engineering)",
  corpsource =   "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  keywords =     "animated sequence; behavior; consistent; critical
                 issue; debugging; dynamic; event ordering algorithm;
                 global; global clock; optimization; parallel
                 programming; parallel programs; program diagnostics;
                 program flow; PVM applications; state based
                 visualization; state based visualizer; states; virtual
                 machines; VISTOP; visual programming; visualization;
                 visualization tool",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@InProceedings{Wismuller:1996:TSI,
  author =       "R. Wism{\"u}ller and T. Ludwig",
  title =        "The {Tool Set} --- an integrated tool environment for
                 {PVM}",
  crossref =     "Liddell:1996:HPC",
  pages =        "1029--??",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support)",
  corpsource =   "Lehrstuhl f{\"u}r Rechnertechnik und
                 Rechnerorganisation, Tech. Univ. M{\"u}nchen, Germany",
  keywords =     "checkpointing; debugging; deterministic execution;
                 development; integrated tool environment; load
                 balancing; parallel I/O; parallel program; parallel
                 programming; performance analysis; program flow
                 visualization; programming environments; PVM; software;
                 Tool Set; tools",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@InProceedings{Wolf:1996:CFS,
  author =       "K. Wolf and E. Brakkee",
  title =        "Coupling fluids and structures codes on {MPI}",
  crossref =     "IEEE:1996:PSM",
  pages =        "130--137",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6115
                 (Programming support); C6150N (Distributed systems
                 software); C6185 (Simulation techniques)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "German Nat. Res. Center for Comput. Sci., St.
                 Augustin, Germany",
  keywords =     "address-spaces; API; application program interface;
                 application program interfaces; computational power;
                 dedicated neighborhoods; digital simulation; dynamic
                 process sets; fluids codes; industrial simulation
                 applications; message passing; message passing
                 interface; message passing libraries; MPI; MPI-WORLDs;
                 parallel systems; portability; software libraries;
                 software portability; standalone address-spaces;
                 storage management; structures codes; synchronisation",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Worley:1996:MPE,
  author =       "P. H. Worley",
  title =        "{MPI} performance evaluation and characterization
                 using a compact application benchmark code",
  crossref =     "IEEE:1996:PSM",
  pages =        "170--177",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C6150N (Distributed
                 systems software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Oak Ridge Nat. Lab., TN, USA",
  keywords =     "application program interface; application program
                 interfaces; communication library; communication
                 protocol; communication routines;
                 communication-intensive application code; compact
                 application benchmark code; Cray Research T3D; IBM SP2;
                 Intel Paragon; message passing; message passing
                 standard; MPI; parallel benchmark code; parallel
                 machines; PSTSWM; software libraries; software
                 performance evaluation; software standards",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@InProceedings{Xiong:1996:BID,
  author =       "Jianxin Xiong and Dingxing Wang and Weimin Zheng and
                 Meiming Shen",
  title =        "{BUSTER}: an integrated debugger for {PVM}",
  crossref =     "IEEE:1996:PIS",
  pages =        "",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5620L (Local area networks); C6110P (Parallel
                 programming); C6115 (Programming support); C6150G
                 (Diagnostic, testing, debugging and evaluating
                 systems); C6150N (Distributed systems software)",
  corpsource =   "Dept. of Comput. Sci. and Technol., Tsinghua Univ.,
                 Beijing, China",
  keywords =     "BUSTER debugger; communication; debugging; global
                 states; integrated parallel debugger; local area
                 networks; nondeterminism; parallel; parallel
                 programming; performance debugging; program;
                 programming environments; PVM programming environment;
                 related errors; workstation clusters; workstation
                 network; workstations",
  treatment =    "P Practical",
}

@Article{Xu:1996:MCO,
  author =       "Zhiwei Xu and Kai Hwang",
  title =        "Modeling Communication Overhead: {MPI} and {MPL}
                 Performance on the {IBM SP2}",
  journal =      j-IEEE-PAR-DIST-TECH,
  volume =       "4",
  number =       "1",
  pages =        "9--24",
  month =        "Spring",
  year =         "1996",
  CODEN =        "IPDTEX",
  DOI =          "https://doi.org/10.1109/88.481662",
  ISSN =         "1063-6552 (print), 1558-1861 (electronic)",
  ISSN-L =       "1063-6552",
  bibdate =      "Thu Apr 10 19:14:33 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing)",
  corpsource =   "Acad. Sinica, Beijing, China",
  fjournal =     "IEEE parallel and distributed technology: systems and
                 applications",
  keywords =     "architectural bottlenecks; collective; collective
                 communication; collective computation; communication;
                 communication overhead modelling; communication
                 performance evaluation; computation; IBM computers; IBM
                 Message-Passing; IBM Message-Passing Library; IBM SP2;
                 Library; machine size; massively parallel; massively
                 parallel processors; message length; message passing;
                 Message-Passing Interface; message-passing
                 multicomputers; message-passing operations; MPI
                 performance; MPL; MPL performance; optimization;
                 overhead; overhead-quantifying method; parallel
                 applications; parallel applications optimization;
                 parallel machines; performance; performance evaluation;
                 point-to-point; point-to-point communication;
                 processors; quantifying method; timing; timing
                 measurements",
  treatment =    "X Experimental",
}

@Article{Yoon:1996:WBP,
  author =       "D.-K. Yoon and J.-L. Gaudiot",
  title =        "Worker-Based Parallel Computing on {PVM}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1123",
  pages =        "506--??",
  month =        "????",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C6150N (Distributed
                 systems software)",
  corpsource =   "Dept. of Electr. Eng. Syst., Univ. of Southern
                 California, Los Angeles, CA, USA",
  fjournal =     "Lecture Notes in Computer Science",
  keywords =     "asynchronous tasks; high performance computing;
                 message passing; networks of workstations; parallel
                 function calls; parallel processing; parallel
                 processing subsystem; Parallel Virtual Machine;
                 primitives; run-time system; software package; software
                 packages; user application programs; worker-based
                 parallel computing",
  pubcountry =   "Germany",
  treatment =    "P Practical",
  xxpages =      "2--??",
  xxvolume =     "1",
}

@Article{Yuasa:1996:RPG,
  author =       "F. Yuasa and S. Kawabata and T. Ishikawa and D.
                 Perret-Gallix and T. Kaneko",
  title =        "Running {PVM-GRACE} on Workstation Clusters",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1156",
  pages =        "335--??",
  month =        "????",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6150N (Distributed systems software); C7320
                 (Physics and chemistry computing)",
  corpsource =   "Nat. Lab. for High Energy Phys., Ibaraki, Japan",
  fjournal =     "Lecture Notes in Computer Science",
  keywords =     "automatic Feynman diagram computation; event
                 generation; Feynman diagrams; high energy physics;
                 parallel machines; parallel virtual machine; physics
                 computing; PVM-GRACE; software packages; software
                 packet; virtual machines; workstation clusters",
  pubcountry =   "Germany",
  treatment =    "A Application; P Practical",
}

@InProceedings{Zambonelli:1996:EPP,
  author =       "F. Zambonelli and M. Pugassi and L. Leonardi and N.
                 Scarabottolo",
  title =        "Experiences on porting a {Parallel Objects}
                 environment from a transputer network to a {PVM-based}
                 system",
  crossref =     "IEEE:1996:PFE",
  pages =        "",
  year =         "1996",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110J
                 (Object-oriented programming); C6110P (Parallel
                 programming); C6115 (Programming support); C6150N
                 (Distributed systems software)",
  corpsource =   "Dipartimento di Elettronica Inf. e Sistemistica,
                 Bologna Univ., Italy",
  keywords =     "computer aided software engineering; heterogeneous
                 computer networks; massively parallel architecture;
                 object-oriented; parallel object-oriented programming;
                 Parallel Objects; parallel programming; programming;
                 programming environments; PVM environment; software
                 portability",
  treatment =    "P Practical",
}

@TechReport{Zhou:1996:FMP,
  author =       "Honbao Zhou and Al Geist",
  title =        "Faster Message Passing in {PVM}",
  institution =  "Mathematical Sciences Section, " # inst-ORNL,
  address =      inst-ORNL:adr,
  year =         "1996",
  pages =        "7",
  bibdate =      "Tue Jan 16 08:18:15 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.epm.ornl.gov/~zhou/patm.ps",
}

@Article{Adamo:1997:AOO,
  author =       "J.-M. Adamo",
  title =        "{ARCH}, An Object Oriented {MPI}-Based Library for
                 Asynchronous and Loosely Synchronous Parallel System
                 Programming",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "67--74",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Aguilar:1997:PMS,
  author =       "J. Aguilar and T. Jimenez",
  title =        "A Processors Management System for {PVM}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1300",
  pages =        "158--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Ahmad:1997:EVP,
  author =       "Ishfaq Ahmad",
  title =        "{Express} versus {PVM}: a performance comparison",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "23",
  number =       "6",
  pages =        "783--812",
  day =          "20",
  month =        jun,
  year =         "1997",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Aug 6 10:15:27 MDT 1999",
  bibsource =    "Compendex database;
                 http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1997&volume=23&issue=6;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1997&volume=23&issue=6&aid=1138",
  acknowledgement = ack-nhfb,
  affiliation =  "Hong Kong Univ of Science and Technology",
  affiliationaddress = "Kowloon, Hong Kong",
  classification = "716.1; 722.4; 723; 723.1",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  journalabr =   "Parallel Comput",
  keywords =     "Computer programming; Computer software portability;
                 Computer workstations; Data communication systems;
                 Hypercube computers; Interprocessor communication;
                 Parallel algorithms; Parallel processing systems;
                 Parallel virtual machine",
}

@Article{Alexandrov:1997:PMC,
  author =       "V. Alexandrov and K. Chan and A. Gibbons and W.
                 Rytter",
  title =        "On the {PVM\slash MPI} Computations of Dynamic
                 Programming Recurrences",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "305--312",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Alfaro:1997:FDW,
  author =       "F. J. Alfaro and J. A. Gallud and J. L. Sanchez",
  title =        "A Function to Dynamic Workload Allocation in
                 Distributed Applications",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "219--225",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Alonso:1997:PBB,
  author =       "J. L. Alonso and H. Schmidt and V. N. Alexandrov",
  title =        "Parallel Branch and Bound Algorithms for Integer and
                 Mixed Integer Linear Programming Problems under {PVM}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "313--320",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Anonymous:1997:TNR,
  author =       "Anonymous",
  title =        "Technology News \& Reviews: {Chemkin} software;
                 {OpenMP Fortran Standard}; {ODE} Toolbox for {Matlab};
                 {Java} products; {Scientific WorkPlace 3.0}",
  journal =      j-IEEE-COMPUT-SCI-ENG,
  volume =       "4",
  number =       "4",
  pages =        "75--78",
  month =        oct # "\slash " # dec,
  year =         "1997",
  CODEN =        "ISCEE4",
  ISSN =         "1070-9924 (print), 1558-190X (electronic)",
  ISSN-L =       "1070-9924",
  bibdate =      "Sat Jan 9 08:57:23 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/matlab.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://dlib.computer.org/cs/books/cs1997/pdf/c4075.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computational Science \& Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99",
}

@Article{Aversa:1997:MDP,
  author =       "R. Aversa and G. Iannello and N. Mazzocca",
  title =        "An {MPI} Driven Parallelization Strategy for Different
                 Computing Platforms: a Case Study",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "401--408",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bala:1997:PVQ,
  author =       "P. Bala and T. Clark and P. Grochowski and B. Lesyng",
  title =        "Parallel Version of a Quantum Classical Molecular
                 Dynamics Code for Complex Molecular and Biomolecular
                 Systems",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "409--416",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Baraglia:1997:IPW,
  author =       "R. Baraglia and M. Cosso and D. Laforenza and M.
                 Nicosia",
  title =        "Integrating {PVaniM} into {WAMM} for Monitoring
                 Meta-Applications",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "226--233",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Barbosa:1997:EUW,
  author =       "J. G. Barbosa and A. J. Padilha and J.-P. Madier and
                 T. Neubert",
  title =        "Experiments on Using {WPVM} for Industrial Visual
                 Inspection Problems",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1300",
  pages =        "828--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Beazley:1997:EMP,
  author =       "D. M. Beazley and P. S. Lomdahl",
  title =        "Extensible message passing application development and
                 debugging with {Python}",
  crossref =     "IEEE:1997:PIP",
  pages =        "650--655",
  year =         "1997",
  bibdate =      "Thu May 21 19:02:04 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C6110P (Parallel
                 programming); C6115
                 (Programming support); C6140D (High level languages);
                 C6150C (Compilers, interpreters and other processors);
                 C6150G (Diagnostic, testing, debugging and evaluating
                 systems); C6150N (Distributed systems software)",
  conftitle =    "Proceedings 11th International Parallel Processing
                 Symposium",
  corpsource =   "Dept. of Comput. Sci., Utah Univ., Salt Lake City, UT,
                 USA",
  keywords =     "application specific debugging; CM-5; Cray T3D;
                 extensible message passing application debugging;
                 extensible message passing application development;
                 interpreted object oriented scripting language;
                 large-scale message passing applications; message
                 passing; message passing program writing; molecular
                 dynamics application; MPI; multiprocessing systems;
                 object-oriented languages; parallel machines; parallel
                 programming; program debugging; program interpreters;
                 Python parallelisation; rapid prototyping; software
                 prototyping; Sun multiprocessor servers",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Parallel
                 Process.; ACM SIGARCH; Eur. Assoc. Theor. Comput. Sci.
                 (EATCS); Swiss Special Interest Group on Parallelism
                 (SIPAR); SPEEDUP Soc",
  treatment =    "P Practical",
}

@Article{Beisel:1997:EMD,
  author =       "T. Beisel and E. Gabriel and M. Resch",
  title =        "An Extension to {MPI} for Distributed Computing on
                 {MPPs}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "75--82",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bendtsen:1997:RLS,
  author =       "C. Bendtsen and Z. Zlatev",
  title =        "Running Large-Scale Air Pollution Models on Message
                 Passing Machines",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "417--426",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bhandarkar:1997:CRP,
  author =       "Suchendra M. Bhandarkar and Salem Machaka",
  title =        "Chromosome Reconstruction from Physical Maps Using a
                 Cluster of Workstations",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "11",
  number =       "1",
  pages =        "61--86",
  month =        mar,
  year =         "1997",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1023/A:1007913429509",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 6 12:13:06 MDT 2005",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=11&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.wkap.nl/issuetoc.htm/0920-8542+11+1+1997",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=11&issue=1&spage=61;
                 http://www.wkap.nl/oasis.htm/141471",
  acknowledgement = ack-nhfb,
  classification = "C1180 (Optimisation techniques); C6110P (Parallel
                 programming); C6150N (Distributed systems software);
                 C7330 (Biology and medical computing); C7430 (Computer
                 engineering)",
  corpsource =   "Dept. of Comput. Sci., Georgia Univ., Athens, GA,
                 USA",
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  keywords =     "Aspergillus nidulans; biology computing; central
                 computational problem; Chromosome IV; chromosome
                 reconstruction; classical NP complete optimal linear
                 arrangement problem; clonal data; clone ordering;
                 genetics; genomic library; heterogeneous collection;
                 Markov chain decomposition; microcanonical annealing;
                 networked computers; parallel algorithms; Parallel
                 Virtual Machine; physical maps; PVM system; simulated
                 annealing; single monolithic parallel computing
                 resource; software system; Unix workstations; virtual
                 machines; workstation cluster",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
}

@Article{Blackford:1997:PEN,
  author =       "L. S. Blackford and A. Cleary and A. Petitet and R. C.
                 Whaley and J. Demmel and I. Dhillon and H. Ren and K.
                 Stanley and J. Dongarra and S. Hammarling",
  title =        "Practical Experience in the Numerical Dangers of
                 Heterogeneous Computing",
  journal =      j-TOMS,
  volume =       "23",
  number =       "2",
  pages =        "133--147",
  month =        jun,
  year =         "1997",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/264029.264030",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/toms/1997-23-2/p133-blackford/",
  abstract =     "Special challenges exist in writing reliable numerical
                 library software for heterogeneous computing
                 environments. Although a lot of software for
                 distributed-memory parallel computers has been written,
                 porting this software to a network of workstations
                 requires careful consideration. The symptoms of
                 heterogeneous computing failures can range from
                 erroneous results without warning to deadlock. Some of
                 the problems are straightforward to solve, but for
                 others the solutions are not so obvious, or incur an
                 unacceptable overhead. Making software robust on
                 heterogeneous systems often requires additional
                 communication. We describe and illustrate the problems
                 encountered during the development of ScaLAPACK and the
                 NAG Numerical PVM Library. Where possible, we suggest
                 ways to avoid potential pitfalls, or if that is not
                 possible, we recommend that the software not be used on
                 heterogeneous networks.",
  acknowledgement = ack-rfb # " and " # ack-kr,
  fjournal =     "ACM Transactions on Mathematical Software",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
  keywords =     "distributed-memory systems; floating-point arithmetic;
                 heterogeneous processor networks; message passing;
                 numerical software; reliability",
  subject =      "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming, Distributed programming. {\bf
                 G.1.0} Mathematics of Computing, NUMERICAL ANALYSIS,
                 General, Computer arithmetic. {\bf G.1.0} Mathematics
                 of Computing, NUMERICAL ANALYSIS, General, Parallel
                 algorithms.",
}

@article{Blaheta:1997:PIP,
  author          = {R. Blaheta and O. Jakl and J. Stary},
  title           = {{PVM}-Implementation of the {PCG} Method with
                     Displacement Decomposition},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {321--328},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Bozas:1997:PED,
  author          = {G. Bozas and M. Fleischhauer and S. Zimmermann},
  title           = {{PVM} Experiences in Developing the {MIDAS} Parallel
                     Database System},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {427--434},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Bramley:1997:TNR,
  author          = {Randall Bramley},
  title           = {Technology News \& Reviews: {Chemkin} software;
                     {OpenMP Fortran Standard}; {ODE} Toolbox for
                     {Matlab}; {Java} products; {Scientific WorkPlace
                     3.0}},
  journal         = j-IEEE-COMPUT-SCI-ENG,
  fjournal        = {IEEE Computational Science \& Engineering},
  journal-url     = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99},
  volume          = 4,
  number          = 4,
  pages           = {75--78},
  month           = oct # "\slash " # dec,
  year            = 1997,
  coden           = {ISCEE4},
  issn            = {1070-9924 (print), 1558-190X (electronic)},
  issn-l          = {1070-9924},
  url             = {http://dlib.computer.org/cs/books/cs1997/pdf/c4075.pdf},
  bibdate         = {Sat Jan 9 08:57:23 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                     http://www.math.utah.edu/pub/tex/bib/ieeecomputscieng.bib;
                     http://www.math.utah.edu/pub/tex/bib/java.bib;
                     http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Bruck:1997:EMP,
  author          = {Jehoshua Bruck and Danny Dolev and Ching-Tien Ho and
                     Marcel-C{\u{a}}t{\u{a}}lin Ro{\c{s}}u and Ray Strong},
  title           = {Efficient Message Passing Interface ({MPI}) for
                     Parallel Computing on Clusters of Workstations},
  journal         = j-J-PAR-DIST-COMP,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/07437315},
  volume          = 40,
  number          = 1,
  pages           = {19--34},
  day             = 10,
  month           = jan,
  year            = 1997,
  coden           = {JPDCER},
  doi             = {https://doi.org/10.1006/jpdc.1996.1267},
  issn            = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l          = {0743-7315},
  url             = {http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1267/production;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1267/production/pdf;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1267/production/ref},
  bibdate         = {Thu Mar 9 09:19:01 MST 2000},
  bibsource       = {http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C5220P (Parallel architecture); C5440
                     (Multiprocessing systems); C5620L (Local area
                     networks); C6110P (Parallel programming); C6115
                     (Programming support)},
  corpsource      = {California Inst. of Technol., Pasadena, CA, USA},
  keywords        = {clusters of; collective communication functionality;
                     industrial; level reliable transport protocol; local
                     area networks; message passing; message passing
                     interface; MPI-CCL layer; parallel computing;
                     parallel programming; parallel programming
                     environments; point-to-point communication;
                     programming environments; standard; standards; user-;
                     workstations},
  treatment       = {A Application; P Practical},
}

@article{Brune:1997:HMP,
  author          = {Matthias Brune and J{\"o}rn Gehring and Alexander
                     Reinefeld},
  title           = {Heterogeneous Message Passing and a Link to Resource
                     Management},
  journal         = j-J-SUPERCOMPUTING,
  fjournal        = {The Journal of Supercomputing},
  journal-url     = {http://link.springer.com/journal/11227},
  volume          = 11,
  number          = 4,
  pages           = {355--369},
  month           = dec,
  year            = 1997,
  coden           = {JOSUED},
  doi             = {https://doi.org/10.1023/A:1007966723231},
  issn            = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l          = {0920-8542},
  url             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=11&issue=4&spage=355;
                     http://www.wkap.nl/oasis.htm/147011},
  bibdate         = {Wed Jul 6 12:13:07 MDT 2005},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=11&issue=4;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.wkap.nl/issuetoc.htm/0920-8542+11+4+1997},
  acknowledgement = ack-nhfb,
  classification  = {C5640 (Protocols); C6150J (Operating systems); C6150N
                     (Distributed systems software)},
  corpsource      = {Paderborn Univ., Germany},
  keywords        = {communication protocols; computer resource
                     management; message passing; MPI; parallel process
                     communication; portability; process mapping;
                     processor scheduling; protocols; PVM; resource
                     allocation; resource management},
  pubcountry      = {Netherlands},
  treatment       = {T Theoretical or Mathematical},
}

@article{Bubak:1997:EPA,
  author          = {M. Bubak and W. Funika and J. Moscinski},
  title           = {Evaluation of Parallel Application's Behavior in
                     Message Passing Environment},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {234--241},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Carvalho:1997:PCC,
  author          = {L. M. R. Carvalho and J. M. L. M. Palma},
  title           = {Parallelization of a {CFD} Code Using {PVM} and
                     Domain Decomposition Techniques},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1215,
  pages           = {247--??},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Aug 22 11:59:49 MDT 1997},
  bibsource       = {ftp://ftp.math.utah.edu/pub/bibnet/subjects/domain-decomp.bib;
                     http://www.math.utah.edu/pub/tex/bib/lncs1997a.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Ciegis:1997:NID,
  author          = {R. Ciegis and R. Sablinskas and J. Wasniewski},
  title           = {Numerical Integration on Distributed-Memory Parallel
                     Systems},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {329--336},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Clematis:1997:DNL,
  author          = {A. Clematis and A. Coda and M. Spagnuolo and M.
                     Mineter},
  title           = {Developing Non-Local Iterative Parallel Algorithms
                     for {GIS} on {Cray T3D} Using {MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {435--442},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Cotronis:1997:MPP,
  author          = {J. Y. Cotronis},
  title           = {Message-Passing Program Development by Ensemble},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {242--249},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Demuynck:1997:DOD,
  author          = {K. Demuynck and J. Broeckhove and F. Arickx},
  title           = {Dynamic Optimization of a Distributed {VR} System by
                     Network-Balancing},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {443--450},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Derakhshan:1997:PEP,
  author          = {M. Derakhshan and S. Hammarling and A. Krommer},
  title           = {{PINEAPL}: a {European} Project on {Parallel
                     Industrial Numerical Applications and Portable
                     Libraries}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {337--342},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{DiMartino:1997:IPD,
  author          = {B. {Di Martino} and A. Mazzeo and N. Mazzocca and U.
                     Villano},
  title           = {Interaction Patterns Detection in {PVM} Programs to
                     Support Simulation},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {250--256},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{DiMartino:1997:MDH,
  author          = {V. {Di Martino} and G. Ruocco},
  title           = {Molecular Dynamics on Hybrid Memory Machines},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {451--456},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{DiNapoli:1997:DCA,
  author          = {C. {Di Napoli} and M. Giordano and M. M. Furnari},
  title           = {Distributed and Cooperative Applications in {PVM}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {83--90},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Dongarra:1997:BCA,
  author          = {J. J. Dongarra and F. Desprez and A. Petitet and C.
                     Randriamaro},
  title           = {Block-Cyclic Array Redistribution on Networks of
                     Workstations},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {343--350},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Dongarra:1997:CSD,
  author          = {J. J. Dongarra and S. Hammarling and A. Petitet},
  title           = {Case studies on the development of {ScaLAPACK} and
                     the {NAG} Numerical {PVM} Library},
  crossref        = {Boisvert:1997:QNS},
  pages           = {236--248},
  year            = 1997,
  url             = {http://www.netlib.org/utk/papers/woco96/woco96.html;
                     http://www.netlib.org/utk/papers/woco96/woco96.ps;
                     http://www.netlib.org/utk/people/JackDongarra/pdf/woco96.pdf},
  bibdate         = {Thu Sep 16 09:48:36 MDT 1999},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): the page range of this paper is not recorded here.  The
%%% "??--??" value marks it explicitly as unknown (the same placeholder that
%%% Fischer:1997:AAP uses) instead of leaving an empty pages field.
@InProceedings{Dou:1997:ISV,
  author =       "Yong Dou and Zhengbing Pang and Xingming Zhou",
  title =        "Implementing a software virtual shared memory on
                 {PVM}",
  crossref =     "IEEE:1997:APD",
  pages =        "??--??",
  year =         "1997",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6120 (File organisation); C6140D (High level
                 languages); C7430 (Computer engineering)",
  corpsource =   "Dept. of Comput. Sci., Changsha Inst. of Technol.,
                 Hunan, China",
  keywords =     "distributed; FORTRAN; FORTRAN language; GKD-VSM;
                 memory environments; multithread scheme; parallel
                 programming; parallel programming model; Prefetch and
                 Poststore; programming environments; PVM; shared
                 memory; software overhead; software virtual shared
                 memory; synchronisation; user-level; virtual machines;
                 virtual storage",
  treatment =    "P Practical",
}

%%% NOTE(review): the page range was recorded as 457--435, which ends before
%%% it starts.  The start page 457 follows directly after
%%% DiMartino:1997:MDH (451--456) in the same LNCS volume; the end page 462
%%% is presumably right but should be confirmed against the printed
%%% proceedings.
@Article{Exbrayat:1997:OPS,
  author =       "M. Exbrayat and H. Kosch",
  title =        "Offering Parallelism to a Sequential Database
                 Management System on a Network of Workstations Using
                 {PVM}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "457--462",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@article{Fachat:1997:IEB,
  author          = {Andr{\'e} Fachat and Karl Heinz Hoffmann},
  title           = {Implementation of {Ensemble-Based Simulated
                     Annealing} with dynamic load balancing under {MPI}},
  journal         = j-COMP-PHYS-COMM,
  fjournal        = {Computer Physics Communications},
  journal-url     = {http://www.sciencedirect.com/science/journal/00104655},
  volume          = 107,
  number          = {1--3},
  pages           = {49--53},
  month           = dec,
  year            = 1997,
  coden           = {CPHCBZ},
  doi             = {https://doi.org/10.1016/S0010-4655(97)00096-9},
  issn            = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l          = {0010-4655},
  url             = {http://www.sciencedirect.com/science/article/pii/S0010465597000969},
  bibdate         = {Mon Feb 13 21:30:21 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm1990.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@techreport{Fagg:1997:HMAa,
  author          = {G. Fagg and J. Dongarra and A. Geist},
  title           = {Heterogeneous {MPI} Application Interoperation and
                     Process Management under {PVMPI}},
  type            = {Technical report},
  number          = {CS-97-???},
  institution     = inst-UTK,
  address         = inst-UTK:adr,
  month           = jun,
  year            = 1997,
  url             = {http://www.netlib.org/utk/papers/pvmmpi97.ps;
                     http://www.netlib.org/utk/people/JackDongarra/pdf/pvmmpi97.pdf},
  bibdate         = {Tue Feb 26 10:10:44 2002},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Fagg:1997:HMAb,
  author          = {G. E. Fagg and J. J. Dongarra and A. Geist},
  title           = {Heterogeneous {MPI} Application Interoperation and
                     Process Management under {PVMPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {91--98},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Fang:1997:MDD,
  author          = {Niandong Fang and Helmar Burkhart},
  title           = {{MPI-DDL}: a distributed-data library for {MPI}},
  journal         = j-FUT-GEN-COMP-SYS,
  fjournal        = {Future Generation Computer Systems},
  journal-url     = {http://www.sciencedirect.com/science/journal/0167739X},
  volume          = 12,
  number          = 5,
  pages           = {407--419},
  day             = 1,
  month           = apr,
  year            = 1997,
  coden           = {FGSEVI},
  issn            = {0167-739X (print), 1872-7115 (electronic)},
  issn-l          = {0167-739X},
  url             = {http://www.elsevier.com/gej-ng/10/19/19/27/17/23/abstract.html},
  bibdate         = {Wed Feb 27 12:41:16 MST 2002},
  bibsource       = {http://www.elsevier.com/locate/issn/0167739X;
                     http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@techreport{Fin:1997:CPM,
  author          = {Torsten Fin},
  title           = {Comparing the performance of {MPI}, {PVM}, and
                     {CORBA} on {Ethernet LANs}},
  type            = {{Berichte zur Rechnerarchitektur}},
  number          = {3(4)},
  institution     = {Institut f{\"u}r Informatik, Lehrstuhl f{\"u}r
                     Rechnerarchitektur und -kommunikation,
                     Friedrich-Schiller-Universit{\"a}t Jena},
  address         = {Jena, Germany},
  pages           = 12,
  year            = 1997,
  bibdate         = {Wed Aug 27 06:51:17 2003},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): the conference date in booktitle is given as 1997 so that
%%% it agrees with the year field and the citation key; the previously
%%% recorded March 1994 predates Windows 95, the subject of the paper.
%%% The editor, publisher, address, and pages values are "unknown"
%%% placeholders that still need to be filled in.
@InProceedings{Fischer:1997:AAP,
  author =       "Markus Fischer and Jack Dongarra",
  editor =       "????",
  booktitle =    "{Concurrent Computing Conference, Atlanta, GA, March
                 10--11, 1997}",
  title =        "Another Architecture: {PVM} on {Windows 95\slash NT}",
  publisher =    "????",
  address =      "????",
  pages =        "??--??",
  year =         "1997",
  bibdate =      "Tue Feb 26 10:10:44 2002",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.netlib.org/utk/people/JackDongarra/PAPERS/nt-paper.ps;
                 http://www.netlib.org/utk/people/JackDongarra/pdf/nt-paper.pdf",
  acknowledgement = ack-nhfb,
}

@article{Fischer:1997:ESP,
  author          = {M. Fischer and J. Simon},
  title           = {Embedding {SCI} into {PVM}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {177--184},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Foster:1997:MMC,
  author          = {Ian Foster and Jonathan Geisler and Carl Kesselman and
                     Steven Tuecke},
  title           = {Managing Multiple Communication Methods in
                     High-Performance Networked Computing Systems},
  journal         = j-J-PAR-DIST-COMP,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/07437315},
  volume          = 40,
  number          = 1,
  pages           = {35--48},
  day             = 10,
  month           = jan,
  year            = 1997,
  coden           = {JPDCER},
  doi             = {https://doi.org/10.1006/jpdc.1996.1266},
  issn            = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l          = {0743-7315},
  url             = {http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/pdf;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/ref},
  bibdate         = {Thu Mar 9 09:19:01 MST 2000},
  bibsource       = {http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {B6150M (Protocols); B6210L (Computer communications);
                     C5440 (Multiprocessing systems); C5470 (Performance
                     evaluation and testing); C5640 (Protocols); C5670
                     (Network performance)},
  corpsource      = {Div. of Math. and Comput. Sci., Argonne Nat. Lab.,
                     IL, USA},
  keywords        = {Argonne MPICH library; computer networks; computing
                     systems; criteria; heterogeneous networked
                     environment; high-performance networked; message
                     passing; message passing interface; multimethod
                     communication; multiple communication methods;
                     multithreaded runtime system; networked computing
                     environments; Nexus; Nexus-based MPI implementation;
                     performance characteristics; performance evaluation;
                     protocols; remote service request mechanisms;
                     transport mechanisms; user-specified selection},
  treatment       = {P Practical},
}

%%% The bracketed English text in the title is a translation of the German
%%% original; "verteilter MPP-Systeme" is rendered as "of distributed MPP
%%% systems" (MPP, not MPI, as in the German wording).
@TechReport{Gabriel:1997:EMU,
  author =       "Edgar Gabriel and Thomas Beisel and Michael Resch",
  title =        "{Erweiterung einer MPI-Umgebung zur
                 Interoperabilit{\"a}t verteilter MPP-Systeme}.
                 ({German}) [{Extension} of an {MPI} environment for
                 interoperability of distributed {MPP} systems]",
  type =         "{Studienarbeit angewandte Informatik}",
  number =       "RUS 37",
  institution =  "Rechenzentrum Universit{\"a}t Stuttgart",
  address =      "Stuttgart, Germany",
  year =         "1997",
  bibdate =      "Wed Aug 27 06:55:46 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  language =     "German",
}

%%% NOTE(review): the start page was recorded as 436, but a range of
%%% 436--469 would overlap other papers of the same LNCS volume listed in
%%% this file (for example Clematis:1997:DNL at 435--442 and
%%% Demuynck:1997:DOD at 443--450).  The start page 463 is presumably
%%% right but should be confirmed against the printed proceedings; the end
%%% page 469 fits Goumopoulos:1997:PCS, which starts at 470.
@Article{Galaktionov:1997:MST,
  author =       "A. S. Galaktionov and P. D. Anderson and G. W. M.
                 Peters",
  title =        "Mixing Simulations: Tracking Strongly Deforming Fluid
                 Volumes in {3D} Flows",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "463--469",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@article{Galibert:1997:YCL,
  author          = {O. Galibert},
  title           = {{YLC}, {A C++ Linda} System on Top of {PVM}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {99--106},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{GarciaSalcines:1997:PRR,
  author          = {E. {Garcia Salcines} and G. {Cerruela Garcia} and J.
                     I. {Benavides Benitez} and F. {Mu{\~n}oz Garcia}},
  title           = {Parallel Rendering of Radiance on Distributed Memory
                     System by {PVM}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {502--507},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Geist:1997:ACP,
  author          = {G. A. Geist},
  title           = {Advanced Capabilities in {PVM 3.4}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {107--115},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Geist:1997:BPW,
  author          = {G. A. Geist and J. A. Kohl and P. M. Papadopoulos and
                     S. L. Scott},
  title           = {Beyond {PVM 3.4}: What We've Learned, What's Next,
                     and Why},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {116--126},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Geist:1997:CPF,
  author          = {G. A. {Geist, II} and James Arthur Kohl and Philip M.
                     Papadopoulos},
  title           = {{CUMULVS}: Providing Fault Tolerance, Visualization,
                     and Steering of Parallel Applications},
  journal         = j-IJSAHPC,
  fjournal        = {International Journal of Supercomputer Applications
                     and High Performance Computing},
  volume          = 11,
  number          = 3,
  pages           = {224--235},
  month           = {Fall},
  year            = 1997,
  coden           = {IJSCFG},
  issn            = {1078-3482},
  bibdate         = {Wed Jul 23 11:38:50 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Gerlach:1997:ECS,
  author          = {J. Gerlach and M. Sato and Y. Ishikawa},
  title           = {Experiences with the {C++} Standard Template Library
                     and {MPI} for a Parallel Particle Simulation Method},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1225,
  pages           = {961--??},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Aug 22 11:59:49 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Gillett:1997:UMC,
  author          = {Richard Gillett and Richard Kaufmann},
  title           = {Using the {Memory Channel Network} --- Using a
                     cluster of standard {PCI-based} servers with a
                     low-cost network to improve communication
                     performance},
  journal         = j-IEEE-MICRO,
  fjournal        = {IEEE Micro},
  journalabr      = {IEEE Micro},
  journal-url     = {http://www.computer.org/csdl/mags/mi/index.html},
  volume          = 17,
  number          = 1,
  pages           = {19--25},
  month           = jan # "\slash " # feb,
  year            = 1997,
  coden           = {IEMIDZ},
  doi             = {https://doi.org/10.1109/40.566189},
  issn            = {0272-1732 (print), 1937-4143 (electronic)},
  issn-l          = {0272-1732},
  bibdate         = {Mon Apr 7 14:39:59 MDT 1997},
  bibsource       = {Compendex database;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation     = {Digital Equipment Corp},
  affiliationaddress = {MA, USA},
  classcodes      = {C5610N (Network interfaces); C5620 (Computer networks
                     and techniques)},
  classification  = {716.1; 722.1; 722.3; 722.4; 723.1; 723.2},
  corpsource      = {Digital Equip. Corp., USA},
  keywords        = {Bandwidth; clusters; Coding errors; Communication
                     channels (information theory); Computer networks;
                     computer networks; Computer software; Data
                     communication systems; Data handling; Data storage
                     equipment; Data transfer; DEC computers; Digital;
                     Latency; Lock acquisition; Lock release; Memory
                     channel; Memory Channel; Memory Channel Network;
                     Message passing; Message size; message-passing;
                     network for; network interfaces; Parallel processing
                     systems; PCI bus; Performance; Raw message passing;
                     Storage allocation (computer); Universal message
                     passing; UNIX},
  treatment       = {P Practical},
}

@article{Goumopoulos:1997:PCS,
  author          = {C. Goumopoulos and E. Housos and O. Liljenzin},
  title           = {Parallel Crew Scheduling on Workstation Networks
                     Using {PVM}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = 1332,
  pages           = {470--477},
  year            = 1997,
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@TechReport{Grabowsky:1997:MBK,
  author =       "Lothar Grabowsky",
  title =        "{MPI-basierte Koppelrandkommunikation und
                 Einfl{\"u}{\ss} der Partitionierung im 3D-Fall}.
                 ({German}) [{MPI}-based coupled edge communication and
                 influence of partitioning in the {3D} case]",
  type =         "Preprint-Reihe des Chemnitzer SFB 393",
  number =       "97,17",
  institution =  "Universit{\"a}t Chemnitz-Zwickau",
  address =      "Chemnitz, Germany",
  pages =        "13",
  year =         "1997",
  bibdate =      "Wed Aug 27 06:53:21 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  language =     "German",
}

@article{Grecki:1997:MPE,
  author          = {M. Grecki and G. Jablonski and A. Napieralski},
  title           = {{MOPS} --- Parallel Environment for Simulation of
                     Electronic Circuits Using Physical Models of
                     Semiconductor Devices},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1332},
  pages           = {478--485},
  year            = {1997},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Gropp:1997:HPM,
  author          = {William Gropp and Ewing Lusk},
  title           = {A high-performance {MPI} implementation on a
                     shared-memory vector supercomputer},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {22},
  number          = {11},
  pages           = {1513--1526},
  day             = {26},
  month           = jan,
  year            = {1997},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Fri Aug 6 10:14:43 MDT 1999},
  bibsource       = {Compendex database;
                     http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1997&volume=22&issue=11;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1997&volume=22&issue=11&aid=1113},
  acknowledgement = ack-nhfb,
  affiliation     = {Argonne Natl Lab},
  affiliationaddress = {IL},
  classification  = {722.1; 722.2; 722.4; 921.1; C5220P (Parallel
                     architecture); C5610N (Network interfaces); C6150N
                     (Distributed systems software)},
  corpsource      = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                     USA},
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
  journalabr      = {Parallel Comput},
  keywords        = {Data storage equipment; Interfaces (computer); message
                     passing; Message passing interface (mpi);
                     Message-Passing Interface; MPI implementation; MPIC;
                     NEC SX-4; network interfaces; parallel; Parallel
                     processing systems; Shared memory multiprocessors;
                     shared memory systems; shared-memory programming;
                     shared-memory vector supercomputer; standards;
                     supercomputer; Supercomputers; Vectors},
  treatment       = {P Practical},
}

@article{Gropp:1997:SMC,
  author          = {W. Gropp and E. Lusk},
  title           = {Sowing {MPICH}: a Case Study in the Dissemination of a
                     Portable Environment for Parallel Scientific
                     Computing},
  journal         = j-IJSAHPC,
  volume          = {11},
  number          = {2},
  pages           = {103--114},
  month           = {Summer},
  year            = {1997},
  CODEN           = {IJSCFG},
  ISSN            = {1078-3482},
  bibdate         = {Thu Jun 26 18:17:48 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of Supercomputer Applications
                     and High Performance Computing},
}

@article{Gropp:1997:WPM,
  author          = {W. Gropp and E. Lusk},
  title           = {Why Are {PVM} and {MPI} So Different?},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1332},
  pages           = {3--10},
  year            = {1997},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@inproceedings{Guan:1997:PDI,
  author          = {Huiwei Guan and Chi-kwong Li and To-yat Cheung and
                     Songnian Yu},
  title           = {Parallel design and implementation of {SOM} neural
                     computing model in {PVM} environment of a distributed
                     system},
  crossref        = {IEEE:1997:APD},
  pages           = {26--31},
  year            = {1997},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C1230D (Neural nets); C5220P (Parallel
                     architecture); C5290 (Neural computing techniques)},
  conflocation    = {Shanghai, China; 19-21 March 1997},
  conftitle       = {Proceedings. Advances in Parallel and Distributed
                     Computing},
  corpsource      = {Dept. of Comput. Sci., City Univ. of Hong Kong, Hong
                     Kong},
  keywords        = {architectures; distributed; machines; message passing;
                     neural net architecture; parallel; parallel virtual
                     machine; PVM environment; self-organising feature maps;
                     SOM neural computing model; system; virtual},
  treatment       = {T Theoretical or Mathematical},
}

@article{Hempel:1997:IMN,
  author          = {R. Hempel and H. Ritzdorf and F. Zimmermann},
  title           = {Implementation of {MPI} on {NEC}'s {SX-4} Multi-Node
                     Architecture},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1332},
  pages           = {185--193},
  year            = {1997},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Hoyos-Rivera:1997:UPB,
  author          = {G. J. Hoyos-Rivera and V. G. Sanchez-Arias},
  title           = {Using {PVM} to Build an Interface to Support
                     Cooperative Work in a Distributed Systems Environment},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1332},
  pages           = {127--134},
  year            = {1997},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@inproceedings{Hwang:1997:EMC,
  author          = {Kai Hwang and Choming Wang and Cho-Li Wang},
  title           = {Evaluating {MPI} collective communication on the
                     {SP2}, {T3D}, and {Paragon} multicomputers},
  crossref        = {IEEE:1997:TIS},
  pages           = {106--115},
  year            = {1997},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification  = {C5220P (Parallel architecture); C5440
                     (Multiprocessing systems); C5470 (Performance
                     evaluation and testing)},
  conftitle       = {Proceedings Third International Symposium on
                     High-Performance Computer Architecture},
  corpsource      = {Hong Kong Univ., Hong Kong},
  keywords        = {architectural support; closed-form expressions; Cray
                     T3D; IBM SP2; Intel Paragon; message passing; MPI
                     collective communication evaluation; multiprocessing
                     systems; Paragon multicomputers; performance
                     evaluation; STAP benchmark experiments; startup
                     latency; synchronisation; timing; timing performance},
  sponsororg      = {IEEE Computer. Soc. Tech. Committee on Comput.
                     Archit},
  treatment       = {P Practical},
}

@article{Jabbarzadeh:1997:PSS,
  author          = {A. Jabbarzadeh and J. D. Atkinson and R. I. Tanner},
  title           = {Parallel simulation of shear flow of polymers between
                     structured walls by molecular dynamics simulation on
                     {PVM}},
  journal         = j-COMP-PHYS-COMM,
  volume          = {107},
  number          = {1--3},
  pages           = {123--136},
  month           = dec,
  year            = {1997},
  CODEN           = {CPHCBZ},
  DOI             = {https://doi.org/10.1016/S0010-4655(97)00088-X},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  bibdate         = {Mon Feb 13 21:30:21 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm1990.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S001046559700088X},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Jackson:1997:SYE,
  author          = {D. J. Jackson and C. W. Humphres},
  title           = {A simple yet effective load balancing extension to the
                     {PVM} software system},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {22},
  number          = {12},
  pages           = {1647--1660},
  day             = {21},
  month           = feb,
  year            = {1997},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource       = {Compendex database;
                     http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1997&volume=22&issue=12;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1997&volume=22&issue=12&aid=1112},
  acknowledgement = ack-nhfb,
  classification  = {C4140 (Linear algebra); C5260B (Computer vision and
                     image processing techniques); C6110P (Parallel
                     programming); C6150E (General utility programs); C6150N
                     (Distributed systems software)},
  corpsource      = {Dept. of Electr. Eng., Alabama Univ., Tuscaloosa, AL,
                     USA},
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
  keywords        = {algorithms; application program interfaces; coding;
                     data compression; host CPU load information; image;
                     information gathering; load; load balancing; load
                     balancing extension; master process; matrix algebra;
                     matrix oriented; NAS parallel benchmarks; parallel;
                     parallel algorithms; parallel fractal image compression
                     algorithm; parallel runtime performance; process spawn;
                     processes; programming; programming interface; PVM
                     software system; resource allocation; slave},
  pubcountry      = {Netherlands},
  treatment       = {P Practical},
}

@article{Kacsuk:1997:GDD,
  author          = {Peter Kacsuk and Jose C. Cunha and Gabor Dozsa and
                     Joao Lourenco and Tibor Fadgyas and Tiago Antao},
  title           = {A graphical development and debugging environment for
                     parallel programs},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {22},
  number          = {13},
  pages           = {1747--1770},
  day             = {28},
  month           = feb,
  year            = {1997},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Tue Oct 21 15:14:48 MDT 1997},
  bibsource       = {Compendex database;
                     http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1997&volume=22&issue=13;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1997&volume=22&issue=13&aid=1126},
  acknowledgement = ack-nhfb,
  affiliation     = {Hungarian Acad of Sciences},
  affiliationaddress = {Budapest, Hung},
  classification  = {722.2; 722.4; 723.1; 723.1.1; 723.5; C6110P
                     (Parallel programming); C6110V (Visual programming);
                     C6115 (Programming support); C6150G (Diagnostic,
                     testing, debugging and evaluating systems); C6180G
                     (Graphical user interfaces)},
  conference      = {Proceedings of the 1996 Workshop on Distributed and
                     Parallel Systems, DAPSYS},
  corpsource      = {KFKI-MSZKI Res. Inst. for Meas. and Comput. Tech.,
                     Hungarian Acad. of Sci., Budapest, Hungary},
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
  journalabr      = {Parallel Comput},
  keywords        = {abstraction mechanism; complex programming
                     environment; Computer aided software engineering;
                     Computer programming; Computer programming languages;
                     data visualization; debugging; debugging engine;
                     debugging environment; distributed; Distributed
                     computer systems; Distributed debugging engine;
                     distributed memory computer architectures; GRADE;
                     graphical; graphical development; graphical user
                     interface; Graphical user interfaces; graphical user
                     interfaces; GRAPNEL; high-level graphical support;
                     language; languages; machine; message-; parallel;
                     Parallel processing systems; parallel programming;
                     Parallel programs; parallel virtual; Parallel virtual
                     machine; passing parallel programs; performance
                     monitoring; program; Program debugging; programming
                     environments; programs; PROVE; Software Package grade;
                     Software Package grapnel; software tools; Tape/PVM;
                     visual},
  meetingaddress  = {Miskolc, Hung},
  meetingdate     = {Oct 1996},
  meetingdate2    = {10/96},
  treatment       = {A Application; P Practical},
}

@article{Kitowski:1997:CPM,
  author          = {J. Kitowski and K. Boryczko and J. Moscinski},
  title           = {Comparison of {PVM} and {MPI} Performance in
                     Short-Range Molecular Dynamics Simulation},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1332},
  pages           = {11--16},
  year            = {1997},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Konuru:1997:MUL,
  author          = {Ravi B. Konuru and Steve W. Otto and Jonathan
                     Walpole},
  title           = {A Migratable User-Level Process Package for {PVM}},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {40},
  number          = {1},
  pages           = {81--102},
  day             = {10},
  month           = jan,
  year            = {1997},
  CODEN           = {JPDCER},
  DOI             = {https://doi.org/10.1006/jpdc.1996.1270},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  bibdate         = {Thu Mar 9 09:19:01 MST 2000},
  bibsource       = {http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1270/production;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1270/production/pdf;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1270/production/ref},
  acknowledgement = ack-nhfb,
  classification  = {C4240C (Computational complexity); C5440
                     (Multiprocessing systems); C6110P (Parallel
                     programming); C6115 (Programming support); C6150G
                     (Diagnostic, testing, debugging and evaluating
                     systems); C7430 (Computer engineering)},
  corpsource      = {IBM Thomas J. Watson Res. Center, Yorktown Heights,
                     NY, USA},
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
  keywords        = {adaptive load distribution; application debugging;
                     application transparency; application-transparent
                     migration; availability; based programming model;
                     computational complexity; computing; distributed memory
                     multiprocessor; dynamic environment; dynamic
                     multiprocessor environment; machine; machines; message
                     passing; message-; microbenchmarks; migratable
                     user-level process package; parallel; parallel
                     programming; parallel virtual; program debugging; PVM;
                     system load; unobtrusive; unpredictable variability;
                     user-level process; virtual; virtual processor;
                     workstation; workstation networks; workstation
                     ownership},
  treatment       = {A Application; P Practical},
}

@Article{Kormicki:1997:PLS,
  author =       "Maciek Kormicki and Ausif Mahmood and Bradley S.
                 Carlson",
  title =        "Parallel logic simulation on a network of workstations
                 using parallel virtual machine",
  journal =      j-TODAES,
  volume =       "2",
  number =       "2",
  pages =        "123--134",
  month =        apr,
  year =         "1997",
  CODEN =        "ATASFO",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Fri Jul 27 10:05:33 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/todaes/1997-2-2/p123-kormicki/p123-kormicki.pdf;
                 http://www.acm.org/pubs/citations/journals/todaes/1997-2-2/p123-kormicki/",
  abstract =     "This paper explores parallel logic simulation on a
                 network of workstations using a parallel virtual
                 machine (PVM). A novel parallel implementation of the
                 centralized-time event-driven logic simulation
                 algorithm is carried out such that no global
                 controlling workstation is needed to synchronize the
                 advance of simulation time. Further advantages of our
                 new approach include a random partitioning of the
                 circuit onto available workstations and a pipelined
                 execution of the different phases of the simulation
                 algorithm. To achieve a better load balance, we employ
                 a semioptimistic scheme for gate evaluations (in
                 conjunction with a centralized-time algorithm) such
                 that no rollback is required. The performance of this
                 implementation has been evaluated using the ISCAS
                 benchmark circuits. Speedups improve with the size of
                 the circuit and the activity level in the circuit.
                 Analyses of the communication overhead show that the
                 techniques developed here will yield even higher gains
                 as newer networking technologies like ATM are employed
                 to connect workstations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems (TODAES)",
  generalterms = "Algorithms; Performance; Verification",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
  keywords =     "distributed computing; parallel logic simulation; PVM;
                 synchronous simulation",
  subject =      "Hardware --- Logic Design --- Design Aids (B.6.3):
                 {\bf Simulation}; Hardware --- Integrated Circuits ---
                 Design Aids (B.7.2): {\bf Simulation}",
}

@article{Krantz:1997:CSC,
  author          = {A. T. Krantz and V. S. Sunderam},
  title           = {Client Server Computing on Message Passing Systems:
                     Experiences with {PVM-RPC}},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1300},
  pages           = {110--??},
  year            = {1997},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Tue Apr 28 08:51:33 MDT 1998},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Krotz-Vogel:1997:PPP,
  author          = {W. Krotz-Vogel and H.-C. Hoppe},
  title           = {The {PALLAS} Parallel Programming Environment},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1332},
  pages           = {257--266},
  year            = {1997},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Lauria:1997:MFH,
  author          = {Mario Lauria and Andrew Chien},
  title           = {{MPI-FM}: High Performance {MPI} on Workstation
                     Clusters},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {40},
  number          = {1},
  pages           = {4--18},
  day             = {10},
  month           = jan,
  year            = {1997},
  CODEN           = {JPDCER},
  DOI             = {https://doi.org/10.1006/jpdc.1996.1264},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  bibdate         = {Thu Mar 9 09:19:01 MST 2000},
  bibsource       = {http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1264/production;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1264/production/pdf;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1264/production/ref},
  acknowledgement = ack-nhfb,
  classification  = {B6150M (Protocols); B6210L (Computer
                     communications); C5440 (Multiprocessing systems); C5470
                     (Performance evaluation and testing); C5620L (Local
                     area networks); C5640 (Protocols); C5670 (Network
                     performance)},
  corpsource      = {Dipartimento di Inf. e Sistemistica, Naples Univ.,
                     Italy},
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
  keywords        = {application level; bandwidth; communication layers;
                     communication performance; communication pipeline
                     stages; communication software; Cray; Cray T3D; design
                     solutions; evaluation; fast messages library; hardware
                     performance; high; high level messaging library; high
                     performance MPI; high speed LANs; IBM SP2; interface;
                     latency; level messaging layer; local area networks;
                     low; low level; low level communication layers; low
                     level messaging layer; message passing; message passing
                     interface; minimum; minimum one-way latency; MPI-FM;
                     Myrinet network; one-way latency; performance;
                     performance evaluation; protocols; SPARCstation 20
                     workstations; speed LANs; T3D; workstation clusters;
                     workstations},
  treatment       = {A Application; P Practical},
}

@inproceedings{Li:1997:EHC,
  author          = {Konming Gary Li and Nabil M. Zamel},
  title           = {An Evaluation of {HPF} Compilers and the
                     Implementation of a Parallel Linear Equation Solver
                     Using {HPF} and {MPI}},
  crossref        = {ACM:1997:SHP},
  pages           = {??--??},
  year            = {1997},
  bibdate         = {Sat Mar 21 08:51:09 1998},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.supercomp.org/sc97/proceedings/TECH/LI/INDEX.HTM},
  acknowledgement = ack-nhfb,
}

@article{Li:1997:PIO,
  author          = {Wei Li and Xiaohu Huang and Nanning Zheng},
  title           = {Parallel implementing {OpenGL} on {PVM}},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {23},
  number          = {12},
  pages           = {1839--1850},
  day             = {15},
  month           = dec,
  year            = {1997},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Fri Aug 6 10:15:16 MDT 1999},
  bibsource       = {Compendex database;
                     http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1997&volume=23&issue=12;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1997&volume=23&issue=12&aid=1248},
  acknowledgement = ack-nhfb,
  affiliation     = {Xi'an Jiaotong Univ},
  affiliationaddress = {Xi'an, China},
  classification  = {722.4; 723.2},
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
  journalabr      = {Parallel Comput},
  keywords        = {Algorithms; Data decomposition; Image processing; Load
                     balancing; Parallel processing systems; Parallel
                     virtual machine; Task granularity; Three dimensional;
                     Virtual reality},
}

@article{Lu:1997:QPD,
  author          = {Honghui Lu and Sandhya Dwarkadas and Alan L. Cox and
                     Willy Zwaenepoel},
  title           = {Quantifying the Performance Differences between {PVM}
                     and {TreadMarks}},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {43},
  number          = {2},
  pages           = {65--78},
  day             = {15},
  month           = jun,
  year            = {1997},
  CODEN           = {JPDCER},
  DOI             = {https://doi.org/10.1006/jpdc.1997.1332},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  bibdate         = {Thu Mar 9 09:19:03 MST 2000},
  bibsource       = {http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1332/production;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1332/production/pdf;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1332/production/ref},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Ludwig:1997:OUI,
  author =       "T. Ludwig and R. Wism{\"u}ller",
  title =        "{OMIS 2.0} --- a Universal Interface for Monitoring
                 Systems",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "267--276",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Luecke:1997:HPF,
  author =       "G. R. Luecke and J. J. Coyle",
  title =        "{High Performance Fortran} versus explicit message
                 passing on the {IBM SP-2} for the parallel {LU}, {QR},
                 and {Cholesky} factorizations",
  journal =      j-SUPERCOMPUTER,
  volume =       "13",
  number =       "2",
  pages =        "4--14",
  month =        "????",
  year =         "1997",
  CODEN =        "SPCOEL",
  ISSN =         "0168-7875",
  bibdate =      "Wed Mar 18 08:37:01 MST 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "C4140 (Linear algebra); C5440 (Multiprocessing
                 systems); C6110P (Parallel programming); C6150N
                 (Distributed systems software); C7310 (Mathematics
                 computing)",
  corpsource =   "Iowa State Univ., Ames, IA, USA",
  fjournal =     "Supercomputer",
  keywords =     "BLACS; BLAS; Cholesky factorizations; ESSL library;
                 explicit message passing; FORTRAN; High Performance
                 Fortran; high-performance parallel implementations; IBM
                 computers; IBM SP-2; LU factorizations; mathematics
                 computing; matrix decomposition; message passing; MPI
                 version; parallel computer; parallel languages;
                 parallel machines; parallel programming; QR
                 factorizations; SCALAPACK; software development;
                 software libraries; software maintenance; software
                 performance evaluation; Visual Numerics",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
}

@article{Manegold:1997:QBM,
  author          = {S. Manegold and F. Waas and D. Gudlat},
  title           = {In Quest of the Bottleneck --- Monitoring Parallel
                     Database Systems},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1332},
  pages           = {277--284},
  year            = {1997},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Mazzariol:1997:PCS,
  author          = {M. Mazzariol and B. A. Gennart and V. Messerli and R.
                     D. Hersch},
  title           = {Performance of {CAP}-Specified Linear Algebra
                     Algorithms},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1332},
  pages           = {351--358},
  year            = {1997},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Thu Dec 9 06:27:54 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{McDonald:1997:IPT,
  author = {Chris McDonald and Kamran Kazemi},
  title = {Improving the {PVM} teaching environment},
  journal = j-SIGCSE,
  volume = 29,
  number = 1,
  pages = {219--223},
  month = mar,
  year = 1997,
  coden = {SIGSD3},
  doi = {https://doi.org/10.1145/268085.268167},
  issn = {0097-8418 (print), 2331-3927 (electronic)},
  issn-l = {0097-8418},
  bibdate = {Sat Nov 17 18:57:38 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sigcse1990.bib},
  abstract = {The parallel programming community has long recognized the
    need for a simple programming environment offering interprocess
    communication between heterogeneous systems. As the Parallel Virtual
    Machine environment, PVM, has emerged to meet this goal, an
    increasing number of educational institutions are choosing PVM to
    support their teaching of parallel and distributed computing using
    networks of workstations. However, it is often the nature of PVM's
    design and implementation that can severely limit its success in a
    teaching environment. This paper first motivates and then describes
    improvements to the PVM environment which increase both robustness
    and efficiency in an educational setting.},
  acknowledgement = ack-nhfb,
  fjournal = {SIGCSE Bulletin (ACM Special Interest Group on Computer
    Science Education)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J688},
}

@Article{Mintchev:1997:TPM,
  author = {S. Mintchev and V. Getov},
  title = {Towards Portable Message Passing in {Java}: Binding {MPI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {135--142},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Mysliwiec:1997:CAM,
  author = {G. Mysliwiec and J. Sipowicz and R. Schaefer},
  title = {Control Activities in Message Passing Environment},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {143--150},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Mysliwiec:1997:IPS,
  author = {G. Mysliwiec and J. Sipowicz and H. Burkhart},
  title = {Implementing Parallel {SBS}-Type Linear Solvers Using
    {ALWAN}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {359--366},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Overeinder:1997:BCD,
  author = {B. J. Overeinder and P. M. A. Sloot},
  title = {Breaking the Curse of Dynamics by Task Migration: Pilot
    Experiments in the {Polder Metacomputer}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {194--207},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Book{Pacheco:1997:PPM,
  author = {Peter S. Pacheco},
  title = {Parallel programming with {MPI}},
  publisher = pub-MORGAN-KAUFMANN,
  address = pub-MORGAN-KAUFMANN:adr,
  pages = {xxii + 418},
  year = 1997,
  isbn = {1-55860-339-5},
  isbn-13 = {978-1-55860-339-4},
  lccn = {QA76.642 .P3 1997},
  bibdate = {Fri Feb 04 17:32:19 2000},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Peinado:1997:HPC,
  author = {M. Peinado and R. Venkatesan},
  title = {Highly Parallel Cryptographic Attacks},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {367--374},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Pernice:1997:BRM,
  author = {Michael Pernice},
  title = {Book Review: {{\em MPI: The Complete Reference}}},
  journal = j-IEEE-CONCURR,
  volume = 5,
  number = 1,
  pages = {80--81},
  month = jan # {\slash } # mar,
  year = 1997,
  coden = {IECMFX},
  doi = {https://doi.org/10.1109/MCC.1997.580453},
  issn = {1092-3063 (print), 1558-0849 (electronic)},
  issn-l = {1092-3063},
  bibdate = {Tue Jan 16 06:49:26 2001},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://dlib.computer.org/pd/books/pd1997/pdf/p1080.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Concurrency},
}

@Article{Petcu:1997:ISM,
  author = {D. Petcu},
  title = {Implementation of Some Multiprocessor Algorithms for {ODEs}
    Using {PVM}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {375--382},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Piernas:1997:APM,
  author =       "J. Piernas and A. Flores and J. M. Garc{\'\i}a",
  title =        "Analyzing the Performance of {MPI} in a Cluster of
                 Workstations Based on {Fast Ethernet}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "17--24",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Reinhard:1997:MHP,
  author = {E. Reinhard and A. Chalmers},
  title = {Message Handling in Parallel Radiance},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {486--493},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Resch:1997:CMP,
  author =       "M. Resch and H. Berger and T. B{\"o}nisch",
  title =        "A Comparison of {MPI} Performance on Different
                 {MPPs}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "25--32",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@TechReport{Resch:1997:PM,
  author = {Michael Resch and Thomas Beisel and Holger Berger},
  title = {{PACX-MPI}},
  type = {{BI: Informationen f{\"u}r Nutzer des Rechenzentrums}},
  number = {1997,11/12},
  institution = {Universit{\"a}t Stuttgart, Zentrale
    Universit{\"a}tseinrichtung},
  address = {Stuttgart, Germany},
  year = 1997,
  bibdate = {Wed Aug 27 07:18:18 2003},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@TechReport{Resch:1997:PMC,
  author = {Michael Resch and Holger Berger and Thomas B{\"o}nisch},
  title = {Performance of {MPI} on a {Cray T3E-512}},
  type = {{BI: Informationen f{\"u}r Nutzer des Rechenzentrums}},
  number = {1997,5/6},
  institution = {Universit{\"a}t Stuttgart, Zentrale
    Universit{\"a}tseinrichtung},
  address = {Stuttgart, Germany},
  pages = {??},
  year = 1997,
  bibdate = {Wed Aug 27 07:14:37 2003},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note = {Third European CRAY-SGI MPP Workshop.},
  acknowledgement = ack-nhfb,
}

@Article{Roda:1997:PPI,
  author =       "J. L. Roda and C. Rodr{\'\i}guez and F. Almeida and D.
                 Gonz{\'a}lez-Morales",
  title =        "Predicting the Performance of Injection Communication
                 Patterns on {PVM}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "33--40",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Rough:1997:PRD,
  author = {J. Rough and A. Goscinski and D. {De Paoli}},
  title = {{PVM} on the {RHODOS} Distributed Operating System},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {208--218},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Roy:1997:PNT,
  author = {R. Roy and Z. Stankovski},
  title = {Parallelization of Neutron Transport Solvers},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {494--501},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Santos:1997:ECP,
  author =       "L. P. Santos and V. Castro and A. Proen{\c{c}}a",
  title =        "Evaluation of the Communication Performance on a
                 Parallel Processing System",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "41--48",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Saphir:1997:SMI,
  author = {William Saphir},
  title = {A Survey of {MPI} Implementations},
  journal = {NHSE Review},
  volume = 2,
  number = 1,
  pages = {??--??},
  month = nov,
  year = 1997,
  bibdate = {Wed Jan 14 05:59:12 2004},
  bibsource = {http://www.crpc.rice.edu/NHSEreview/96-1.html;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  keywords = {National HPCC Software Exchange (NHSE); Rice University},
}

@Article{Serot:1997:EPF,
  author =       "J. S{\'e}rot",
  title =        "Embodying Parallel Functional Skeletons: An
                 Experimental Implementation on Top of {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1300",
  pages =        "629--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Silva:1997:IPD,
  author =       "Lu{\'\i}s M. Silva and Jo{\~a}o Gabriel Silva and Simon
                 Chapple",
  title =        "Implementation and Performance of {DSMPI}",
  journal =      j-SCI-PROG,
  volume =       "6",
  number =       "2",
  pages =        "201--214",
  month =        "Summer",
  year =         "1997",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Thu Mar 28 12:27:27 MST 2002",
  bibsource =    "Compendex database;
                 ftp://ftp.ira.uka.de/bibliography/Parallel/dsm.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
  xxauthor =     "L. M. Silva and S. Chapple and J. G. Silva",
  xxpages =      "210--214",
}

@Article{Soch:1997:PGP,
  author =       "M. {\v{S}}och and P. Tvrd{\'\i}k and M. Volf",
  title =        "Parallel Graph-Partitioning Using the Mob Heuristic",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "383--389",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Song:1997:ALL,
  author = {Jianjian Song and Heng Kek Choo and Kuok Ming Lee},
  title = {Application-level load migration and its implementation on
    top of {PVM}},
  journal = j-CPE,
  volume = 9,
  number = 1,
  pages = {1--19},
  month = jan,
  year = 1997,
  coden = {CPEXEI},
  issn = {1040-3108},
  issn-l = {1040-3108},
  bibdate = {Wed Apr 16 06:39:19 MDT 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C6150J (Operating systems); C6150N (Distributed
    systems software)},
  corpsource = {Nat. Supercomput. Res. Center, Nat. Univ. of Singapore,
    Singapore},
  fjournal = {Concurrency, practice and experience},
  keywords = {concurrency; load migration; location transparency;
    network operating systems; operating systems (computers); parallel
    processing; process migration; PVM; receive buffer; residual
    dependency; resource allocation; virtual machines},
  pubcountry = {UK},
  treatment = {P Practical},
}

@Article{Souza:1997:EPH,
  author = {P. S. Souza and L. J. Senger and M. J. Santana and R. C.
    Santana},
  title = {Evaluating Personal High Performance Computing with {PVM} on
    {Windows} and {LINUX} Environments},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {49--56},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Squyres:1997:DEM,
  author = {J. M. Squyres and B. Saphir and A. Lumsdaine},
  title = {The Design and Evolution of the {MPI-2 C++} Interface},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1343,
  pages = {57--??},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Tue Apr 28 08:51:33 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Stellner:1997:LBB,
  author = {G. Stellner and J. Trinitis},
  title = {Load Balancing Based on Process Migration for {MPI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1300,
  pages = {150--??},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Tue Apr 28 08:51:33 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Strietzel:1997:PTS,
  author = {M. Strietzel},
  title = {Parallel Turbulence Simulation: Resolving the Inertial
    Subrange of {Kolmogorov}'s Spectra},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {508--516},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Sunderam:1997:TAS,
  author = {V. Sunderam and B. Topol and S. Moyer and A. Krantz},
  title = {Tools and Auxiliary Subsystems in {PVM}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {285--294},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Theodoropoulos:1997:GSP,
  author = {P. Theodoropoulos and P. Tsanakas and G. Papakonstantinou},
  title = {Global Semaphores in a Parallel Programming Environment},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {151--158},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Uminski:1997:EEP,
  author = {P. W. Uminski and M. R. Matuszek and H. Krawczyk},
  title = {Experimental Evaluation of {PVM} Group Communication},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {57--66},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Book{vandeGeijn:1997:UPP,
  author = {Robert A. {van de Geijn}},
  title = {Using {PLAPACK}: Parallel Linear Algebra Package},
  publisher = pub-MIT,
  address = pub-MIT:adr,
  pages = {xvii + 194},
  year = 1997,
  isbn = {0-262-72026-4},
  isbn-13 = {978-0-262-72026-7},
  lccn = {QA185.D37 V36 1997},
  bibdate = {Fri Dec 19 10:39:21 1997},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note = {With contributions by Philip Alpatov and others.},
  price = {US\$27.50},
  acknowledgement = ack-nhfb,
}

@Article{Vlassov:1997:SSM,
  author =       "V. Vlassov and L.-E. Thorelli",
  title =        "A Synchronizing Shared Memory: Model and Programming
                 Implementation",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "159--166",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Wang:1997:TPD,
  author = {Paul S. Wang},
  title = {Tools for parallel\slash distributed mathematical
    computation},
  crossref = {ACM:1997:PPS},
  pages = {188--195},
  year = 1997,
  bibdate = {Tue Sep 28 07:51:05 MDT 1999},
  bibsource = {Compendex database;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  affiliation = {Kent State Univ},
  affiliationaddress = {USA},
  classification = {722.3; 722.4; 723; 723.5; 921; 921.1},
  keywords = {Algebra; Common Lisp; Computational methods; Computer
    networks; Computer software; Data communication systems; Interfaces
    (computer); Lisp (programming language); Multi protocol (MP);
    Multiple instruction multiple data (MIMD) parallel machines; Network
    protocols; Parallel processing systems; Parallel virtual machines
    (PVM); Program compilers; Symbolic and algebraic computation (SAC);
    Virtual reality},
}

@Article{Winstanley:1997:PDP,
  author = {N. Winstanley and J. O'Donnell},
  title = {Parallel Distributed Programming with {Haskell+PVM}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1300,
  pages = {670--??},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Tue Apr 28 08:51:33 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Wismueller:1997:DMP,
  author =       "R. Wism{\"u}ller",
  title =        "Debugging Message Passing Programs Using Invisible
                 Message Tags",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1332",
  pages =        "295--304",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Dec 9 06:27:54 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Wolf:1997:CMP,
  author = {K. Wolf and E. Brakkee and D. P. Ho},
  title = {Communication in Multi-Physics Applications},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {167--176},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Yalamov:1997:BRT,
  author = {Plamen Y. Yalamov and Svetozar Margenov},
  title = {Book Reviews: Two books on {MPI}: {{\em Parallel Programming
    with MPI}}; {{\em MPI: The Complete Reference (2nd printing)}}},
  journal = j-IEEE-CONCURR,
  volume = 5,
  number = 4,
  pages = {81--81},
  month = oct # {\slash } # dec,
  year = 1997,
  coden = {IECMFX},
  doi = {https://doi.org/10.1109/MCC.1997.580454},
  issn = {1092-3063 (print), 1558-0849 (electronic)},
  issn-l = {1092-3063},
  bibdate = {Mon Jun 7 07:52:29 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://dlib.computer.org/pd/books/pd1997/pdf/p4080.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Concurrency},
}

@Article{Zhang:1997:DED,
  author = {Xiaodong Zhang and Sandra G. Dykes and Hong Deng},
  title = {Distributed Edge Detection: Issues and Implementations},
  journal = j-IEEE-COMPUT-SCI-ENG,
  volume = 4,
  number = 1,
  pages = {72--82},
  month = jan # {\slash } # mar,
  year = 1997,
  coden = {ISCEE4},
  doi = {https://doi.org/10.1109/99.590860},
  issn = {1070-9924 (print), 1558-190X (electronic)},
  issn-l = {1070-9924},
  bibdate = {Sat Jan 9 08:57:23 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://dlib.computer.org/cs/books/cs1997/pdf/c1072.pdf;
    http://www.computer.org/cse/cs1998/c1072abs.htm},
  abstract = {Experiments in parallelizing an edge detection algorithm
    on three representative message-passing architectures --- a
    low-cost, heterogeneous PVM network, an Intel {iPSC\slash 860}
    hypercube, and a {CM-5} massively parallel multicomputer --- provide
    insight into implementation and performance issues for
    image-processing applications.},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Computational Science \& Engineering},
  journal-url = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99},
}

@Article{Zilli:1997:TBN,
  author = {G. Zilli and L. Bergamaschi},
  title = {Truncated Block {Newton} and Quasi-{Newton} Methods for
    Sparse Systems of Nonlinear Equations. Experiments on Parallel
    Platforms},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1332,
  pages = {390--400},
  year = 1997,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Thu Dec 9 06:27:54 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Book{Adamo:1998:MTO,
  author = {Jean-Marc Adamo},
  title = {Multi-threaded object-oriented {MPI}-based message passing
    interface: the {ARCH} library},
  volume = {SECS 446},
  publisher = pub-KLUWER,
  address = pub-KLUWER:adr,
  pages = {xiv + 185},
  year = 1998,
  isbn = {0-7923-8165-3},
  isbn-13 = {978-0-7923-8165-5},
  lccn = {TK5102.5.A293 1998},
  bibdate = {Mon May 17 18:15:19 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  price = {US\$120.00},
  series = {The Kluwer international series in engineering and computer
    science},
  acknowledgement = ack-nhfb,
  keywords = {data transmission systems; object-oriented programming
    (computer science); threads (computer programs)},
  libnote = {Not yet in my library.},
}

@Article{Alexandrov:1998:CGP,
  author = {V. Alexandrov and F. Dehne and A. Rau-Chaplin and K. Taft},
  title = {Coarse Grained Parallel {Monte Carlo} Algorithms for Solving
    {SLAE} Using {PVM}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1497,
  pages = {323--??},
  year = 1998,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Andersson:1998:PFT,
  author = {U. Andersson},
  title = {Parallelization of a {$3$D FD-TD} Code for the {Maxwell}
    Equations Using {MPI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1541,
  pages = {12--19},
  year = 1998,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Sep 15 10:01:31 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs1998b.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
  keywords = {applied parallel computing; computing science; PARA;
    parallel computing},
}

@TechReport{Andre:1998:BVN,
  author =       "Andr{\'e} Fachat and Karl Heinz Hoffmann",
  title =        "Blocking vs. non-blocking communication under {MPI} on
                 a Master-Worker problem",
  type =         "{Preprint-Reihe des Chemnitzer SFB 393
                 Sonderforschungsbereich Numerische Simulation auf Massiv
                 Parallelen Rechnern}",
  number =       "98,18",
  institution =  "Universit{\"a}t Chemnitz-Zwickau",
  address =      "Chemnitz, Germany",
  year =         "1998",
  bibdate =      "Wed Aug 27 07:09:52 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Anonymous:1998:ANO,
  author = {Anonymous},
  title = {Announcements: New Official {Fortran} Technical Reports;
    Working Group 5 Documents; {OpenGL} {Fortran 95} Bindings; {MPI}
    Module Provides Enhanced {Fortran} Support; Variable Precision
    Arithmetic; {Fortran} Information Sites; New {Fortran} Compiler
    Versions from {Lahey} and {Fujitsu}; Downloadable Advanced {Fortran}
    Textbook; {Fortran} Engineering Textbook},
  journal = j-FORTRAN-FORUM,
  volume = 17,
  number = 3,
  pages = {1--2},
  month = dec,
  year = 1998,
  coden = {????},
  issn = {1061-7264 (print), 1931-1311 (electronic)},
  issn-l = {1061-7264},
  bibdate = {Thu Feb 07 13:34:54 2002},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM Fortran Forum},
  issue = {53},
}

@Article{Baker:1998:MNC,
  author = {M. Baker},
  title = {{MPI} on {NT}: The Current Status and Performance of the
    Available Environments},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1497,
  pages = {63--??},
  year = 1998,
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Baker:1998:MNP,
  author = {M. Baker and G. Fox},
  title = {{MPI} on {NT}: a Preliminary Evaluation of the
    Available Environments},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1388},
  pages = {549--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Sat Oct 10 14:40:24 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Berthou:1998:PHM,
  author = {J.-Y. Berthou and L. Plagne},
  title = {Parallel {HPF-MPI} Implementation of the {TBSCM}
    {Poisson} Solver},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1401},
  pages = {252--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Sat Oct 10 14:40:24 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@TechReport{Bhanot:1998:DTM,
  author =       "Gyan Bhanot",
  title =        "A $2$-d transpose {MPI} code",
  type =         "Research report",
  number =       "RC 21217",
  institution =  "T. J. Watson Research Center, IBM Corporation",
  address =      "Yorktown Heights, NY, USA",
  year =         "1998",
  bibdate =      "Wed Aug 27 07:16:38 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Browne:1998:RPA,
  author = {Shirley Browne and Jack Dongarra and Kevin London},
  title = {Review of Performance Analysis Tools for {MPI}
    Parallel Programs},
  journal = {NHSE Review},
  volume = {3},
  year = {1998},
  CODEN = {????},
  ISSN = {????},
  bibdate = {Tue Feb 26 10:10:44 2002},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note = {Accepted, to appear.},
  URL = {http://www.cs.utk.edu/~browne/perftools-review/},
  acknowledgement = ack-nhfb,
  keywords = {National HPCC Software Exchange (NHSE); Rice
    University},
  remark = {This journal ceased publication in 1997.},
}

@Article{Bubak:1998:PCL,
  author = {M. Bubak and P. Luszczek and A. Wierzbowska},
  title = {Porting {CHAOS} Library to {MPI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {131--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Carissimi:1998:AEM,
  author = {A. Carissimi and M. Pasin},
  title = {{Athapascan}: An Experience on Mixing {MPI}
    Communications and Threads},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {137--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Ceron:1998:PID,
  author = {C. Ceron and J. Dopazo and E. L. Zapata and J. M.
    Carazo and O. Trelles},
  title = {Parallel implementation of {DNAml} program on
    message-passing architectures},
  journal = j-PARALLEL-COMPUTING,
  volume = {24},
  number = {5--6},
  pages = {701--716},
  day = {1},
  month = jun,
  year = {1998},
  CODEN = {PACOEJ},
  ISSN = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L = {0167-8191},
  bibdate = {Sun Oct 25 09:30:12 MST 1998},
  bibsource = {Compendex database;
    http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1998&volume=24&issue=5-6;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.elsevier.com/cas/tree/store/parco/sub/1998/24/5-6/1279.pdf},
  acknowledgement = ack-nhfb,
  affiliation = {Univ of Malaga},
  affiliationaddress = {Malaga, Spain},
  classification = {722; 722.4; 723; 723.2},
  fjournal = {Parallel Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
  journalabr = {Parallel Comput},
  keywords = {Computer architecture; Computer software; Message
    passing computer architecture; Natural sciences
    computing; Parallel algorithms; Parallel processing
    systems; Parallel virtual machines (PVM)},
}

@Article{Chan:1998:PCT,
  author = {K. J. Chan and A. M. Gibbons and M. Pias and W.
    Rytter},
  title = {On the {PVM} Computations of Transitive Closure and
    Algebraic Path Problems},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {338--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Chapman:1998:OHI,
  author = {B. Chapman and P. Mehrotra},
  title = {{OpenMP} and {HPF}: Integrating Two Paradigms},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1470},
  pages = {650--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Sat Oct 10 14:40:24 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
    http://www.math.utah.edu/pub/tex/bib/hpfortran.bib;
    http://www.math.utah.edu/pub/tex/bib/lncs1998b.bib;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Chetlur:1998:ALE,
  author = {M. Chetlur and G. D. Sharma and N. Abu-Ghazaleh and U.
    K. V. Rajasekaran},
  title = {An Active Layer Extension to {MPI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {97--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Clark:1998:FOP,
  author = {David Clark},
  title = {Focus: {OpenMP}: a parallel standard for the masses},
  journal = j-IEEE-CONCURR,
  volume = {6},
  number = {1},
  pages = {10--12},
  month = jan # "\slash " # mar,
  year = {1998},
  CODEN = {IECMFX},
  DOI = {https://doi.org/10.1109/4434.656771},
  ISSN = {1092-3063 (print), 1558-0849 (electronic)},
  ISSN-L = {1092-3063},
  bibdate = {Tue Jan 16 06:04:49 MST 2001},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeeconcurrency.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://dlib.computer.org/pd/books/pd1998/pdf/p1010.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Concurrency},
}

@Article{Cotronis:1998:DMP,
  author = {Y. Cotronis},
  title = {Developing Message-Passing Applications on {MPICH}
    under Ensemble},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {145--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Cunha:1998:MPP,
  author = {J. C. Cunha and V. Duarte},
  title = {Monitoring {PVM} Programs Using the {DAMS} Approach},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {273--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Dagum:1998:OIS,
  author = {Leonardo Dagum and Ramesh Menon},
  title = {{OpenMP}: An Industry-Standard {API} for Shared-Memory
    Programming},
  journal = j-IEEE-COMPUT-SCI-ENG,
  volume = {5},
  number = {1},
  pages = {46--55},
  month = jan # "\slash " # mar,
  year = {1998},
  CODEN = {ISCEE4},
  DOI = {https://doi.org/10.1109/99.660313},
  ISSN = {1070-9924 (print), 1558-190X (electronic)},
  ISSN-L = {1070-9924},
  bibdate = {Sat Jan 9 08:57:23 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeecomputscieng.bib;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://dlib.computer.org/cs/books/cs1998/pdf/c1046.pdf;
    http://www.computer.org/cse/cs1998/c1046abs.htm},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Computational Science \& Engineering},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99},
}

@Article{Dantas:1998:ESM,
  author = {M. A. R. Dantas and E. J. Zaluska},
  title = {Efficient scheduling of {MPI} applications on networks
    of workstations},
  journal = j-FUT-GEN-COMP-SYS,
  volume = {13},
  number = {6},
  pages = {489--499},
  day = {20},
  month = may,
  year = {1998},
  CODEN = {FGSEVI},
  ISSN = {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L = {0167-739X},
  bibdate = {Wed Feb 27 12:41:17 MST 2002},
  bibsource = {http://www.elsevier.com/locate/issn/0167739X;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.elsevier.com/gej-ng/10/19/19/28/20/21/abstract.html},
  acknowledgement = ack-nhfb,
  fjournal = {Future Generation Computer Systems},
  journal-URL = {http://www.sciencedirect.com/science/journal/0167739X},
}

@Article{Delves:1998:HPF,
  author = {M. Delves and H. Zima},
  title = {{High Performance Fortran}: a Status Report or: Are We
    Ready to Give Up {MPI}?},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {161--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Dimov:1998:IMC,
  author = {I. Dimov and V. Alexandrov and A. Karaivanova},
  title = {Implementation of {Monte Carlo} Algorithms for
    Eigenvalue Problem Using {MPI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {346--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Espinosa:1998:ADP,
  author = {A. Espinosa and T. Margalef and E. Luque},
  title = {Automatic Detection of {PVM} Program Performance
    Problems},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {19--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Fagg:1998:MMH,
  author = {G. E. Fagg and K. S. London and J. J. Dongarra},
  title = {{MPIConnect}: Managing Heterogeneous {MPI}
    Applications Interoperation and Process Control},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {93--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Book{Fang:1998:DDL,
  author = {Niandong Fang},
  title = {Distributed data library and tools for an {MPI}
    programming environment},
  volume = {1},
  publisher = {Shaker},
  address = {Aachen, Germany},
  pages = {xx + 195},
  year = {1998},
  ISBN = {3-8265-4101-4},
  ISBN-13 = {978-3-8265-4101-8},
  LCCN = {????},
  bibdate = {Wed Aug 27 06:49:31 2003},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note = {Also published as dissertation of the University of
    Basel.},
  series = {Research reports in computer science},
  acknowledgement = ack-nhfb,
}

@InProceedings{Ferrari:1998:JNPa,
  author = {Adam J. Ferrari},
  title = {{JPVM}: Network Parallel Computing in {Java}},
  crossref = {ACM:1998:AWJ},
  pages = {??--??},
  year = {1998},
  bibdate = {Thu Apr 27 10:43:08 2000},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.cs.ucsb.edu/conferences/java98/papers/jpvm.pdf;
    http://www.cs.ucsb.edu/conferences/java98/papers/jpvm.ps},
  acknowledgement = ack-nhfb,
}

@Article{Ferrari:1998:JNPb,
  author = {Adam Ferrari},
  title = {{JPVM}: network parallel computing in {Java}},
  journal = j-CPE,
  volume = {10},
  number = {11--13},
  pages = {985--992},
  month = sep,
  year = {1998},
  CODEN = {CPEXEI},
  ISSN = {1040-3108},
  ISSN-L = {1040-3108},
  bibdate = {Tue Sep 7 06:06:44 MDT 1999},
  bibsource = {http://www.interscience.wiley.com/jpages/1040-3108/;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  note = {Special Issue: Java for High-performance Network
    Computing.},
  URL = {http://www3.interscience.wiley.com/cgi-bin/abstract?ID=10050413;
    http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=10050413&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Concurrency, practice and experience},
}

@Article{Ferrari:1998:MDC,
  author = {Adam Ferrari and V. S. Sunderam},
  title = {Multiparadigm distributed computing with {TPVM}},
  journal = j-CPE,
  volume = {10},
  number = {3},
  pages = {199--228},
  month = mar,
  year = {1998},
  CODEN = {CPEXEI},
  ISSN = {1040-3108},
  ISSN-L = {1040-3108},
  bibdate = {Tue Sep 7 06:06:39 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  URL = {http://www3.interscience.wiley.com/cgi-bin/abstract?ID=5374;
    http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=5374&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Concurrency, practice and experience},
}

@Article{Folino:1998:EMC,
  author = {G. Folino and G. Spezzano and D. Talia},
  title = {Evaluating and Modeling Communication Overhead of
    {MPI} Primitives on the {Meiko CS-2}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {27--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Folino:1998:PEM,
  author = {G. Folino and G. Spezzano and D. Talia},
  title = {Performance Evaluation and Modelling of {MPI}
    Communications on the {Meiko CS-2}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1401},
  pages = {932--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Sat Oct 10 14:40:24 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@InProceedings{Foster:1998:GEM,
  author = {Ian Foster},
  title = {A Grid-Enabled {MPI}: Message Passing in Heterogeneous
    Distributed Computing Systems},
  crossref = {ACM:1998:SHP},
  pages = {??--??},
  year = {1998},
  bibdate = {Wed Oct 07 08:50:26 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.supercomp.org/sc98/papers/},
  acknowledgement = ack-nhfb,
}

@Article{Foster:1998:WAI,
  author = {Ian Foster and Jonathan Geisler and William Gropp and
    Nicholas Karonis and Ewing Lusk and George
    Thiruvathukal and Steven Tuecke},
  title = {Wide-area implementation of the {Message Passing
    Interface}},
  journal = j-PARALLEL-COMPUTING,
  volume = {24},
  number = {12--13},
  pages = {1735--1749},
  day = {1},
  month = nov,
  year = {1998},
  CODEN = {PACOEJ},
  ISSN = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L = {0167-8191},
  bibdate = {Fri Aug 6 10:15:40 MDT 1999},
  bibsource = {http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1998&volume=24&issue=12-13;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.elsevier.com/cas/tree/store/parco/sub/1998/24/12-13/1352.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Parallel Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Fuerle:1998:IPC,
  author = {T. Fuerle and E. Schikuta and C. Loeffelhardt and K.
    Stockinger},
  title = {On the Implementation of a Portable, Client-Server
    Based {MPI-IO} Interface},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {172--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Garcia-Consuegra:1998:DGR,
  author = {J. D. Garcia-Consuegra and J. A. Gallud and G.
    Sebastian},
  title = {Distributed Georeferring of Remotely Sensed
    {Landsat-TM} Imagery Using {MPI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1541},
  pages = {161--166},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Wed Sep 15 10:01:31 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs1998b.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
  keywords = {applied parallel computing; computing science; PARA;
    parallel computing},
}

@Article{Geist:1998:HNG,
  author = {G. A. Geist},
  title = {{Harness}: The Next Generation Beyond {PVM}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {74--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Gorlatch:1998:GMI,
  author =       "Sergei Gorlatch and Holger Bischof",
  title =        "A Generic {MPI} Implementation for a Data-Parallel
                 Skeleton: Formal Derivation and Application to {FFT}",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "8",
  number =       "4",
  pages =        "447--??",
  month =        dec,
  year =         "1998",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Jan 6 12:02:34 MST 2005",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Goujon:1998:AAT,
  author = {D. S. Goujon and M. Michel and J. Peeters and J. E.
    Devaney},
  title = {{AutoMap} and {AutoLink}: Tools for Communicating
    Complex and Dynamic Data-Structures Using {MPI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1362},
  pages = {98--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Apr 28 08:51:33 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@TechReport{Grabowsky:1998:NMP,
  author =       "Lothar Grabowsky and Thomas Ermer and J{\"o}rg
                 Werner",
  title =        "{Nutzung von MPI f{\"u}r parallele FEM-Systeme}.
                 ({German}) [{Use} of {MPI} for parallel {FEM}
                 systems]",
  type =         "{Preprint-Reihe des Chemnitzer SFB 393
                 Sonderforschungsbereich Numerische Simulation auf Massiv
                 Parallelen Rechnern}",
  number =       "97,08; RA-TR 02-97",
  institution =  "Universit{\"a}t Chemnitz-Zwickau",
  address =      "Chemnitz, Germany",
  year =         "1998",
  bibdate =      "Wed Aug 27 07:11:28 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  language =     "German",
}

@Book{Gropp:1998:MCR,
  author = {William Gropp and Steven Huss-Lederman and Andrew
    Lumsdaine and Ewing Lusk and Bill Nitzberg and William
    Saphir and Marc Snir},
  title = {{MPI}: The Complete Reference. Volume 2, The {MPI-2}
    Extensions},
  publisher = pub-MIT,
  address = pub-MIT:adr,
  edition = {Second},
  pages = {350},
  year = {1998},
  ISBN = {0-262-57123-4 (vol. 2), 0-262-69216-3 (set)},
  ISBN-13 = {978-0-262-57123-4 (vol. 2), 978-0-262-69216-8 (set)},
  LCCN = {QA76.642 .M65 1998},
  bibdate = {Thu Oct 29 07:27:43 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note = {See also volume 1 \cite{Snir:1998:MCR}.},
  price = {US\$30 (paperback)},
  series = {Scientific and Engineering Computation},
  URL = {http://mitpress.mit.edu/book-home.tcl?isbn=0262571234},
  acknowledgement = ack-nhfb,
}

@Article{Haimes:1998:UPM,
  author = {R. Haimes and K. E. Jordan},
  title = {Using {PVM} and {MPI} for Co-processed, Distributed
    and Parallel Scientific Visualization},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1388},
  pages = {1098--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Sat Oct 10 14:40:24 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Hansen:1998:EMP,
  author =       "Per Brinch Hansen",
  title =        "An Evaluation of the {Message-Passing Interface}",
  journal =      j-SIGPLAN,
  volume =       "33",
  number =       "3",
  pages =        "65--72",
  month =        mar,
  year =         "1998",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Apr 30 08:30:23 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "The author criticizes MPI, and remarks ``MPI \ldots{}
                 lack[s] the elegance and security that can only be
                 checked by a parallel programming language.''",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Hatazaki:1998:RRS,
  author = {T. Hatazaki},
  title = {Rank Reordering Strategy for {MPI} Topology Creation
    Functions},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {188--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Herland:1998:CML,
  author = {B. G. Herland and M. Eberl and H. Hellwagner},
  title = {A Common Messaging Layer for {MPI} and {PVM} over
    {SCI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1401},
  pages = {576--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Sat Oct 10 14:40:24 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@InProceedings{Husbands:1998:MSD,
  author = {Parry J. Husbands},
  title = {{MPI-StarT}: Delivering Network Performance to
    Numerical Applications},
  crossref = {ACM:1998:SHP},
  pages = {??--??},
  year = {1998},
  bibdate = {Wed Oct 07 08:50:26 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.supercomp.org/sc98/papers/},
  acknowledgement = ack-nhfb,
}

@Article{Karlsson:1998:CCC,
  author = {S. Karlsson and M. Brorsson},
  title = {A Comparative Characterization of Communication
    Patterns in Applications Using {MPI} and Shared Memory
    on an {IBM SP2}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1362},
  pages = {189--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Apr 28 08:51:33 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Kemelmakher:1998:SAR,
  author = {M. Kemelmakher and O. Kremien},
  title = {Scalable and Adaptive Resource Sharing in {PVM}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {196--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Kranzlmueller:1998:DPP,
  author = {D. Kranzlmueller and J. Volkert},
  title = {Debugging Point-to-Point Communication in {MPI} and
    {PVM}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {265--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Kuhn:1998:FFW,
  author = {Bob Kuhn},
  title = {{Fortran Futures}: Workshop: {OpenMP} for Parallel
    {Fortran} Applications},
  journal = j-FORTRAN-FORUM,
  volume = {17},
  number = {3},
  pages = {22--22},
  month = dec,
  year = {1998},
  CODEN = {????},
  ISSN = {1061-7264 (print), 1931-1311 (electronic)},
  ISSN-L = {1061-7264},
  bibdate = {Thu Feb 07 06:54:12 2002},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/fortran-forum.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM Fortran Forum},
  issue = {53},
}

@Article{Lavi:1998:IPD,
  author = {R. Lavi and A. Barak},
  title = {Improving the {PVM} Daemon Network Performance by
    Direct Network Access},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1497},
  pages = {44--??},
  year = {1998},
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Jan 5 08:21:58 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Leung:1998:PAN,
  author =       {Ka-Cheong Leung and Mounir Hamdi},
  title =        {Performance assessment of network protocols and
                 parallel programming tools for distributed computing
                 systems},
  journal =      j-INT-J-COMPUT-SYST-SCI-ENG,
  volume =       {13},
  number =       {1},
  pages =        {67--80},
  month =        jan,
  year =         {1998},
  CODEN =        {CSSEEI},
  ISSN =         {0267-6192},
  bibdate =      {Thu Feb 4 13:21:32 MST 1999},
  bibsource =    {Compendex database;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib; OCLC
                 Contents1st database},
  acknowledgement = ack-nhfb,
  affiliation =  {Hong Kong Univ of Science and Technology},
  affiliationaddress = {Kowloon, Hong Kong},
  classification = {722.2; 722.3; 722.4; 723.1; 723.2; 723.5},
  fjournal =     {International Journal of Computer Systems Science and
                 Engineering},
  journalabr =   {Comput Syst Sci Eng},
  keywords =     {Communication overhead; Computer aided software
                 engineering; Computer programming; Computer
                 workstations; Data communication systems; Distributed
                 computer systems; Ethernet; Fiber distributed data
                 interface; Interfaces (computer); Local area networks;
                 Mathematical models; Network protocols; Parallel
                 processing systems; Software Package Express; Software
                 Package PVM},
}

@Article{Lockey:1998:CRM,
  author =       {P. Lockey and R. Proctor and I. D. James},
  title =        {Characterization of {I/O} Requirements in a Massively
                 Parallel Shelf Sea Model},
  journal =      j-IJHPCA,
  volume =       {12},
  number =       {3},
  pages =        {320--332},
  month =        {Fall},
  year =         {1998},
  CODEN =        {IHPCFL},
  DOI =          {https://doi.org/10.1177/109434209801200302},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Tue Nov 6 09:20:17 MST 2018},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://journals.sagepub.com/doi/pdf/10.1177/109434209801200302},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
  journal-URL =  {https://journals.sagepub.com/home/hpc},
  xxmonth =      sep,
}

@InProceedings{Lu:1998:ONW,
  author =       {Honghui Lu and Y. Charlie Hu and Willy Zwaenepoel},
  title =        {{OpenMP} on Networks of Workstations},
  crossref =     {ACM:1998:SHP},
  pages =        {??--??},
  year =         {1998},
  bibdate =      {Wed Mar 06 06:32:51 2002},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing98.bib;
                 http://www.supercomp.org/sc98/papers/},
  URL =          {http://www.supercomp.org/sc98/TechPapers/sc98_FullAbstracts/Lu1105/index.htm},
  acknowledgement = ack-nhfb,
}

@Article{Mackay:1998:SPF,
  author =       {David Mackay and G. Mahinthakumar and Ed D'Azevedo},
  title =        {A Study of {I/O} in a Parallel Finite Element
                 Groundwater Transport Code},
  journal =      j-IJHPCA,
  volume =       {12},
  number =       {3},
  pages =        {307--319},
  month =        {Fall},
  year =         {1998},
  CODEN =        {IHPCFL},
  DOI =          {https://doi.org/10.1177/109434209801200301},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Tue Nov 6 09:20:17 MST 2018},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://journals.sagepub.com/doi/pdf/10.1177/109434209801200301},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
  journal-URL =  {https://journals.sagepub.com/home/hpc},
  xxmonth =      sep,
}

@Article{Mamontov:1998:AES,
  author =       {Y. V. Mamontov and M. Willander},
  title =        {An Algorithm to Evaluate Spectral Densities of
                 High-Dimensional Stationary Diffusion Stochastic
                 Processes with Non-linear Coefficients: The General
                 Scheme and Issues on Implementation with {PVM}},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1541},
  pages =        {315--321},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Wed Sep 15 10:01:31 MDT 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs1998b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
  keywords =     {applied parallel computing; computing science; PARA;
                 parallel computing},
}

@Article{Mans:1998:PDP,
  author =       {Bernard Mans},
  title =        {Portable distributed priority queues with {MPI}},
  journal =      j-CPE,
  volume =       {10},
  number =       {3},
  pages =        {175--198},
  month =        mar,
  year =         {1998},
  CODEN =        {CPEXEI},
  ISSN =         {1040-3108},
  ISSN-L =       {1040-3108},
  bibdate =      {Tue Sep 7 06:06:39 MDT 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html},
  URL =          {http://www3.interscience.wiley.com/cgi-bin/abstract?ID=5373;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=5373&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency, practice and experience},
}

@Article{Marinho:1998:WMP,
  author =       {J. Marinho and J. G. Silva},
  title =        {{WMPI} --- Message Passing Interface for {Win32}
                 Clusters},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1497},
  pages =        {113--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Jan 5 08:21:58 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Martins:1998:JIW,
  author =       {P. Martins and L. M. Silva and J. Silva},
  title =        {A {Java} Interface for {WMPI}},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1497},
  pages =        {121--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Jan 5 08:21:58 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Medeiros:1998:IPM,
  author =       {P. D. Medeiros and J. C. Cunha},
  title =        {Interconnecting {PVM} and {MPI} Applications},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1497},
  pages =        {105--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Jan 5 08:21:58 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Megson:1998:CRH,
  author =       {G. M. Megson and R. S. Fish and D. N. J. Clarke},
  title =        {Creation of Reconfigurable Hardware Objects in {PVM}
                 Environments},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1497},
  pages =        {215--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Jan 5 08:21:58 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

%%% NOTE(review): author, title, volume, number, pages, month, year, and
%%% bibdate match entry MPIForum:1998:SIM, which records a different
%%% journal name, CODEN, and ISSN; the two appear to describe the same
%%% special issue.  Both keys are kept so existing citations resolve.
@Article{MF:1998:SIM,
  author =       {{MPI Forum}},
  title =        {Special Issue: {MPI2}: a Message-Passing Interface
                 Standard},
  journal =      j-IJHPCA,
  volume =       {12},
  number =       {1--2},
  pages =        {1--299},
  month =        {Spring--Summer},
  year =         {1998},
  CODEN =        {IHPCFL},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Wed Apr 8 15:55:29 1998},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
  journal-URL =  {https://journals.sagepub.com/home/hpc},
}

@Article{Morimoto:1998:IMM,
  author =       {K. Morimoto and T. Matsumoto and K. Hiraki},
  title =        {Implementing {MPI} with the Memory-Based Communication
                 Facilities on the {SSS-CORE} Operating System},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1497},
  pages =        {223--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Jan 5 08:21:58 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

%%% NOTE(review): author, title, volume, number, pages, month, year, and
%%% bibdate match entry MF:1998:SIM, which records a different journal
%%% name, CODEN, and ISSN; the two appear to describe the same special
%%% issue.  Both keys are kept so existing citations resolve.
@Article{MPIForum:1998:SIM,
  author =       {{MPI Forum}},
  title =        {Special Issue: {MPI2}: a Message-Passing Interface
                 Standard},
  journal =      j-IJSAHPC,
  volume =       {12},
  number =       {1--2},
  pages =        {1--299},
  month =        {Spring--Summer},
  year =         {1998},
  CODEN =        {IJSCFG},
  ISSN =         {1078-3482},
  bibdate =      {Wed Apr 8 15:55:29 1998},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Supercomputer Applications
                 and High Performance Computing},
}

@Article{Neophytou:1998:NDJ,
  author =       {N. Neophytou and P. Evripidou},
  title =        {{Net-dbx}: a {Java} Powered Tool for Interactive
                 Debugging of {MPI} Programs Across the {Internet}},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1470},
  pages =        {181--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Oct 10 14:40:24 MDT 1998},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Nieplocha:1998:CHP,
  author =       {Jarek Nieplocha and Ian Foster and Rick A. Kendall},
  title =        {{ChemIO}: High Performance Parallel {I/O} for
                 Computational Chemistry Applications},
  journal =      j-IJHPCA,
  volume =       {12},
  number =       {3},
  pages =        {345--363},
  month =        {Fall},
  year =         {1998},
  CODEN =        {IHPCFL},
  DOI =          {https://doi.org/10.1177/109434209801200304},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Tue Nov 6 09:20:17 MST 2018},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://journals.sagepub.com/doi/pdf/10.1177/109434209801200304},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
  journal-URL =  {https://journals.sagepub.com/home/hpc},
  xxmonth =      sep,
}

@Article{Nitsche:1998:FMP,
  author =       {T. Nitsche and W. Webers},
  title =        {Functional Message Passing with {OPAL-MPI}},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1497},
  pages =        {281--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Jan 5 08:21:58 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Oldfield:1998:EPS,
  author =       {Ron A. Oldfield and David E. Womble and Curtis C.
                 Ober},
  title =        {Efficient Parallel {I/O} in Seismic Processing},
  journal =      j-IJHPCA,
  volume =       {12},
  number =       {3},
  pages =        {333--344},
  month =        {Fall},
  year =         {1998},
  CODEN =        {IHPCFL},
  DOI =          {https://doi.org/10.1177/109434209801200303},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Tue Nov 6 09:20:17 MST 2018},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://journals.sagepub.com/doi/pdf/10.1177/109434209801200303},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
  journal-URL =  {https://journals.sagepub.com/home/hpc},
  xxmonth =      sep,
}

@Article{Orlando:1998:MBR,
  author =       {S. Orlando and R. Perego},
  title =        {An {MPI}-based Run-Time Support to Coordinate {HPF}
                 Tasks},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1497},
  pages =        {289--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Jan 5 08:21:58 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Papadopoulos:1998:DVS,
  author =       {P. M. Papadopoulos and J. A. Kohl},
  title =        {Dynamic Visualization and Steering Using {PVM} and
                 {MPI}},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1497},
  pages =        {297--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Jan 5 08:21:58 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Poggi:1998:UPD,
  author =       {Agostino Poggi and Giulio Destri},
  title =        {Using {PVM} to Develop a Distributed Object-Oriented
                 Language for Heterogeneous Processing},
  journal =      j-J-SYST-SOFTW,
  volume =       {40},
  number =       {2},
  pages =        {139--150},
  month =        feb,
  year =         {1998},
  CODEN =        {JSSODM},
  ISSN =         {0164-1212 (print), 1873-1228 (electronic)},
  ISSN-L =       {0164-1212},
  bibdate =      {Thu Sep 9 07:30:16 MDT 2010},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/01641212},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of systems and software},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01641212},
}

@Article{Rabenseifner:1998:MGI,
  author =       {R. Rabenseifner},
  title =        {{MPI-GLUE}: Interoperable High-Performance {MPI}
                 Combining Different Vendor's {MPI} Worlds},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1470},
  pages =        {563--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Oct 10 14:40:24 MDT 1998},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

%%% The last author's surname is entered with its umlaut rather than the
%%% ASCII transliteration ``Mueller'' so that it typesets correctly.
@Article{Reussner:1998:SDA,
  author =       "R. Reussner and P. Sanders and L. Prechelt and M.
                 M{\"u}ller",
  title =        "{SKaMPI}: a Detailed, Accurate {MPI} Benchmark",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1497",
  pages =        "52--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Jan 5 08:21:58 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Scott:1998:PWN,
  author =       {S. L. Scott and M. Fischer and A. Geist},
  title =        {{PVM} on {Windows} and {NT} Clusters},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1497},
  pages =        {231--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Jan 5 08:21:58 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Sevenich:1998:PPU,
  author =       {Richard Sevenich},
  title =        {Parallel Processing Using {PVM}},
  journal =      j-LINUX-J,
  volume =       {45},
  pages =        {??--??},
  month =        jan,
  year =         {1998},
  CODEN =        {LIJOFX},
  ISSN =         {1075-3583 (print), 1938-3827 (electronic)},
  ISSN-L =       {1075-3583},
  bibdate =      {Fri Oct 9 08:35:26 MDT 1998},
  bibsource =    {http://noframes.linuxjournal.com/lj-issues/issue45/index.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {Turn your networked computers into a virtual
                 machine.},
  acknowledgement = ack-nhfb,
  fjournal =     {Linux journal},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J508},
}

@Article{Simitci:1998:CLP,
  author =       {Huseyin Simitci and Daniel A. Reed},
  title =        {A Comparison of Logical and Physical Parallel {I/O}
                 Patterns},
  journal =      j-IJHPCA,
  volume =       {12},
  number =       {3},
  pages =        {364--380},
  month =        {Fall},
  year =         {1998},
  CODEN =        {IHPCFL},
  DOI =          {https://doi.org/10.1177/109434209801200305},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Tue Nov 6 09:20:17 MST 2018},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://journals.sagepub.com/doi/pdf/10.1177/109434209801200305},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
  journal-URL =  {https://journals.sagepub.com/home/hpc},
  xxmonth =      sep,
}

@Book{Snir:1998:MCR,
  author =       {Marc Snir and Steve W. Otto and Steven Huss-Lederman
                 and David W. Walker and Jack Dongarra},
  title =        {{MPI}: The Complete Reference. Volume 1, The {MPI-1}
                 Core},
  publisher =    pub-MIT,
  address =      pub-MIT:adr,
  edition =      {Second},
  pages =        {450},
  month =        sep,
  year =         {1998},
  ISBN =         {0-262-69215-5 (vol. 1), 0-262-69216-3 (set)},
  ISBN-13 =      {978-0-262-69215-1 (vol. 1), 978-0-262-69216-8 (set)},
  LCCN =         {QA76.642 .M65 1998},
  bibdate =      {Thu Oct 29 07:27:43 1998},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {See also volume 2 \cite{Gropp:1998:MCR}.},
  price =        {US\$35 (paperback)},
  series =       {Scientific and Engineering Computation},
  URL =          {http://mitpress.mit.edu/book-home.tcl?isbn=0262692155},
  acknowledgement = ack-nhfb,
}

@MastersThesis{Stockinger:1998:VPC,
  author =       {Kurt Stockinger},
  title =        {{ViMPIOS} --- a portable, client-server based
                 implementation of {MPI-IO} on {ViPIOS}},
  type =         {{Diplom-Arbeit}},
  school =       {Universit{\"a}t Wien},
  address =      {Vienna, Austria},
  pages =        {155},
  year =         {1998},
  bibdate =      {Wed Aug 27 07:21:00 2003},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): coauthors Gropp and Lusk are supplied from the
%%% published SC'98 paper; confirm against the proceedings.  The
%%% citation key is unchanged.
@InProceedings{Thakur:1998:CUM,
  author =       "Rajeev S. Thakur and William Gropp and Ewing Lusk",
  title =        "A Case for Using {MPI}'s Derived Datatypes to Improve
                 {I/O} Performance",
  crossref =     "ACM:1998:SHP",
  pages =        "??--??",
  year =         "1998",
  bibdate =      "Wed Oct 07 08:50:26 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.supercomp.org/sc98/papers/",
  acknowledgement = ack-nhfb,
}

@Article{Topol:1998:PTV,
  author =       {Brad Topol and John T. Stasko and Vaidy Sunderam},
  title =        {{PVaniM}: a tool for visualization in network
                 computing environments},
  journal =      j-CPE,
  volume =       {10},
  number =       {14},
  pages =        {1197--1222},
  day =          {10},
  month =        dec,
  year =         {1998},
  CODEN =        {CPEXEI},
  ISSN =         {1040-3108},
  ISSN-L =       {1040-3108},
  bibdate =      {Tue Sep 7 06:06:45 MDT 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html},
  URL =          {http://www3.interscience.wiley.com/cgi-bin/abstract?ID=40005932;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=40005932&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency, practice and experience},
}

@Article{Tourino:1998:PBL,
  author =       {J. Touri{\~n}o and R. Doallo},
  title =        {A {PVM}-Based Library for Sparse Matrix
                 Factorizations},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {1497},
  pages =        {304--??},
  year =         {1998},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Jan 5 08:21:58 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

%%% The citation key keeps the ASCII transliteration of the author's
%%% surname; the author field carries the umlauted form.
@Article{Traeff:1998:PRL,
  author =       "J. L. Tr{\"a}ff",
  title =        "Portable Randomized List Ranking on Multiprocessors
                 Using {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1497",
  pages =        "395--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Jan 5 08:21:58 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%%% The citation key keeps the ASCII transliteration of the author's
%%% surname; the author field carries the umlauted form.
@Article{Wismueller:1998:LMS,
  author =       "R. Wism{\"u}ller",
  title =        "On-Line Monitoring Support in {PVM} and {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1497",
  pages =        "312--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Jan 5 08:21:58 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Yalamanchilli:1998:CPJ,
  author =       {Narendar Yalamanchilli and William Cohen},
  title =        {Communication Performance of {Java} based {Parallel
                 Virtual Machines}},
  crossref =     {ACM:1998:AWJ},
  pages =        {??--??},
  year =         {1998},
  bibdate =      {Thu Apr 27 10:43:08 2000},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.cs.ucsb.edu/conferences/java98/papers/passing.pdf;
                 http://www.cs.ucsb.edu/conferences/java98/papers/passing.ps},
  acknowledgement = ack-nhfb,
}

@Article{Zhou:1998:LST,
  author =       {Honbo Zhou and Al Geist},
  title =        {{LPVM}: a step towards multithread {PVM}},
  journal =      j-CPE,
  volume =       {10},
  number =       {5},
  pages =        {407--416},
  day =          {25},
  month =        apr,
  year =         {1998},
  CODEN =        {CPEXEI},
  ISSN =         {1040-3108},
  ISSN-L =       {1040-3108},
  bibdate =      {Tue Sep 7 06:06:40 MDT 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html},
  URL =          {http://www3.interscience.wiley.com/cgi-bin/abstract?ID=5385;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=5385&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency, practice and experience},
}

@InProceedings{Alexandrov:1999:PMC,
  author =       {V. Alexandrov and A. Karaivanova},
  title =        {Parallel {Monte Carlo} algorithms for sparse {SLAE}
                 using {MPI}},
  crossref =     {Dongarra:1999:RAP},
  number =       {1697},
  pages =        {283--290},
  year =         {1999},
  bibdate =      {Thu Dec 9 06:08:35 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Anonymous:1999:BRMa,
  author =       {Anonymous},
  title =        {Book Review: {{\booktitle{MPI --- The complete
                 reference: Volume 1, the MPI core}}, second edition: By
                 Marc Snir, Steve Otto, Steven Huss-Lederman, David
                 Walker and Jack Dongarra. MIT Press, Cambridge, MA.
                 (1998). 426 pages. \$35.00}},
  journal =      j-COMPUT-MATH-APPL,
  volume =       {37},
  number =       {3},
  pages =        {130--130},
  month =        feb,
  year =         {1999},
  CODEN =        {CMAPDK},
  ISSN =         {0898-1221 (print), 1873-7668 (electronic)},
  ISSN-L =       {0898-1221},
  bibdate =      {Wed Mar 1 21:48:57 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/computmathappl1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0898122199903590},
  acknowledgement = ack-nhfb,
  fjournal =     {Computers and Mathematics with Applications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/08981221},
}

%%% The reviewed book's title is set with a dash after ``MPI'', matching
%%% the companion review entry Anonymous:1999:BRMa.
@Article{Anonymous:1999:BRMb,
  author =       "Anonymous",
  title =        "Book Review: {{\booktitle{MPI --- The complete
                 reference: Volume 2, the MPI-2 extensions}}: By William
                 Gropp, Steven Huss-Lederman, Andrew Lumsdaine, Ewing
                 Lusk, Bill Nitzberg, William Saphir and Marc Snir. MIT
                 Press, Cambridge, MA. (1998). 344 pages. \$35.00}",
  journal =      j-COMPUT-MATH-APPL,
  volume =       "37",
  number =       "3",
  pages =        "130--130",
  month =        feb,
  year =         "1999",
  CODEN =        "CMAPDK",
  ISSN =         "0898-1221 (print), 1873-7668 (electronic)",
  ISSN-L =       "0898-1221",
  bibdate =      "Wed Mar 1 21:48:57 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computmathappl1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0898122199903619",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers and Mathematics with Applications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/08981221",
}

@Article{Anonymous:1999:BRMf,
  author =       {Anonymous},
  title =        {Book Review: {{\booktitle{MPI --- The complete
                 reference: Volume 1, the MPI core}}, second edition: By
                 Marc Snir, Steve Otto, Steven Huss-Lederman, David
                 Walker and Jack Dongarra. MIT Press, Cambridge, MA
                 (1998). 426 pages. \$35.00}},
  journal =      j-COMPUT-MATH-APPL,
  volume =       {37},
  number =       {6},
  pages =        {130--130},
  month =        mar,
  year =         {1999},
  CODEN =        {CMAPDK},
  ISSN =         {0898-1221 (print), 1873-7668 (electronic)},
  ISSN-L =       {0898-1221},
  bibdate =      {Wed Mar 1 21:48:58 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/computmathappl1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0898122199902237},
  acknowledgement = ack-nhfb,
  fjournal =     {Computers and Mathematics with Applications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/08981221},
}

%%% The reviewed book's title is set with a dash after ``MPI'', matching
%%% the companion review entry Anonymous:1999:BRMf.
@Article{Anonymous:1999:BRMg,
  author =       "Anonymous",
  title =        "Book Review: {{\booktitle{MPI --- The complete
                 reference: Volume 2, the MPI-2 extensions}}: By William
                 Gropp, Steven Huss-Lederman, Andrew Lumsdaine, Ewing
                 Lusk, Bill Nitzberg, William Saphir and Marc Snir. MIT
                 Press, Cambridge, MA. (1998). 344 pages. \$35.00}",
  journal =      j-COMPUT-MATH-APPL,
  volume =       "37",
  number =       "6",
  pages =        "130--130",
  month =        mar,
  year =         "1999",
  CODEN =        "CMAPDK",
  ISSN =         "0898-1221 (print), 1873-7668 (electronic)",
  ISSN-L =       "0898-1221",
  bibdate =      "Wed Mar 1 21:48:58 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computmathappl1990.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0898122199902250",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers and Mathematics with Applications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/08981221",
}

@InProceedings{Asai:1999:MIF,
  author =       {Noboru Asai and Thomas Kentemich and Pierre Lagier},
  title =        {{MPI-2} Implementation on a {Fujitsu Generic Message
                 Passing Kernel}},
  crossref =     {ACM:1999:SPO},
  pages =        {??--??},
  year =         {1999},
  bibdate =      {Thu Feb 24 09:02:57 2000},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sc99.org/techpapers/},
  acknowledgement = ack-nhfb,
}

%%% The citation key keeps the unaccented spelling of the first author's
%%% surname; the author field carries the accented form.  The editor is
%%% unknown and is left as the file's ``????'' placeholder.
@InProceedings{Ayguade:1999:EML,
  author =       "E. Ayguad{\'e} and X. Martorell and J. Labarta and M.
                 Gonzalez and N. Navarro",
  editor =       "????",
  booktitle =    "{Proceedings of the 1999 International Conference on
                 Parallel Processing}",
  title =        "Exploiting multiple levels of parallelism in {OpenMP}:
                 a case study",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "172--180",
  year =         "1999",
  bibdate =      "Mon Oct 07 08:57:41 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Badia:1999:SIT,
  author =       {J. M. Badia and A. M. Vidal},
  title =        {Solving the inverse {Toeplitz} eigenproblem using
                 {ScaLAPACK} and {MPI}},
  crossref =     {Dongarra:1999:RAP},
  number =       {1697},
  pages =        {372--379},
  year =         {1999},
  bibdate =      {Thu Dec 9 06:08:35 MST 1999},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Baker:1999:MOO,
  author = {M. Baker and B. Carpenter and G. Fox and Sung Hoon Koo},
  title = {{mpiJava}: An Object-Oriented {Java} Interface to {MPI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1586,
  pages = {748--??},
  year = 1999,
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Mon Sep 13 16:57:02 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@InProceedings{Baraglia:1999:PAN,
  author = {R. Baraglia and R. Ferrini and D. Laforenza and A. Lagana},
  title = {Parallel approaches to a numerically intensive application using {PVM}},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {364--371},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Barbosa:1999:ADM,
  author = {J. Barbosa and A. Padilha},
  title = {Algorithm-Dependant Method to Determine the Optimal Number of Computers in Parallel Virtual Machines},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1573,
  pages = {508--521},
  year = 1999,
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Sep 14 06:09:05 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
  keywords = {parallel processing; VECPAR; vector processing},
}

@Article{Barnard:1999:MIS,
  author = {Stephen T. Barnard and Luis M. Bernardo and Horst D. Simon},
  title = {An {MPI} Implementation of the {SPAI} Preconditioner on the {T3E}},
  journal = j-IJHPCA,
  volume = 13,
  number = 2,
  pages = {107--123},
  month = {Summer},
  year = 1999,
  CODEN = {IHPCFL},
  ISSN = {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L = {1094-3420},
  bibdate = {Fri May 21 13:56:09 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  journal-URL = {http://hpc.sagepub.com/content/by/year},
}

@InProceedings{Bassomo:1999:PGE,
  author = {P. Bassomo and I. Sakho and A. Corbel},
  title = {Porting generalized eigenvalue software on distributed memory machines using systolic model principles},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {396--403},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Bernaschi:1999:ERA,
  author = {M. Bernaschi and G. Iannello and M. Lauria},
  title = {Experimental Results about {MPI} Collective Communication Operations},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1593,
  pages = {774--??},
  year = 1999,
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Mon Sep 13 16:57:02 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@InProceedings{Bertozzi:1999:MIT,
  author = {M. Bertozzi and F. Boselli and G. Conte and M. Reggiani},
  title = {An {MPI} implementation on the top of the virtual interface architecture},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {199--206},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Beyls:1999:JJP,
  author = {K. Beyls and E. D'Hollander and Y. Yu},
  title = {{JPT}: a {Java} parallelization tool},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {173--180},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Blaheta:1999:LFM,
  author = {R. Blaheta and O. Jakl and J. Stary},
  title = {Large-scale {FE} modelling in geomechanics: a case study in parallelization},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {299--306},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Borkowski:1999:LVC,
  author = {J. Borkowski},
  title = {On line visualization or combining the standard {ORNL PVM} with a vendor {PVM} implementation},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {157--164},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Boudet:1999:PIH,
  author = {V. Boudet and F. Rastello and Y. Robert},
  title = {{PVM} implementation of heterogeneous {ScaLAPACK} dense linear solvers},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {333--340},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Bova:1999:NOM,
  author =       "S. W. Bova and C. P. Breshears and C. Cuicchi and Z.
                 Demirbilek and H. Gabb",
  editor =       "????",
  booktitle =    "{Proceedings of the ISCA 12th International
                 Conference. Parallel and Distributed Systems}",
  title =        "Nesting {OpenMP} in an {MPI} application",
  publisher =    "ISCA",
  address =      "Raleigh, NC, USA",
  pages =        "566--571",
  year =         "1999",
  bibdate =      "Mon Oct 07 09:02:21 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Bova:1999:PPM,
  author = {Steve W. Bova and Clay P. Breshears and Henry Gabb and Rudolf Eigenmann
    and Greg Gaertner and Bob Kuhn and Bill Magro and Stefano Salvini},
  title = {Parallel Programming with Message Passing and Directives},
  journal = j-SIAM-NEWS,
  volume = 32,
  number = 9,
  pages = {??--??},
  month = nov,
  year = 1999,
  ISSN = {0036-1437},
  bibdate = {Mon Oct 07 09:13:31 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {SIAM News},
  journal-URL = {http://www.siam.org/news/},
}

@InProceedings{Bubak:1999:EFP,
  author = {M. Bubak and W. Funika and K. Iskra and R. Maruszewski},
  title = {Enhancing the functionality of performance measurement tools for message passing environments},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {67--74},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Bubak:1999:TPR,
  author = {M. Bubak and P. Luszczek},
  title = {Towards portable runtime support for irregular and out-of-core computations},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {59--66},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Cappello:1999:PNB,
  author = {F. Cappello and O. Richard and D. Etiemble},
  title = {Performance of the {NAS} Benchmarks on a Cluster of {SMP PCs} Using a Parallelization of the {MPI} Programs with {OpenMP}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1662,
  pages = {339--350},
  year = 1999,
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Mon Sep 13 16:57:02 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs1999b.bib;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Cerin:1999:DMP,
  author = {C. Cerin},
  title = {Differentiating Message Passing Interface and Bulk Synchronous Parallel Computation Models},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1662,
  pages = {477--??},
  year = 1999,
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Mon Sep 13 16:57:02 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs1999b.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@InProceedings{Chapman:1999:EOF,
  author = {B. Chapman and P. Mehrotra and H. Zima},
  editor = {????},
  booktitle = {{Proceedings of Eighth ECMWF Workshop on the Use of Parallel Processors in Meteorology. Towards Teracomputing}},
  title = {Enhancing {OpenMP} with features for locality control},
  publisher = pub-WORLD-SCI,
  address = pub-WORLD-SCI:adr,
  pages = {301--313},
  year = 1999,
  bibdate = {Mon Oct 07 09:10:58 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{ChassindeKergommeaux:1999:MER,
  author =       "J. {Chassin de Kergommeaux} and M. Ronsse and K. {De
                 Bosschere}",
  title =        "{MPL*}: {Efficient} record\slash replay of
                 nondeterministic features of message passing
                 libraries",
  crossref =     "Dongarra:1999:RAP",
  number =       "1697",
  pages =        "141--148",
  year =         "1999",
  bibdate =      "Thu Dec 9 06:08:35 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Chaussumier:1999:ACM,
  author = {F. Chaussumier and F. Desprez and L. Prylli},
  title = {Asynchronous communications in {MPI} --- The {BIP\slash Myrinet} approach},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {485--492},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Chergui:1999:UPP,
  author = {J. Chergui},
  title = {Using {PMD} to parallel solve large-scale {Navier--Stokes} equations. Performance analysis on {SGI\slash CRAY-T3E} machine},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {341--348},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Chien:1999:DEH,
  author = {A. Chien and M. Lauria and R. Pennington and M. Showerman
    and G. Iannello and M. Buchanan and K. Connelly and L. Giannini
    and G. Koenig and S. Krishnamurthy and Q. Liu and S. Pakin
    and G. Sampemane},
  title = {Design and Evaluation of an {HPVM}-Based {Windows NT} Supercomputer},
  journal = j-IJHPCA,
  volume = 13,
  number = 3,
  pages = {201--219},
  month = {Fall},
  year = 1999,
  CODEN = {IHPCFL},
  ISSN = {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L = {1094-3420},
  bibdate = {Wed Jul 28 14:14:38 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  journal-URL = {http://hpc.sagepub.com/content/by/year},
}

@InProceedings{Ciegis:1999:HDA,
  author = {R. Ciegis and R. Sablinskas and J. Wasniewski},
  title = {Hyper-Rectangle distribution algorithm for parallel multidimensional numerical integration},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {275--282},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Claver:1999:PCS,
  author = {J. M. Claver and M. Mollar and V. Hernandez},
  title = {Parallel computation of the {SVD} of a matrix product},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {388--395},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Clematis:1999:EPC,
  author = {A. Clematis and V. Gianuzzi},
  title = {Extending {PVM} with consistent cut capabilities: {Application} aspects and implementation strategies},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {101--108},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Corbacho-Lozano:1999:EDD,
  author = {J. Corbacho-Lozano and O.-I. Lepe-Aldama and J. Sole-Pareta and J. Domingo-Pascual},
  title = {Experiences deploying a distributed parallel processing environment over a broadband multiservice network},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {477--484},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Cormen:1999:PBP,
  author = {Thomas H. Cormen and James C. Clippinger},
  title = {Performing {BMMC} Permutations Efficiently on Distributed-Memory Multiprocessors with {MPI}},
  journal = j-ALGORITHMICA,
  volume = 24,
  number = {3--4},
  pages = {349--370},
  month = aug,
  year = 1999,
  CODEN = {ALGOEJ},
  ISSN = {0178-4617 (print), 1432-0541 (electronic)},
  ISSN-L = {0178-4617},
  MRclass = {68Q22},
  MRnumber = {MR1687275},
  bibdate = {Fri Jan 6 11:38:11 MST 2006},
  bibsource = {dblp-journals-algorithmica.bib;
    http://dblp.uni-trier.de/db/journals/algorithmica/algorithmica24.html#CormenC99;
    http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0178-4617&volume=24&issue=3;
    http://www.math.utah.edu/pub/tex/bib/index-table-a.html#algorithmica;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    MathSciNet database},
  URL = {http://link.springer.de/link/service/journals/00453/bibs/24n3p349.html;
    http://www.springerlink.com/openurl.asp?genre=article&issn=0178-4617&volume=24&issue=3&spage=349},
  acknowledgement = ack-nhfb,
  fjournal = {Algorithmica. An International Journal in Computer Science},
  journal-URL = {http://link.springer.com/journal/453},
  oldlabel = {CormenC99},
  XMLdata = {ftp://ftp.informatik.uni-trier.de/pub/users/Ley/bib/records.tar.gz#journals/algorithmica/CormenC99},
}

@InProceedings{Cownie:1999:SID,
  author = {J. Cownie and W. Gropp},
  title = {A standard interface for debugger access to message queue information in {MPI}},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {51--58},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Czarnul:1999:DAP,
  author = {P. Czarnul and H. Krawczyk},
  title = {Dynamic assignment with process migration in distributed environments},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {509--516},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Dan:1999:QAM,
  author =       "Pei, Dan and Wang, Dongsheng and Zhang, Youhui and
                 Shen, Meiming",
  title =        "Quasi-asynchronous migration: a novel migration
                 protocol for {PVM} tasks",
  journal =      j-OPER-SYS-REV,
  volume =       "33",
  number =       "2",
  pages =        "5--14",
  month =        apr,
  year =         "1999",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:42 MDT 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@InProceedings{DeSande:1999:NBS,
  author = {F. {De Sande} and C. Leon and C. Rodriguez and J. Roda},
  title = {Nested bulk synchronous parallel computing},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {189--198},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Eberl:1999:PCP,
  author = {M. Eberl and W. Karl and C. Trinitis and A. Blaszczyk},
  title = {Parallel computing on {PC} clusters --- an alternative to supercomputers for industrial applications},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {493--498},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Eickermann:1999:PID,
  author = {T. Eickermann and H. Grund and J. Henrichs},
  title = {Performance issues of distributed {MPI} applications in a {German} gigabit testbed},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {3--10},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Espinosa:1999:REB,
  author = {A. Espinosa and F. Parcerisa and T. Margalef and E. Luque},
  title = {Relating the execution behaviour with the structure of the application},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {91--100},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Fang:1999:PMD,
  author = {Zhiwu Fang and A. D. J. Haymet and Wataru Shinoda and Susumu Okazaki},
  title = {Parallel molecular dynamics simulation: Implementation of {PVM} for a lipid membrane},
  journal = j-COMP-PHYS-COMM,
  volume = 116,
  number = {2--3},
  pages = {295--310},
  month = feb,
  year = 1999,
  CODEN = {CPHCBZ},
  DOI = {https://doi.org/10.1016/S0010-4655(98)00089-7},
  ISSN = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L = {0010-4655},
  bibdate = {Mon Feb 13 21:30:34 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm1990.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0010465598000897},
  acknowledgement = ack-nhfb,
  fjournal = {Computer Physics Communications},
  journal-URL = {http://www.sciencedirect.com/science/journal/00104655},
}

@InProceedings{Fava:1999:MPI,
  author = {A. Fava and M. Fava and M. Bertozzi},
  title = {{MPIPOV}: a parallel implementation of {POV-Ray} based on {MPI}},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {426--433},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Ferenc:1999:VMK,
  author = {D. Ferenc and J. Nabrzyski and M. Stroinski and P. Wierzejewski},
  title = {Visual {MPI}, a knowledge-based system for writing efficient {MPI} applications},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {257--266},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Fernandez:1999:PGP,
  author = {F. Fernandez and J. M. Sanchez and M. Tomassini and J. A. Gomez},
  title = {A parallel genetic programming tool based on {PVM}},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {241--248},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Frugoli:1999:DCH,
  author = {G. Frugoli and A. Fava and E. Fava and G. Conte},
  title = {Distributed collision handling for particle-based simulation},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {410--417},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Gallud:1999:CCU,
  author = {J. A. Gallud and J. M. Garcia and J. Garcia-Consuegra},
  title = {Cluster computing using {MPI} and {Windows NT} to solve the processing of remotely sensed imagery},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {442--449},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gallud:1999:DPR,
  author = {J. A. Gallud and J. Garcia-Consuegra and A. Martinez},
  title = {Distributed Processing of Remotely Sensed {Landsat-TM} Imagery Using {MPI}},
  journal = j-PARALLEL-DIST-COMP-PRACT,
  volume = 2,
  number = 2,
  pages = {??--??},
  month = {????},
  year = 1999,
  CODEN = {????},
  ISSN = {1097-2803},
  bibdate = {Fri Dec 19 08:14:13 MST 2003},
  bibsource = {http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no2.html;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no2abs.html#gallud},
  acknowledgement = ack-nhfb,
  fjournal = {PDCP: Parallel and Distributed Computing Practices},
}

@InProceedings{Garcia:1999:MMI,
  author = {F. Garcia and A. Calderon and J. Carretero},
  title = {{MiMPI}: a multithread-safe implementation of {MPI}},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {207--214},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Garzon:1999:PIE,
  author = {E. M. Garzon and I. Garcia},
  title = {A parallel implementation of the eigenproblem for large, symmetric and sparse matrices},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {380--387},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Getov:1999:MJM,
  author = {Vladimir Getov and Paul Gray and Vaidy Sunderam},
  title = {{MPI} and {Java-MPI}: Contrasts and Comparisons of Low-level Communication Performance},
  crossref = {ACM:1999:SPO},
  pages = {??--??},
  year = 1999,
  bibdate = {Thu Feb 24 09:02:57 2000},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.sc99.org/techpapers/},
  acknowledgement = ack-nhfb,
}

@Article{Giordano:1999:IBP,
  author = {M. Giordano and M. M. Furnari and F. Vitobello},
  title = {Interaction between {PVM} Parameters and Communication Performances on {ATM} Networks},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1557,
  pages = {586--587},
  year = 1999,
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Sep 14 06:09:05 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
  keywords = {image processing; multimedia; parallel computation;
    parallel computing; parallel numerics; ParNum},
}

@InProceedings{Godlevsky:1999:PSA,
  author = {A. Godlevsky and M. Gazak and L. Hluchy},
  title = {Parallelizing of sequential annotated programs in {PVM} environment},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {517--524},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Golebiewski:1999:HPI,
  author = {M. Golebiewski and M. Baum and R. Hempel},
  title = {High Performance Implementation of {MPI} for {Myrinet}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = 1557,
  pages = {510--521},
  year = 1999,
  CODEN = {LNCSD9},
  ISSN = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L = {0302-9743},
  bibdate = {Tue Sep 14 06:09:05 MDT 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
  keywords = {image processing; multimedia; parallel computation;
    parallel computing; parallel numerics; ParNum},
}

@InProceedings{Gonzalez:1999:PPM,
  author = {J. A. Gonzalez and C. Rodriguez and J. L. Roda and D. G. Morales},
  title = {Performance and predictability of {MPI} and {BSP} programs on the {CRAY T3E}},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {27--34},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Gropp:1999:RMM,
  author = {W. Gropp and E. Lusk},
  title = {Reproducible measurements of {MPI} performance characteristics},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {11--18},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Book{Gropp:1999:UMA,
  author = {William Gropp and Ewing Lusk and Rajeev Thakur},
  title = {Using {MPI-2}: Advanced Features of the {Message Passing Interface}},
  publisher = pub-MIT,
  address = pub-MIT:adr,
  pages = {275},
  month = nov,
  year = 1999,
  ISBN = {0-262-57133-1},
  ISBN-13 = {978-0-262-57133-3},
  LCCN = {QA76.642 .G762 1999},
  bibdate = {Fri Feb 01 06:52:50 2002},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  price = {US\$32.50},
  series = {Scientific and Engineering Computation},
  URL = {http://www.mitpress.com/book-home.tcl?isbn=0262571331},
  acknowledgement = ack-nhfb,
}

@Book{Gropp:1999:UMP,
  author =       "William Gropp and Ewing Lusk and Anthony Skjellum",
  title =        "Using {MPI}: Portable Parallel Programming with the
                 {Message Passing Interface}",
  publisher =    pub-MIT,
  address =      pub-MIT:adr,
  edition =      "Second",
  pages =        "350",
  month =        nov,
  year =         "1999",
  ISBN =         "0-262-57132-3 (vol. 1), 0-262-57134-X (set)",
  ISBN-13 =      "978-0-262-57132-6 (vol. 1), 978-0-262-57134-0 (set)",
  LCCN =         "QA76.642 .G76 1999",
  bibdate =      "Mon Sep 20 05:54:39 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  price =        "US\$32.50",
  series =       "Scientific and Engineering Computation",
  URL =          "http://www.mitpress.com/book-home.tcl?isbn=0262571323",
  acknowledgement = ack-nhfb,
}

@Article{Hempel:1999:AMP,
  author = {Rolf Hempel and Falk Zimmermann},
  title = {Automatic migration from {PARMACS} to {MPI} in parallel {Fortran} applications},
  journal = j-SCI-PROG,
  volume = 7,
  number = 1,
  pages = {39--46},
  month = {????},
  year = 1999,
  CODEN = {SCIPEV},
  ISSN = {1058-9244 (print), 1875-919X (electronic)},
  ISSN-L = {1058-9244},
  bibdate = {Thu Mar 28 12:27:27 MST 2002},
  bibsource = {Compendex database;
    http://www.iospress.nl/site/html/10589244.html;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    OCLC Article1st database},
  URL = {http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=64cr5a4mg33tuhcbdr02%26referrer=parent%26backto=issue%2C3%2C7%3Bjournal%2C8%2C9%3Blinkingpublicationresults%2C1%2C1},
  acknowledgement = ack-nhfb,
  fjournal = {Scientific Programming},
  journal-URL = {http://iospress.metapress.com/content/1058-9244},
}

@InProceedings{Hidalgo:1999:MMP,
  author = {J. I. Hidalgo and M. Prieto and J. Lanchares and F. Tirado},
  title = {A method for model parameter identification using parallel genetic algorithms},
  crossref = {Dongarra:1999:RAP},
  number = 1697,
  pages = {291--298},
  year = 1999,
  bibdate = {Thu Dec 9 06:08:35 MST 1999},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Hluchy:1999:GWF,
  author          = {L. Hluchy and V. D. Tran and L. Halada and M. Dobrucky},
  title           = {Ground water flow modelling in {PVM}},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {450--460},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Huse:1999:CCD,
  author          = {L. P. Huse},
  title           = {Collective communication on dedicated clusters of workstations},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {469--476},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Ishihara:1999:VBS,
  author          = {S. Ishihara and S. Tani and A. Takahara},
  title           = {Virtual {BUS}: a simple implementation of an effortless networking system based on {PVM}},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {461--468},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Kielmann:1999:MMC,
  author          = {Thilo Kielmann and Rutger F. H. Hofman and Henri E. Bal
                     and Aske Plaat and Raoul A. F. Bhoedjang},
  title           = {{MagPIe}: {MPI}'s collective communication operations for clustered wide area systems},
  journal         = j-SIGPLAN,
  volume          = {34},
  number          = {8},
  pages           = {131--140},
  month           = aug,
  year            = {1999},
  coden           = {SINODQ},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Sun Dec 14 09:18:06 MST 2003},
  bibsource       = {http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p131-kielmann/},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIG{\-}PLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
}

@article{Koholka:1999:MPR,
  author          = {R. Koholka and H. Mayer and A. Goller},
  title           = {{MPI}-parallelized Radiance on {SGI CoW} and {SMP}},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1557},
  pages           = {549--558},
  year            = {1999},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Tue Sep 14 06:09:05 MDT 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
  keywords        = {image processing; multimedia; parallel computation;
                     parallel computing; parallel numerics; ParNum},
}

@inproceedings{Kranzlmueller:1999:MOM,
  author          = {D. Kranzlmueller and R. Reussner and C. Schaubschlaeger},
  title           = {Monitor overhead measurement with {SKaMPI}},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {43--50},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Larsen:1999:SPG,
  author          = {M. Larsen and P. Madsen},
  title           = {A scalable parallel {Gauss--Seidel} and {Jacobi} solver for animal genetics},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {356--363},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Lee:1999:PEJ,
  author =       "Bu-Sung Lee and Yan Gu and Wentong Cai and Alfred
                 Heng",
  title =        "Performance Evaluation of {JPVM}",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "9",
  number =       "3",
  pages =        "401--??",
  month =        sep,
  year =         "1999",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Jan 6 12:02:35 MST 2005",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@article{Luo:1999:SMV,
  author          = {Yong Luo},
  title           = {Shared Memory vs. Message Passing: The {COMOPS} Benchmark Experiment},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {13},
  number          = {3},
  pages           = {283--301},
  month           = may,
  year            = {1999},
  coden           = {JOSUED},
  doi             = {https://doi.org/10.1023/A:1008009103962},
  issn            = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l          = {0920-8542},
  bibdate         = {Wed Jul 6 12:13:10 MDT 2005},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=13&issue=3;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.wkap.nl/issuetoc.htm/0920-8542+13+3+1999},
  url             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=13&issue=3&spage=283;
                     http://www.wkap.nl/oasis.htm/206582},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-url     = {http://link.springer.com/journal/11227},
  keywords        = {distributed computing; message passing; MPI;
                     performance evaluation; shared memory},
}

@inproceedings{MacFarlane:1999:PPI,
  author          = {A. MacFarlane and J. A. McCann and S. E. Robertson},
  title           = {{PLIERS}: a parallel information retrieval system using {MPI}},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {317--324},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Matuszek:1999:BPG,
  author          = {M. R. Matuszek and A. Mazurkiewicz and P. W. Uminski},
  title           = {Benchmarking the {PVM} group communication efficiency},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {499--508},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Mierendorff:1999:PMB,
  author          = {H. Mierendorff and H. Schwamborn},
  title           = {Performance modeling based on {PVM}},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {75--82},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Migliardi:1999:PEH,
  author          = {M. Migliardi and V. Sunderam},
  title           = {{PVM} emulation in the harness metacomputing system: a plug-in based approach},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {117--124},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Morimoto:1999:PEM,
  author          = {K. Morimoto and T. Matsumoto and K. Hiraki},
  title           = {Performance evaluation of the {MPI\slash MBCF} with the {NAS} parallel benchmarks},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {19--26},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Morrison:1999:FPP,
  author          = {J. P. Morrison and R. W. Connolly},
  title           = {Facilitating parallel programming in {PVM} using condensed graphs},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {181--188},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Mourao:1999:IMO,
  author          = {F. E. Mourao and J. G. Silva},
  title           = {Implementing {MPI}'s one-sided communications for {WMPI}},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {231--240},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Neyman:1999:ERP,
  author          = {M. Neyman and M. Bukowski and P. Kuzora},
  title           = {Efficient replay of {PVM} programs},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {83--90},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Nicolescu:1999:PWA,
  author          = {C. Nicolescu and B. Albers and P. Jonker},
  title           = {Parallel watershed algorithm on images from cranial {CT-scans}
                     using {PVM} and {MPI} on a distributed memory system},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {418--425},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Papagapiou:1999:NWD,
  author          = {A. Papagapiou and P. Evripidou and G. Samaras},
  title           = {{Net-Console}: a {Web}-based development environment for {MPI} programs},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {249--256},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Parrilia:1999:UPD,
  author =       "L. Parrilla and J. Ortega and A. Lloris",
  title =        "Using {PVM} for distributed logic minimization in a
                 network of computers",
  crossref =     "Dongarra:1999:RAP",
  number =       "1697",
  pages =        "541--548",
  year =         "1999",
  bibdate =      "Thu Dec 9 06:08:35 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  xxauthor =     "L. Parrilia and J. Ortega and A. Lloris",
}

@article{Pereira:1999:PBI,
  author          = {N. S. A. Pereira},
  title           = {A Parallel {$N$}-body Integrator Using {MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1573},
  pages           = {627--639},
  year            = {1999},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Tue Sep 14 06:09:05 MDT 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
  keywords        = {parallel processing; VECPAR; vector processing},
}

@inproceedings{Plazek:1999:IIC,
  author          = {J. Plazek and K. Banas and J. Kitowski},
  title           = {Implementation issues of computational fluid dynamics algorithms on parallel computers},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {349--355},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Prieto:1999:PRM,
  author          = {M. Prieto and R. Santiago and I. M. Llorente and F. Tirado},
  title           = {A parallel robust multigrid algorithm based on semi-coarsening},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {307--316},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Prylli:1999:DHP,
  author          = {L. Prylli and B. Tourancheau and R. Westrelin},
  title           = {The design for a high performance {MPI} implementation on the {Myrinet} network},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {223--230},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Rabenseifner:1999:APM,
  author          = {R. Rabenseifner},
  title           = {Automatic profiling of {MPI} applications with hardware performance counters},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {35--42},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Radhakrishna:1999:MBP,
  author          = {H. Radhakrishna and S. Divakar and N. Magotra and S. R. J. Brueck},
  title           = {{MPI}-Based Parallel Implementation of a Lithography Pattern Simulation Algorithm},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1593},
  pages           = {109--??},
  year            = {1999},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Mon Sep 13 16:57:02 MDT 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@TechReport{Roe:1999:PMI,
  author =       "Kevin Roe and Piyush Mehrotra",
  title =        "Parallelization of a multigrid incompressible viscous
                 cavity flow solver using {OpenMP}",
  type =         "{NASA} contractor report",
  number =       "NASA\slash CR-1999-209551",
  institution =  inst-NLRC,
  address =      inst-NLRC:adr,
  pages =        "????",
  year =         "1999",
  bibdate =      "Thu Mar 16 07:20:02 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Also ICASE report 99-36.",
  acknowledgement = ack-nhfb,
  keywords =     "cavity flow; incompressible flow; multigrid methods;
                 two dimensional flow; viscous flow",
}

@inproceedings{Rungsawang:1999:PDT,
  author          = {A. Rungsawang and A. Tangpong and P. Laohawee},
  title           = {Parallel {DSIR} text retrieval system},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {325--332},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Russ:1999:UHR,
  author          = {Samuel H. Russ and Jonathan Robinson and Matt Gleeson
                     and Brad Meyers and Laxman Rajagopalan and Chun-Heong Tan},
  title           = {Using {Hector} to run {MPI} programs over networked workstations},
  journal         = j-CPE,
  volume          = {11},
  number          = {4},
  pages           = {189--204},
  day             = {10},
  month           = apr,
  year            = {1999},
  coden           = {CPEXEI},
  issn            = {1040-3108},
  issn-l          = {1040-3108},
  bibdate         = {Tue Sep 7 06:06:48 MDT 1999},
  bibsource       = {http://www.interscience.wiley.com/jpages/1040-3108/;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  note            = {Special Issue: Applications of Distributed Computing Environments.},
  url             = {http://www3.interscience.wiley.com/cgi-bin/abstract?ID=61004080;
                     http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=61004080&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency, practice and experience},
}

@article{Scherer:1999:TAP,
  author          = {Alex Scherer and Honghui Lu and Thomas Gross and Willy Zwaenepoel},
  title           = {Transparent adaptive parallelism on {NOWs} using {OpenMP}},
  journal         = j-SIGPLAN,
  volume          = {34},
  number          = {8},
  pages           = {96--106},
  month           = aug,
  year            = {1999},
  coden           = {SINODQ},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Sun Dec 14 09:18:06 MST 2003},
  bibsource       = {http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
                     http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan1990.bib},
  url             = {http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p96-scherer/},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIG{\-}PLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
}

@inproceedings{Schuele:1999:HAP,
  author          = {J. Schuele},
  title           = {Heading for an asynchronous parallel ocean model},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {404--409},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@MastersThesis{Seifert:1999:ESI,
  author =       "Friedrich Seifert",
  title =        "{Entwicklung von Systemsoftware zur Integration der
                 Virtual Interface Architecture (VIA) in den Linux
                 Betriebssystemkern f{\"u}r optimiertes Message Passing}.
                 ({German}) [{Development} of system software for
                 integration of the {Virtual Interface Architecture
                 (VIA)} in the {Linux} operating system for optimized
                 message passing]",
  type =         "{Diplomarbeit}",
  school =       "Technische Universit{\"a}t Chemnitz-Zwickau",
  address =      "Chemnitz, Germany",
  pages =        "115",
  year =         "1999",
  bibdate =      "Wed Aug 27 06:25:09 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  language =     "German",
}

@article{Sen:1999:PBD,
  author          = {Vikramaditya Sen and Mrinal K. Sen and Paul L. Stoffa},
  title           = {{PVM} based {$3$-D Kirchhoff} depth migration using dynamically
                     computed travel-times: an application in seismic data processing},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {25},
  number          = {3},
  pages           = {231--248},
  day             = {22},
  month           = mar,
  year            = {1999},
  coden           = {PACOEJ},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  bibdate         = {Fri Aug 6 10:16:02 MDT 1999},
  bibsource       = {http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1999&volume=25&issue=3;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.elsevier.com/cas/tree/store/parco/sub/1999/25/3/1389.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
}

@inproceedings{Shen:1999:ATL,
  author          = {Kai Shen and Hong Tang and Tao Yang},
  title           = {Adaptive Two-level Thread Management for Fast {MPI} Execution on Shared Memory Machines},
  crossref        = {ACM:1999:SPO},
  pages           = {??--??},
  year            = {1999},
  bibdate         = {Thu Feb 24 09:02:57 2000},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sc99.org/techpapers/},
  acknowledgement = ack-nhfb,
}

@Article{Sidonio:1999:PBI,
  author =       "N. S. A. Pereira",
  title =        "A Parallel {$N$}-body Integrator Using {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1573",
  pages =        "627--639",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Sep 13 16:57:02 MDT 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
  xxauthor =     "N. Sidonio and A. Pereira",
  xxnote =       "Same paper as entry Pereira:1999:PBI (identical title,
                 volume, and starting page); this key is retained so
                 that existing citations still resolve.",
  xxpages =      "627--??",
}

@inproceedings{Silva:1999:DPP,
  author          = {F. Silva and H. Paulino and L. Lopes},
  title           = {{DipSystem}: a parallel programming system for distributed memory architectures},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {525--532},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Silva:1999:IME,
  author          = {P. Silva and J. G. Silva},
  title           = {Implementing {MPI-2} extended collective operations},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {125--132},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Sistare:1999:MSP,
  author          = {Steve Sistare and Erica Dorenkamp and Nick Nevin},
  title           = {{MPI} Support in the {Prism} Programming Environment},
  crossref        = {ACM:1999:SPO},
  pages           = {??--??},
  year            = {1999},
  bibdate         = {Thu Feb 24 09:02:57 2000},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sc99.org/techpapers/},
  acknowledgement = ack-nhfb,
}

@inproceedings{Sistare:1999:OMC,
  author          = {Steve Sistare and Rolf vandeVaart and Eugene Loh},
  title           = {Optimization of {MPI} Collectives on Clusters of Large-scale {SMPs}},
  crossref        = {ACM:1999:SPO},
  pages           = {??--??},
  year            = {1999},
  bibdate         = {Thu Feb 24 09:02:57 2000},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sc99.org/techpapers/},
  acknowledgement = ack-nhfb,
}

@inproceedings{Stankovic:1999:NVJ,
  author          = {N. Stankovic and K. Zhang},
  title           = {Native versus {Java} message passing},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {165--172},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Szeberenyi:1999:SGB,
  author          = {I. Szeberenyi and G. Domokos},
  title           = {Solving generalized boundary value problems with distributed computing and recursive programming},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {267--274},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Takahashi:1999:IEM,
  author          = {T. Takahashi and F. O'Carroll and H. Tezuka and A. Hori},
  title           = {Implementation and Evaluation of {MPI} on an {SMP} Cluster},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1586},
  pages           = {1178--??},
  year            = {1999},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Mon Sep 13 16:57:02 MDT 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Tang:1999:CRT,
  author          = {Hong Tang and Kai Shen and Tao Yang},
  title           = {Compile\slash run-time support for threaded {MPI} execution
                     on multiprogrammed shared memory machines},
  journal         = j-SIGPLAN,
  volume          = {34},
  number          = {8},
  pages           = {107--118},
  month           = aug,
  year            = {1999},
  coden           = {SINODQ},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Sun Dec 14 09:18:06 MST 2003},
  bibsource       = {http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p107-tang/},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIG{\-}PLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
}

@article{Throop:1999:SOS,
  author          = {Joe Throop},
  title           = {Standards: {OpenMP}: Shared-Memory Parallelism from the Ashes},
  journal         = j-COMPUTER,
  volume          = {32},
  number          = {5},
  pages           = {108--109},
  month           = may,
  year            = {1999},
  coden           = {CPTRB4},
  issn            = {0018-9162 (print), 1558-0814 (electronic)},
  issn-l          = {0018-9162},
  bibdate         = {Thu May 6 06:17:23 MDT 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/computer1990.bib;
                     http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://dlib.computer.org/co/books/co1999/pdf/r5108.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer},
  journal-url     = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2},
}

@inproceedings{Tourino:1999:MMC,
  author          = {J. Touri{\~n}o and R. Doallo},
  title           = {Modeling {MPI} collective communications on the {AP3000 Multicomputer}},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {133--140},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Traeff:1999:FFE,
  author =       "J. L. Traeff and R. Hempel and H. Ritzdorf and F.
                 Zimmermann",
  title =        "Flattening on the fly: {Efficient} handling of {MPI}
                 derived datatypes",
  crossref =     "Dongarra:1999:RAP",
  number =       "1697",
  pages =        "109--116",
  year =         "1999",
  bibdate =      "Thu Dec 9 06:08:35 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@inproceedings{Vazquez:1999:PNS,
  author          = {G. E. Vazquez and N. B. Brignole},
  title           = {Parallel {NLP} strategies using {PVM} on heterogeneous distributed environments},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {533--540},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Wisniewski:1999:SME,
  author          = {Len Wisniewski and Brad Smisloff and Nils Nieuwejaar},
  title           = {{Sun MPI I/O}: Efficient {I/O} for Parallel Applications},
  crossref        = {ACM:1999:SPO},
  pages           = {??--??},
  year            = {1999},
  bibdate         = {Thu Feb 24 09:02:57 2000},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sc99.org/techpapers/},
  acknowledgement = ack-nhfb,
}

@inproceedings{Wong:1999:BMM,
  author          = {F. C. Wong and A. C. Arpaci-Dusseau and D. E. Culler},
  title           = {Building {MPI} for multi-programming systems using implicit information},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {215--222},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Wu:1999:JBD,
  author =       "X. Wu and Q. Chen and X.-H. Sun",
  title =        "A {Java}-based Distributed Debugger Supporting {MPI}
                 and {PVM}",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "2",
  number =       "4",
  pages =        "??--??",
  month =        "????",
  year =         "1999",
  CODEN =        "????",
  ISSN =         "1097-2803",
  ISSN-L =       "1097-2803",
  bibdate =      "Fri Dec 19 08:14:14 MST 2003",
  bibsource =    "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no4.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no4abs.html#wu",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@inproceedings{Wu:1999:MCC,
  author          = {P.-Y. Wu},
  title           = {Minimum communication cost fractal image compression on {PVM}},
  crossref        = {Dongarra:1999:RAP},
  number          = {1697},
  pages           = {434--441},
  year            = {1999},
  bibdate         = {Thu Dec 9 06:08:35 MST 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zaki:1999:TSP,
  author =       "Omer Zaki and Ewing Lusk and William Gropp and Deborah
                 Swider",
  title =        "Toward Scalable Performance Visualization with
                 {Jumpshot}",
  journal =      j-IJHPCA,
  volume =       "13",
  number =       "3",
  pages =        "277--288",
  month =        "Fall",
  year =         "1999",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Wed Jul 28 14:14:38 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
  keywords =     "Java; MPI (Message Passing Interface) profiling",
}

@article{Zoraja:1999:SPD,
  author          = {Ivan Zoraja and Hermann Hellwagner and Vaidy Sunderam},
  title           = {{SCIPVM}: {Parallel} distributed computing on {SCI} workstation clusters},
  journal         = j-CPE,
  volume          = {11},
  number          = {3},
  pages           = {121--138},
  month           = mar,
  year            = {1999},
  coden           = {CPEXEI},
  issn            = {1040-3108},
  issn-l          = {1040-3108},
  bibdate         = {Tue Sep 7 06:06:47 MDT 1999},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  url             = {http://www3.interscience.wiley.com/cgi-bin/abstract?ID=61003667;
                     http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=61003667&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency, practice and experience},
}

@Misc{Beguelin:19xx:PSS,
  author = {A. Beguelin and J. J. Dongarra and G. A. Geist and R. Manchek
    and V. S. Sunderam},
  title = {{PVM} Software System and Documentation},
  howpublished = {Email to {\tt netlib@ornl.gov}},
  month = {????},
  year = {19xx},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    Parallel/Par.Arch.Indep.bib},
}

@TechReport{Geist:19xx:NBC,
  author = {G. A. Geist and V. S. Sunderam},
  title = {Network Based Concurrent Computing on the {PVM} System},
  institution = inst-ORNL # " and " # inst-EMORY,
  address = inst-ORNL:adr # " and " # inst-EMORY:adr,
  year = {19xx},
  bibsource = {Distributed/Dist.Sys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@Article{Adhianto:2000:TOA,
  author = {L. Adhianto and F. Bodin and B. Chapman and L. Hascoet and
    A. Kneer and D. Lancaster and I. Wolton and M. Wirtz},
  title = {Tools for {OpenMP} application development: the {POST} project},
  journal = j-CPE,
  volume = {12},
  number = {12},
  pages = {1177--1191},
  month = oct,
  year = {2000},
  coden = {CPEXEI},
  doi = {https://doi.org/10.1002/1096-9128(200010)12:12<1177::AID-CPE533>3.0.CO;2-V},
  issn = {1040-3108},
  issn-l = {1040-3108},
  bibdate = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource = {http://www.interscience.wiley.com/jpages/1040-3108;
    http://www.math.utah.edu/pub/tex/bib/cpe.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  url = {http://www3.interscience.wiley.com/cgi-bin/abstract/76500357/START;
    http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500357&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Concurrency, practice and experience},
}

@Article{Anonymous:2000:BRUd,
  author = {Anonymous},
  title = {Book Review: {{\booktitle{Using MPI-2: Advanced features of the
    message-passing interface}}: By William Gropp, Ewing Lusk and Rajeev
    Thakur. The MIT Press, Cambridge, MA. (1999). 382 pages. \$35 (each);
    \$60 (set)}},
  journal = j-COMPUT-MATH-APPL,
  volume = {40},
  number = {2--3},
  pages = {419--419},
  month = jul # "\slash " # aug,
  year = {2000},
  coden = {CMAPDK},
  issn = {0898-1221 (print), 1873-7668 (electronic)},
  issn-l = {0898-1221},
  bibdate = {Wed Mar 1 21:49:10 MST 2017},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/computmathappl2000.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0898122100902098},
  acknowledgement = ack-nhfb,
  fjournal = {Computers and Mathematics with Applications},
  journal-url = {http://www.sciencedirect.com/science/journal/08981221},
}

@Article{Anonymous:2000:BRUe,
  author = {Anonymous},
  title = {Book Review: {{\booktitle{Using MPI: Portable parallel
    programming with the message-passing interface}}: Second edition. By
    William Gropp, Ewing Lusk and Anthony Skjellum. The MIT Press,
    Cambridge, MA. (1999). 371 pages. \$35 (each); \$60 (set)}},
  journal = j-COMPUT-MATH-APPL,
  volume = {40},
  number = {2--3},
  pages = {419--419},
  month = jul # "\slash " # aug,
  year = {2000},
  coden = {CMAPDK},
  issn = {0898-1221 (print), 1873-7668 (electronic)},
  issn-l = {0898-1221},
  bibdate = {Wed Mar 1 21:49:10 MST 2017},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/computmathappl2000.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0898122100902074},
  acknowledgement = ack-nhfb,
  fjournal = {Computers and Mathematics with Applications},
  journal-url = {http://www.sciencedirect.com/science/journal/08981221},
}

@Article{Armstrong:2000:QDB,
  author =       "Brian Armstrong and Seon Wook Kim and Rudolf
                 Eigenmann",
  title =        "Quantifying Differences between {OpenMP} and {MPI}
                 Using a Large-Scale Application Suite",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1940",
  pages =        "482--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:17:15 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400482.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1940/19400482.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Astalos:2000:CMS,
  author = {J{\'a}n Astalos and Ladislav Hluch{\'y}},
  title = {{CIS} --- a Monitoring System for {PC} Clusters},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {225--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080225.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080225.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Baiardi:2000:AMM,
  author = {Fabrizio Baiardi and Sarah Chiti and Paolo Mori and Laura Ricci},
  title = {Adaptive Multigrid Methods in {MPI}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {80--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080080.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080080.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Berrendorf:2000:PCO,
  author = {Rudolf Berrendorf and Guido Nieken},
  title = {Performance characteristics for {OpenMP} constructs on different
    parallel computer architectures},
  journal = j-CPE,
  volume = {12},
  number = {12},
  pages = {1261--1273},
  month = oct,
  year = {2000},
  coden = {CPEXEI},
  doi = {https://doi.org/10.1002/1096-9128(200010)12:12<1261::AID-CPE525>3.0.CO;2-5},
  issn = {1040-3108},
  issn-l = {1040-3108},
  bibdate = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource = {http://www.interscience.wiley.com/jpages/1040-3108;
    http://www.math.utah.edu/pub/tex/bib/cpe.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  url = {http://www3.interscience.wiley.com/cgi-bin/abstract/76500355/START;
    http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500355&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Concurrency, practice and experience},
}

@InProceedings{Bircsak:2000:EONa,
  author = {John Bircsak and Peter Craig and RaeLyn Crowell and Zarka
    Cvetanovic and Jonathan Harris and C. Alexander Nelson and Carl D.
    Offner},
  title = {Extending {OpenMP} for {NUMA} Machines},
  crossref = {ACM:2000:SHP},
  pages = {68--69},
  year = {2000},
  bibdate = {Mon Feb 12 12:29:55 2001},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/supercomputing2000.bib},
  url = {http://www.sc2000.org/proceedings/techpapr/papers/pap226.pdf},
  acknowledgement = ack-nhfb,
}

@Article{Bircsak:2000:EONb,
  author = {John Bircsak and Peter Craig and RaeLyn Crowell and others},
  title = {Extending {OpenMP} for {NUMA} machines},
  journal = j-SCI-PROG,
  volume = {8},
  number = {3},
  pages = {163--181},
  year = {2000},
  coden = {SCIPEV},
  issn = {1058-9244 (print), 1875-919X (electronic)},
  issn-l = {1058-9244},
  bibdate = {Thu Mar 28 08:44:35 MST 2002},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
    OCLC Article1st database},
  acknowledgement = ack-nhfb,
  fjournal = {Scientific Programming},
  journal-url = {http://iospress.metapress.com/content/1058-9244},
}

@Article{Bolloni:2000:TIQ,
  author = {Alessandro Bolloni and Stefano Crocchianti and Antonio
    Lagan{\`a}},
  title = {Time Independent {$3$D} Quantum Reactive Scattering on {MIMD}
    Parallel Computers},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {338--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080338.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080338.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Bolton:2000:MPL,
  author = {Hermanus P. J. Bolton and Jaco F. Schutte and Albert A.
    Groenwold},
  title = {Multiple Parallel Local Searches in Global Optimization},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {88--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080088.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080088.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@InProceedings{Booth:2000:SSM,
  author = {S. Booth and E. Mourao},
  title = {Single-sided {MPI} Implementations for {SUN MPI}},
  crossref = {ACM:2000:SHP},
  pages = {46--46},
  year = {2000},
  bibdate = {Mon Feb 12 11:57:40 2001},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sc2000.org/proceedings/techpapr/papers/pap182.pdf},
  acknowledgement = ack-nhfb,
}

@Article{Bova:2000:DLP,
  author =       "Steve W. Bova and Clay P. Breshears and Christine E.
                 Cuicchi and Zeki Demirbilek and Henry A. Gabb",
  title =        "Dual-Level Parallel Analysis of Harbor Wave Response
                 Using {MPI} and {OpenMP}",
  journal =      j-IJHPCA,
  volume =       "14",
  number =       "1",
  pages =        "49--64",
  month =        "Spring",
  year =         "2000",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Sep 12 12:39:11 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Brieger:2000:HOO,
  author = {Leesa Brieger},
  title = {{HPF} to {OpenMP} on the {Origin2000}: a case study},
  journal = j-CPE,
  volume = {12},
  number = {12},
  pages = {1147--1154},
  month = oct,
  year = {2000},
  coden = {CPEXEI},
  doi = {https://doi.org/10.1002/1096-9128(200010)12:12<1147::AID-CPE526>3.0.CO;2-Q},
  issn = {1040-3108},
  issn-l = {1040-3108},
  bibdate = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource = {http://www.interscience.wiley.com/jpages/1040-3108;
    http://www.math.utah.edu/pub/tex/bib/cpe.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  url = {http://www3.interscience.wiley.com/cgi-bin/abstract/76500351/START;
    http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500351&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Concurrency, practice and experience},
}

@Article{Brorsson:2000:SIE,
  author = {Mats Brorsson and Barbara Chapman},
  title = {Special Issue: {EWOMP'99 --- First European Workshop on OpenMP}},
  journal = j-CPE,
  volume = {12},
  number = {12},
  pages = {1117--1119},
  month = oct,
  year = {2000},
  coden = {CPEXEI},
  doi = {https://doi.org/10.1002/1096-9128(200010)12:12<1117::AID-CPE543>3.0.CO;2-#},
  issn = {1040-3108},
  issn-l = {1040-3108},
  bibdate = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource = {http://www.interscience.wiley.com/jpages/1040-3108;
    http://www.math.utah.edu/pub/tex/bib/cpe.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  url = {http://www3.interscience.wiley.com/cgi-bin/abstract/76500352/START;
    http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500352&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Concurrency, practice and experience},
}

@InProceedings{Bruno:2000:PEH,
  author = {G. Bruno and A. A. Chien and M. J. Katz and P. M. Papadopoulos},
  title = {Performance Enhancements for {HPVM} in Multi-Network and
    Heterogeneous Hardware},
  crossref = {Engquist:2000:SVG},
  pages = {17--32},
  year = {2000},
  bibdate = {Mon Oct 23 10:53:54 2000},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Brunschen:2000:OCP,
  author = {Christian Brunschen and Mats Brorsson},
  title = {{OdinMP\slash CCp} --- a portable implementation of {OpenMP} for
    {C}},
  journal = j-CPE,
  volume = {12},
  number = {12},
  pages = {1193--1203},
  month = oct,
  year = {2000},
  coden = {CPEXEI},
  doi = {https://doi.org/10.1002/1096-9128(200010)12:12<1193::AID-CPE527>3.0.CO;2-U},
  issn = {1040-3108},
  issn-l = {1040-3108},
  bibdate = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource = {http://www.interscience.wiley.com/jpages/1040-3108;
    http://www.math.utah.edu/pub/tex/bib/cpe.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  url = {http://www3.interscience.wiley.com/cgi-bin/abstract/76500347/START;
    http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500347&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Concurrency, practice and experience},
}

@Article{Bubak:2000:IOB,
  author =       "Marian Bubak and W{\l}odzimierz Funika and Bartosz
                 Balis and Roland Wism{\"u}ller",
  title =        "Interoperability of {OCM}-Based On-Line Tools",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1908",
  pages =        "242--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:30:27 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080242.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1908/19080242.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Bull:2000:JOL,
  author = {J. M. Bull and M. E. Kambites},
  editor = {????},
  booktitle = {{Proceedings of the ACM 2000 conference on Java Grande}},
  title = {{JOMP}: an {OpenMP}-like interface for {Java}},
  publisher = pub-ACM,
  address = pub-ACM:adr,
  pages = {44--53},
  year = {2000},
  bibdate = {Mon Oct 07 09:19:42 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/java2000.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Bull:2000:PPJ,
  author = {J. Mark Bull and Mark E. Kambites and Jan Obdrzalek},
  title = {Parallel Programming in {Java} with {OpenMP}-like Directives},
  crossref = {ACM:2000:SHP},
  pages = {150--150},
  year = {2000},
  bibdate = {Sat Feb 10 14:28:55 2001},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/supercomputing2000.bib},
  acknowledgement = ack-nhfb,
}

@Article{Butler:2000:SPM,
  author = {Ralph Butler and William Gropp and Ewing Lusk},
  title = {A Scalable Process-Management Environment for Parallel Programs},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {168--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080168.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080168.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@InCollection{Cahir:2000:PMM,
  author = {Margaret Cahir and Robert Moench and Alice E. Koniges},
  title = {Programming Models and Methods},
  crossref = {Koniges:2000:ISP},
  chapter = {3},
  pages = {27--54},
  year = {2000},
  bibdate = {Fri Feb 04 18:32:51 2000},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note = {Discusses PVM, MPI, SHMEM, High-Performance Fortran, and POSIX
    threads.},
  acknowledgement = ack-nhfb,
}

@InProceedings{Cappello:2000:MVM,
  author = {Franck Cappello and Daniel Etiemble},
  title = {{MPI} versus {MPI+OpenMP} on the {IBM SP} for the {NAS
    Benchmarks}},
  crossref = {ACM:2000:SHP},
  pages = {51--51},
  year = {2000},
  bibdate = {Mon Feb 12 11:57:42 2001},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/supercomputing2000.bib},
  url = {http://www.sc2000.org/proceedings/techpapr/papers/pap214.pdf},
  acknowledgement = ack-nhfb,
}

@Article{Carpenter:2000:MML,
  author = {Bryan Carpenter and Vladimir Getov and Glenn Judd and Anthony
    Skjellum and Geoffrey Fox},
  title = {{MPJ}: {MPI}-like message passing for {Java}},
  journal = j-CPE,
  volume = {12},
  number = {11},
  pages = {1019--1038},
  month = sep,
  year = {2000},
  coden = {CPEXEI},
  doi = {https://doi.org/10.1002/1096-9128(200009)12:11<1019::AID-CPE518>3.0.CO;2-G},
  issn = {1040-3108},
  issn-l = {1040-3108},
  bibdate = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource = {http://www.interscience.wiley.com/jpages/1040-3108;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  url = {http://www3.interscience.wiley.com/cgi-bin/abstract/76000188/START;
    http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76000188&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Concurrency, practice and experience},
}

@Article{Carpenter:2000:OSM,
  author = {Bryan Carpenter and Geoffrey Fox and Sung Hoon Ko and Sang Lim},
  title = {Object serialization for marshaling data in a {Java} interface
    to {MPI}},
  journal = j-CPE,
  volume = {12},
  number = {7},
  pages = {539--553},
  month = may,
  year = {2000},
  coden = {CPEXEI},
  doi = {https://doi.org/10.1002/1096-9128(200005)12:7<539::AID-CPE498>3.0.CO;2-H},
  issn = {1040-3108},
  issn-l = {1040-3108},
  bibdate = {Sun Oct 29 16:57:07 MST 2000},
  bibsource = {http://www.interscience.wiley.com/jpages/1040-3108;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  url = {http://www3.interscience.wiley.com/cgi-bin/abstract/72516217/START;
    http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=72516217&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Concurrency, practice and experience},
}

@InProceedings{Cartwright:2000:AOE,
  author = {Keith L. Cartwright and Joseph D. Blahovec},
  title = {Adding {OpenMP} to an Existing {MPI} Code: Will It be
    Beneficial?},
  crossref = {ACM:2000:SHP},
  pages = {145--145},
  year = {2000},
  bibdate = {Sat Feb 10 14:28:55 2001},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Chen:2000:MCO,
  author = {Hsiang Ann Chen and Yvette O. Carrasco and Amy W. Apon},
  title = {{MPI} Collective Operations over {IP} Multicast},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1800},
  pages = {51--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 09:16:18 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000051.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1800/18000051.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Ciaccio:2000:GMG,
  author = {Giuseppe Ciaccio and Giovanni Chiola},
  title = {{GAMMA} and {MPI\slash GAMMA} on Gigabit {Ethernet}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {129--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080129.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080129.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Cotronis:2000:CMP,
  author = {J. Y. Cotronis and Z. Tsiatsoulis and C. Kouniakis},
  title = {Composition of Message Passing Applications On-Demand},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {192--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080192.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080192.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Couturier:2000:PMD,
  author = {Rapha{\"e}l Couturier and Christophe Chipot},
  title = {Parallel molecular dynamics using {OpenMP} on a shared memory
    machine},
  journal = j-COMP-PHYS-COMM,
  volume = {124},
  number = {1},
  pages = {49--59},
  day = {15},
  month = jan,
  year = {2000},
  coden = {CPHCBZ},
  doi = {https://doi.org/10.1016/S0010-4655(99)00432-4},
  issn = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l = {0010-4655},
  bibdate = {Mon Feb 13 23:40:32 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0010465599004324},
  acknowledgement = ack-nhfb,
  fjournal = {Computer Physics Communications},
  journal-url = {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Domokos:2000:PRC,
  author = {G{\'a}bor Domokos and Imre Szeber{\'e}nyi and Paul H. Steen},
  title = {Parallel, Recursive Computation of Global Stability Charts for
    Liquid Bridges},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {64--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080064.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080064.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Dozsa:2000:THL,
  author = {G{\'a}bor D{\'o}zsa and D{\'a}niel Dr{\'o}tos and R{\'o}bert
    Lovas},
  title = {Translation of a High-Level Graphical Code to Message-Passing
    Primitives in the {GRADE} Programming Environment},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {258--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080258.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080258.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@InProceedings{Eigenmann:2000:TMPa,
  author = {Rudolf Eigenmann and Tim Mattson},
  title = {Tutorial {M6A}: Parallel Programming with {OpenMP}: {Part I}},
  crossref = {ACM:2000:SHP},
  pages = {21--21},
  year = {2000},
  bibdate = {Sat Feb 10 14:28:55 2001},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/supercomputing2000.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Eigenmann:2000:TMPb,
  author = {Rudolf Eigenmann and Tim Mattson},
  title = {Tutorial {M6B}: Parallel Programming with {OpenMP}: {Part II}},
  crossref = {ACM:2000:SHP},
  pages = {23--23},
  year = {2000},
  bibdate = {Sat Feb 10 14:28:55 2001},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/supercomputing2000.bib},
  acknowledgement = ack-nhfb,
}

@Article{Espinosa:2000:APA,
  author = {Antonio Espinosa and Tomas Margalef and Emilio Luque},
  title = {Automatic Performance Analysis of Master\slash Worker {PVM}
    Applications with {Kpi}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {47--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080047.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080047.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Fagg:2000:AAC,
  author = {Graham E. Fagg and Sathish S. Vadhiyar and Jack J. Dongarra},
  title = {{ACCT}: {Automatic Collective Communications Tuning}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {1908},
  pages = {354--??},
  year = {2000},
  coden = {LNCSD9},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Fri Feb 1 08:30:27 MST 2002},
  bibsource = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080354.htm;
    http://link.springer-ny.com/link/service/series/0558/papers/1908/19080354.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {Lecture Notes in Computer Science},
}

@Article{Fagg:2000:FMF,
  author          = {Graham E. Fagg and Jack J. Dongarra},
  title           = {{FT-MPI}: {Fault Tolerant MPI}, Supporting Dynamic
                     Applications in a Dynamic World},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {346--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080346.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080346.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Fahringer:2000:FOP,
  author          = {Thomas Fahringer and Michael Gerndt and Graham Riley
                     and Jesper Larsson Tr{\"a}ff},
  title           = {Formalizing {OpenMP} Performance Properties with
                     {ASL}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1940},
  pages           = {428--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400428.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1940/19400428.pdf},
  bibdate         = {Fri Feb 1 09:17:15 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Fernandez:2000:DCE,
  author          = {Francisco Fern{\'a}ndez and Marco Tomassini and
                     Leonardo Vanneschi and Laurent Bucher},
  title           = {A Distributed Computing Environment for Genetic
                     Programming Using {MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {322--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080322.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080322.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Fernandez:2000:UPM,
  author          = {Gustavo J. Fern{\'a}ndez and Julio Jacobo-Berlles and
                     Patricia Borensztejn and Marisa Bauz{\'a} and Marta
                     Mejail},
  title           = {Use of {PVM} for {MAP} Image Restoration: a Parallel
                     Implementation of the {ARTUR} Algorithm},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {113--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080113.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080113.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Fink:2000:IMC,
  author          = {Torsten Fink},
  title           = {Integrating {MPI} Components into Metacomputing
                     Applications},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {208--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080208.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080208.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Geist:2000:PMW,
  author          = {Al Geist},
  title           = {{PVM} and {MPI}: What Else Is Needed for Cluster
                     Computing?},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {1--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080001.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080001.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Girona:2000:VDC,
  author          = {Sergi Girona and Jes{\'u}s Labarta and Rosa M. Badia},
  title           = {Validation of Dimemas Communication Model for {MPI}
                     Collective Operations},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {39--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080039.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080039.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Gonzalez:2000:AIT,
  author =       "M. Gonz{\`a}lez and A. Serra and X. Martorell and J.
                 Oliver and E. Ayguad{\'e} and J. Labarta and N. Navarro",
  editor =       "????",
  booktitle =    "{Proceedings 14th International Parallel and
                 Distributed Processing Symposium. IPDPS 2000}",
  title =        "Applying interposition techniques for performance
                 analysis of {OpenMP} parallel applications",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "235--240",
  year =         "2000",
  bibdate =      "Mon Oct 07 09:07:07 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Gonzalez:2000:NSF,
  author          = {Marc Gonz{\`a}lez and Eduard Ayguad{\'e} and Xavier
                     Martorell and Jes{\'u}s Labarta and Nacho Navarro and
                     Jos{\'e} Oliver},
  title           = {{NanosCompiler}: supporting flexible multilevel
                     parallelism exploitation in {OpenMP}},
  journal         = j-CPE,
  fjournal        = {Concurrency, practice and experience},
  volume          = {12},
  number          = {12},
  pages           = {1205--1218},
  month           = oct,
  year            = {2000},
  CODEN           = {CPEXEI},
  DOI             = {https://doi.org/10.1002/1096-9128(200010)12:12<1205::AID-CPE524>3.0.CO;2-2},
  ISSN            = {1040-3108},
  ISSN-L          = {1040-3108},
  URL             = {http://www3.interscience.wiley.com/cgi-bin/abstract/76500358/START;
                     http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500358&PLACEBO=IE.pdf},
  bibdate         = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource       = {http://www.interscience.wiley.com/jpages/1040-3108;
                     http://www.math.utah.edu/pub/tex/bib/cpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  acknowledgement = ack-nhfb,
}

@Article{Gonzalez:2000:PAM,
  author          = {Daniel Gonz{\'a}lez and Francisco Almeida and Luz
                     Marina Moreno and Casiano Rodr{\'\i}guez},
  title           = {Pipeline Algorithms on {MPI}: Optimal Mapping of the
                     Path Planing Problem},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {104--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080104.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080104.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gonzalez:2000:TSN,
  author          = {J. A. Gonz{\'a}lez and C. Le{\'o}n and F. Piccoli and
                     M. Printista and J. L. Roda and C. Rodr{\'\i}guez and
                     F. Sande},
  title           = {Towards Standard Nested Parallelism},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {96--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080096.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080096.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gropp:2000:RCD,
  author          = {William D. Gropp},
  title           = {Runtime Checking of Datatype Signatures in {MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {160--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080160.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080160.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Gropp:2000:TSU,
  author          = {William Gropp and Ewing (Rusty) Lusk and Rajeev S.
                     Thakur},
  title           = {Tutorial {S1}: Using {MPI-2}: a Tutorial on Advanced
                     Features of the Message-Passing Interface},
  crossref        = {ACM:2000:SHP},
  pages           = {11--11},
  year            = {2000},
  bibdate         = {Sat Feb 10 14:28:55 2001},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Book{Gropp:2000:UMA,
  author          = {William Gropp and Ewing Lusk and Rajeev Thakur},
  title           = {Using {MPI-2}: Advanced Features of the {Message
                     Passing Interface}},
  series          = {Scientific and engineering computation},
  publisher       = pub-MIT,
  address         = pub-MIT:adr,
  pages           = {xxi + 382},
  year            = {2000},
  ISBN            = {0-262-57133-1},
  ISBN-13         = {978-0-262-57133-3},
  LCCN            = {QA76.642 .G762 1999},
  bibdate         = {Wed Aug 27 06:19:05 2003},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{He:2000:PAA,
  author          = {Yun (Helen) He and Chris H. Q. Ding},
  title           = {Platforms: An Accurate Arithmetics Approach},
  crossref        = {ACM:2000:SHP},
  pages           = {150--150},
  year            = {2000},
  keywords        = {floating-point arithmetic; rounding errors},
  abstract        = {Numerical reproducibility of large-scale scientific
                     simulations, especially climate modeling, on
                     distributed memory parallel computers are becoming
                     critical issues. In particular, global summation and
                     dot products of distributed arrays are very
                     susceptible to rounding errors. We analyzed several
                     accurate summation methods and found that two methods
                     are particularly effective to improve (ensure)
                     reproducibility: Kahan's self-compensated summation
                     and Bailey's double-double precision summation. We
                     provide an MPI operator MPI\_SUMDD to work with MPI
                     collective operations to ensure a scalable
                     implementation on large number of processors. The
                     final methods are particularly simple to adopt in
                     practical codes.},
  bibdate         = {Sat Feb 10 14:28:55 2001},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/supercomputing2000.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{He:2000:UAA,
  author =       "Yun He and Chris H. Q. Ding",
  title =        "Using accurate arithmetics to improve numerical
                 reproducibility and stability in parallel
                 applications",
  crossref =     "Reynders:2000:IPI",
  pages =        "225--234",
  year =         "2000",
  DOI =          "https://doi.org/10.1145/335231.335253",
  bibdate =      "Sat Feb 8 18:35:50 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/335231.335253",
  abstract =     "Numerical reproducibility and stability of large scale
                 scientific simulations, especially climate modeling, on
                 distributed memory parallel computers are becoming
                 critical issues. In particular, global summation of
                 distributed arrays is most susceptible to rounding
                 errors, and their propagation and accumulation cause
                 uncertainty in final simulation results. We analyzed
                 several accurate summation methods and found that two
                 methods are particularly effective to improve (ensure)
                 reproducibility and stability: Kahan's self-compensated
                 summation and Bailey's double-double precision
                 summation. We provide an MPI operator MPI\_SUMDD to work
                 with MPI collective operations to ensure a scalable
                 implementation on large number of processors. The final
                 methods are particularly simple to adopt in practical
                 codes.",
  acknowledgement = ack-nhfb,
}

@Article{Hisley:2000:PPE,
  author          = {Dixie Hisley and Gagan Agrawal and Punyam
                     Satya-narayana and Lori Pollock},
  title           = {Porting and performance evaluation of irregular codes
                     using {OpenMP}},
  journal         = j-CPE,
  fjournal        = {Concurrency, practice and experience},
  volume          = {12},
  number          = {12},
  pages           = {1241--1259},
  month           = oct,
  year            = {2000},
  CODEN           = {CPEXEI},
  DOI             = {https://doi.org/10.1002/1096-9128(200010)12:12<1241::AID-CPE523>3.0.CO;2-D},
  ISSN            = {1040-3108},
  ISSN-L          = {1040-3108},
  URL             = {http://www3.interscience.wiley.com/cgi-bin/abstract/76500349/START;
                     http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500349&PLACEBO=IE.pdf},
  bibdate         = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource       = {http://www.interscience.wiley.com/jpages/1040-3108;
                     http://www.math.utah.edu/pub/tex/bib/cpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  acknowledgement = ack-nhfb,
}

@Article{Hu:2000:ONS,
  author          = {Y. Charlie Hu and Honghui Lu and Alan L. Cox and Willy
                     Zwaenepoel},
  title           = {{OpenMP} for Networks of {SMPs}},
  journal         = j-J-PAR-DIST-COMP,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
  volume          = {60},
  number          = {12},
  pages           = {1512--1530},
  day             = {1},
  month           = dec,
  year            = {2000},
  CODEN           = {JPDCER},
  DOI             = {https://doi.org/10.1006/jpdc.2000.1658},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  URL             = {http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1658;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1658/pdf;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1658/ref},
  bibdate         = {Tue Jul 17 08:06:43 MDT 2001},
  bibsource       = {http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                     http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Huse:2000:MOS,
  author          = {Lars Paul Huse},
  title           = {{MPI} Optimization for {SMP} Based Clusters
                     Interconnected with {SCI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {56--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080056.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080056.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Imamura:2000:ASM,
  author          = {Toshiyuki Imamura and Yuichi Tsujita and Hiroshi Koide
                     and Hiroshi Takemiya},
  title           = {An Architecture of {Stampi}: {MPI} Library on a
                     Cluster of Parallel Computers},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {200--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080200.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080200.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ishizaka:2000:CGT,
  author          = {Kazuhisa Ishizaka and Motoki Obata and Hironori
                     Kasahara},
  title           = {Coarse-Grain Task Parallel Processing Using the
                     {OpenMP} Backend of the {OSCAR} Multigrain
                     Parallelizing Compiler},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1940},
  pages           = {457--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400457.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1940/19400457.pdf},
  bibdate         = {Fri Feb 1 09:17:15 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Iskra:2000:IDE,
  author =       "K. A. Iskra and F. van der Linden and Z. W. Hendrikse
                 and B. J. Overeinder and G. D. van Albada and P. M. A.
                 Sloot",
  title =        "The implementation of {Dynamite}: an environment for
                 migrating {PVM} tasks",
  journal =      j-OPER-SYS-REV,
  volume =       "34",
  number =       "3",
  pages =        "40--55",
  month =        jul,
  year =         "2000",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:47 MDT 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Iskra:2000:PMD,
  author          = {K. A. Iskra and Z. W. Hendrikse and G. D. van Albada
                     and B. J. Overeinder and P. M. A. Sloot},
  title           = {Performance Measurements on {Dynamite\slash DPVM}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {27--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080027.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080027.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Jin:2000:AGO,
  author          = {Haoqiang Jin and Michael Frumkin and Jerry Yan},
  title           = {Automatic Generation of {OpenMP} Directives and Its
                     Application to Computational Fluid Dynamics Codes},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1940},
  pages           = {440--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400440.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1940/19400440.pdf},
  bibdate         = {Fri Feb 1 09:17:15 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Kuhn:2000:OVT,
  author          = {Bob Kuhn and Paul Petersen and Eamonn O'Toole},
  title           = {{OpenMP} versus threading in {C\slash C++}},
  journal         = j-CPE,
  fjournal        = {Concurrency, practice and experience},
  volume          = {12},
  number          = {12},
  pages           = {1165--1176},
  month           = oct,
  year            = {2000},
  CODEN           = {CPEXEI},
  DOI             = {https://doi.org/10.1002/1096-9128(200010)12:12<1165::AID-CPE529>3.0.CO;2-L},
  ISSN            = {1040-3108},
  ISSN-L          = {1040-3108},
  URL             = {http://www3.interscience.wiley.com/cgi-bin/abstract/76500354/START;
                     http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500354&PLACEBO=IE.pdf},
  bibdate         = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource       = {http://www.interscience.wiley.com/jpages/1040-3108;
                     http://www.math.utah.edu/pub/tex/bib/cpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  acknowledgement = ack-nhfb,
}

@Article{Kusano:2000:PEO,
  author          = {Kazuhiro Kusano and Shigehisa Satoh and Mitsuhisa
                     Sato},
  title           = {Performance Evaluation of the Omni {OpenMP} Compiler},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1940},
  pages           = {403--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400403.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1940/19400403.pdf},
  bibdate         = {Fri Feb 1 09:17:15 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Landman:2000:PLR,
  author          = {Joseph Landman and Piotr Piecuch},
  title           = {Parallelization of a legacy research program using
                     {OpenMP}},
  journal         = j-FORTRAN-FORUM,
  fjournal        = {ACM Fortran Forum},
  volume          = {19},
  number          = {2},
  pages           = {16--23},
  month           = aug,
  year            = {2000},
  CODEN           = {????},
  ISSN            = {1061-7264 (print), 1931-1311 (electronic)},
  ISSN-L          = {1061-7264},
  bibdate         = {Wed Feb 6 18:50:08 MST 2002},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/fortran-forum.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Laohawee:2000:PDT,
  author          = {P. Laohawee and A. Tangpong and A. Rungsawang},
  title           = {Parallel {DSIR} Text Indexing System: Using Multiple
                     Master\slash Slave Concept},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {297--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080297.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080297.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Lassous:2000:HGA,
  author          = {Isabelle Gu{\'e}rin Lassous and Jens Gustedt and
                     Michel Morvan},
  title           = {Handling Graphs According to a Coarse Grained
                     Approach: Experiments with {PVM} and {MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {72--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080072.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080072.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Livny:2000:MYW,
  author          = {Miron Livny},
  title           = {Managing Your Workforce on a Computational Grid},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {3--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080003.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080003.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Louca:2000:MFP,
  author =       "S. Louca and N. Neophytou and A. Lachanas and P.
                 Evripidou",
  title =        "{MPI-FT}: Portable Fault Tolerance Scheme for {MPI}",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "10",
  number =       "4",
  pages =        "371--??",
  month =        dec,
  year =         "2000",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Wed Jul 25 16:34:42 2001",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/ppl.shtml;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://ejournals.wspc.com.sg/ppl/10/1004/S0129626400000342.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Lusk:2000:IIC,
  author          = {Ewing Lusk},
  title           = {Isolating and Interfacing the Components of a Parallel
                     Computing Environment},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1908},
  pages           = {5--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080005.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080005.pdf},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Mattson:2000:BOF,
  author          = {Tim Mattson},
  title           = {{BOF}: {OpenMP} and its Future Developments},
  crossref        = {ACM:2000:SHP},
  pages           = {106--106},
  year            = {2000},
  bibdate         = {Sat Feb 10 14:28:55 2001},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/supercomputing2000.bib},
  acknowledgement = ack-nhfb,
}

@Article{Mattson:2000:IO,
  author          = {Timothy G. Mattson},
  title           = {An Introduction to {OpenMP 2.0}},
  journal         = j-LECT-NOTES-COMP-SCI,
  fjournal        = {Lecture Notes in Computer Science},
  volume          = {1940},
  pages           = {384--??},
  year            = {2000},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  URL             = {http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400384.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1940/19400384.pdf},
  bibdate         = {Fri Feb 1 09:17:15 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Mazzocca:2000:TPP,
  author          = {N. Mazzocca and M. Rak and U. Villano},
  title           = {The Transition from a {PVM} Program Simulator
                     to a Heterogeneous System Simulator:
                     The {HeSSE} Project},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {266--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080266.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080266.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{McDonald:2000:TPA,
  author =       "Chris McDonald and Kamran Kazemi",
  title =        "Teaching parallel algorithms with process topologies",
  journal =      j-SIGCSE,
  volume =       "32",
  number =       "1",
  pages =        "70--74",
  month =        mar,
  year =         "2000",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/331795.331816",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Mon Nov 19 10:05:03 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigcse2000.bib",
  abstract =     "Parallel algorithms are often introduced to students
                 by describing the geometric topologies formed by
                 communicating processes and often the geographic
                 relationships between them. However, the two most
                 common message passing environments used in teaching,
                 PVM and MPI, each provide only rudimentary support for
                 the specification and execution of process topologies.
                 There is a strong need for better syntactic and
                 semantic support for process topologies in these
                 environments, so that students may concentrate on the
                 algorithms being studied, and not have to wrestle with
                 the environments' infrastructure. This paper first
                 motivates, and then describes the use of additional
                 support within PVM and MPI which addresses this need.",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

@article{Mierendorff:2000:WMB,
  author          = {Hermann Mierendorff and
                     Kl{\"a}re Cassirer and
                     Helmut Schwamborn},
  title           = {Working with {MPI} Benchmarking Suites
                     on {ccNUMA} Architectures},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {18--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080018.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080018.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Migliardi:2000:SFT,
  author          = {Mauro Migliardi and
                     Vaidy Sunderam and
                     Arrigo Frisiani},
  title           = {A Simple, Fault Tolerant Naming Space
                     for the {HARNESS} Metacomputing System},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {152--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080152.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080152.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Mourao:2000:SSC,
  author          = {Elson Mour{\~a}o and Stephen Booth},
  title           = {Single Sided Communications in Multi-protocol {MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {176--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080176.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080176.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Neyman:2000:CDA,
  author          = {Marcin Neyman},
  title           = {Comparison of Different Approaches
                     to Trace {PVM} Program Execution},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {274--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080274.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080274.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@InProceedings{Nikolopoulos:2000:DDN,
  author =       "Dimitrios S. Nikolopoulos and Theodore S.
                 Papatheodorou and Constantine D. Polychronopoulos and
                 Jes{\'u}s Labarta and Eduard Ayguad{\'e}",
  title =        "Is Data Distribution Necessary in {OpenMP}?",
  crossref =     "ACM:2000:SHP",
  pages =        "68--68",
  year =         "2000",
  bibdate =      "Mon Feb 12 11:57:45 2001",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc2000.org/proceedings/techpapr/papers/pap192.pdf",
  acknowledgement = ack-nhfb,
}

@article{Nikolopoulos:2000:LTD,
  author          = {Dimitrios S. Nikolopoulos and
                     Theodore S. Papatheodorou and
                     Constantine D. Polychronopoulos and
                     Jes{\'u}s Labarta and
                     Eduard Ayguad{\'e}},
  title           = {Leveraging Transparent Data Distribution in
                     {OpenMP} via User-Level Dynamic Page Migration},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1940},
  pages           = {415--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 09:17:15 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400415.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1940/19400415.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Nikolopoulos:2000:TRD,
  author          = {Dimitrios S. Nikolopoulos and
                     Theodore S. Papatheodorou and
                     Constantine D. Polychronopoulos and
                     others},
  title           = {A transparent runtime data distribution engine
                     for {OpenMP}},
  journal         = j-SCI-PROG,
  year            = {2000},
  volume          = {8},
  number          = {3},
  pages           = {143--162},
  coden           = {SCIPEV},
  issn            = {1058-9244 (print), 1875-919X (electronic)},
  issn-l          = {1058-9244},
  bibdate         = {Thu Mar 28 08:44:35 MST 2002},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
                     OCLC Article1st database},
  acknowledgement = ack-nhfb,
  fjournal        = {Scientific Programming},
  journal-url     = {http://iospress.metapress.com/content/1058-9244},
}

@article{Nikolopoulos:2000:ULR,
  author          = {Dimitrios S. Nikolopoulos and
                     Theodore S. Papatheodorou and
                     Constantine D. Polychronopoulos and
                     Jes{\'u}s Labarta and
                     Eduard Ayguad{\'e}},
  title           = {{UPM LIB}: a Runtime System for Tuning the
                     Memory Performance of {OpenMP} Programs on
                     Scalable Shared-Memory Multiprocessors},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1915},
  pages           = {85--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Tue Sep 10 19:08:51 MDT 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1915.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1915/19150085.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1915/19150085.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Nishitani:2000:IEO,
  author          = {Yasunori Nishitani and
                     Kiyoshi Negishi and
                     Hiroshi Ohta and
                     Eiji Nunohiro},
  title           = {Implementation and Evaluation of {OpenMP}
                     for {Hitachi SR8000}},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1940},
  pages           = {391--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 09:17:15 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400391.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1940/19400391.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Nitsche:2000:TCM,
  author          = {Thomas Nitsche},
  title           = {Thread Communication over {MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {145--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080145.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080145.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@inproceedings{Ong:2000:PCL,
  author          = {Hong Ong and Paul A. Farrell},
  title           = {Performance Comparison of {LAM\slash MPI},
                     {MPICH}, and {MVICH} on a {Linux} Cluster
                     Connected by a {Gigabit Ethernet} Network},
  crossref        = {USENIX:2000:PAL},
  year            = {2000},
  pages           = {??--??},
  bibdate         = {Wed Oct 16 05:17:16 2002},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.usenix.org/publications/library/proceedings/als2000/ong.html},
  acknowledgement = ack-nhfb,
}

@Article{Orlando:2000:MDT,
  author =       "S. Orlando and P. Palmerini and R. Perego",
  title =        "Mixed data and task parallelism with {HPF} and {PVM}",
  journal =      "Cluster Computing",
  volume =       "3",
  number =       "3",
  publisher =    "Kluwer Academic Publishers, Boston, U.S.A",
  pages =        "201--213",
  year =         "2000",
  CODEN =        "????",
  ISSN =         "1386-7857 (print), 1573-7543 (electronic)",
  ISSN-L =       "1386-7857",
  bibdate =      "Sat Dec 7 09:42:43 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib; Ingenta
                 database",
  acknowledgement = ack-nhfb,
  fjournal =     "Cluster Computing",
  pagecount =    "13",
}

@article{Payrits:2000:UPC,
  author          = {Szabolcs Payrits and
                     Zolt{\'a}n Szatm{\'a}ry and
                     L{\'a}szl{\'o} Zal{\'a}nyi and
                     P{\'e}ter {\'E}rdi},
  title           = {Use of Parallel Computers in Neurocomputing},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {313--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080313.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080313.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Pedroso:2000:MPC,
  author          = {Hern{\^a}ni Pedroso and Jo{\~a}o Gabriel Silva},
  title           = {{MPI-2} Process Creation \& Management
                     Implementation for {NT} Clusters},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {184--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080184.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080184.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@TechReport{Petcu:2000:PDAa,
  author =       "Dana Petcu",
  title =        "{PVMaple}: a Distributed Approach to Cooperative Work
                 of {Maple} Processes",
  type =         "Technical report",
  institution =  "Western University of Timisoara",
  address =      "Timisoara, Romania",
  month =        may,
  year =         "2000",
  bibdate =      "Wed Dec 17 18:08:30 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.risc.uni-linz.ac.at/software/distmaple/index_1.html",
  URL =          "http://www.risc.uni-linz.ac.at/software/distmaple/misc/PVMaple.ps.gz",
  acknowledgement = ack-nhfb,
  keywords =     "Distributed Maple; PVMaple",
}

@article{Petcu:2000:PDAb,
  author          = {Dana Petcu},
  title           = {{PVMaple}: a Distributed Approach to
                     Cooperative Work of {Maple} Processes},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {216--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080216.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080216.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Plazek:2000:SCC,
  author =       "Joanna P{\l}a{\.z}ek and Krzysztof Bana{\'s} and Jacek
                 Kitowski",
  title =        "Scalable {CFD} Computations Using Message-Passing and
                 Distributed Shared Memory Algorithms",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1908",
  pages =        "282--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:30:27 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080282.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1908/19080282.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@article{Protopopov:2000:SMC,
  author          = {Boris V. Protopopov and Anthony Skjellum},
  title           = {Shared-memory communication approaches
                     for an {MPI} message-passing library},
  journal         = j-CPE,
  year            = {2000},
  month           = aug,
  day             = {10},
  volume          = {12},
  number          = {9},
  pages           = {799--820},
  coden           = {CPEXEI},
  doi             = {https://doi.org/10.1002/1096-9128(20000810)12:9<799::AID-CPE476>3.0.CO;2-1},
  issn            = {1040-3108},
  issn-l          = {1040-3108},
  bibdate         = {Sun Oct 29 16:57:07 MST 2000},
  bibsource       = {http://www.interscience.wiley.com/jpages/1040-3108;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  url             = {http://www3.interscience.wiley.com/cgi-bin/abstract/72516482/START;
                     http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=72516482&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency, practice and experience},
}

@article{Quoy:2000:PNN,
  author          = {Mathias Quoy and
                     Sorin Moga and
                     Philippe Gaussier and
                     Arnaud Revel},
  title           = {Parallelization of Neural Networks Using {PVM}},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {289--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080289.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080289.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Rabaea:2000:EPM,
  author          = {Adrian Rabaea and Monica Rabaea},
  title           = {Experiments with Parallel {Monte Carlo}
                     Simulation for Pricing Options Using {PVM}},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {330--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080330.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080330.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Reussner:2000:BMD,
  author          = {Ralf Reussner and
                     Jesper Larsson Tr{\"a}ff and
                     Gunnar Hunzelmann},
  title           = {A Benchmark for {MPI} Derived Datatypes},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {10--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080010.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080010.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@PhdThesis{Rohrl:2000:PPS,
  author =       "Armin R{\"o}hrl",
  title =        "Parallel processing in statistical computation: {BSP},
                 {FPGAs} and {MPI} for the {S}-language",
  type =         "Th{\`e}se sciences",
  school =       "EPF Lausanne",
  address =      "Lausanne, Switzerland",
  pages =        "137",
  year =         "2000",
  bibdate =      "Wed Aug 27 07:24:45 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@inproceedings{Roy:2000:MGQ,
  author          = {Alain J. Roy and
                     Ian Foster and
                     William Gropp and
                     Nicholas Karonis and
                     Volker Sander and
                     Brian Toonen},
  title           = {{MPICH-GQ}: Quality-of-Service for
                     Message Passing Programs},
  crossref        = {ACM:2000:SHP},
  year            = {2000},
  pages           = {54--54},
  bibdate         = {Mon Feb 12 11:57:43 2001},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sc2000.org/proceedings/techpapr/papers/pap234.pdf},
  acknowledgement = ack-nhfb,
}

@article{Scherer:2000:APO,
  author          = {Alex Scherer and
                     Thomas Gross and
                     Willy Zwaenepoel},
  title           = {Adaptive Parallelism for {OpenMP}
                     Task Parallel Programs},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1915},
  pages           = {113--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Tue Sep 10 19:08:51 MDT 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1915.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1915/19150113.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1915/19150113.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Shah:2000:FCS,
  author          = {Sanjiv Shah and
                     Grant Haab and
                     Paul Petersen and
                     Joe Throop},
  title           = {Flexible control structures for parallelism
                     in {OpenMP}},
  journal         = j-CPE,
  year            = {2000},
  month           = oct,
  volume          = {12},
  number          = {12},
  pages           = {1219--1239},
  coden           = {CPEXEI},
  doi             = {https://doi.org/10.1002/1096-9128(200010)12:12<1219::AID-CPE530>3.0.CO;2-0},
  issn            = {1040-3108},
  issn-l          = {1040-3108},
  bibdate         = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource       = {http://www.interscience.wiley.com/jpages/1040-3108;
                     http://www.math.utah.edu/pub/tex/bib/cpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  url             = {http://www3.interscience.wiley.com/cgi-bin/abstract/76500348/START;
                     http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500348&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency, practice and experience},
}

@article{Shyu:2000:APV,
  author          = {Shyong-Jian Shyu and B. M. T. Lin},
  title           = {An application of parallel virtual machine
                     framework to film production problem},
  journal         = j-COMPUT-MATH-APPL,
  year            = {2000},
  month           = jun,
  volume          = {39},
  number          = {12},
  pages           = {53--62},
  coden           = {CMAPDK},
  issn            = {0898-1221 (print), 1873-7668 (electronic)},
  issn-l          = {0898-1221},
  bibdate         = {Wed Mar 1 21:49:06 MST 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/computmathappl2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0898122100001292},
  acknowledgement = ack-nhfb,
  fjournal        = {Computers and Mathematics with Applications},
  journal-url     = {http://www.sciencedirect.com/science/journal/08981221},
}

@article{Silva:2000:HPC,
  author          = {Lu{\'\i}s Moura Silva and
                     Paulo Martins and
                     Jo{\~a}o Gabriel Silva},
  title           = {Heterogeneous parallel computing using
                     {Java} and {WMPI}},
  journal         = j-CPE,
  year            = {2000},
  month           = sep,
  volume          = {12},
  number          = {11},
  pages           = {1077--1091},
  coden           = {CPEXEI},
  doi             = {https://doi.org/10.1002/1096-9128(200009)12:11<1077::AID-CPE521>3.0.CO;2-#},
  issn            = {1040-3108},
  issn-l          = {1040-3108},
  bibdate         = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource       = {http://www.interscience.wiley.com/jpages/1040-3108;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  url             = {http://www3.interscience.wiley.com/cgi-bin/abstract/76000189/START;
                     http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76000189&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency, practice and experience},
}

@article{Smith:2000:DPM,
  author          = {Lorna Smith and Paul Kent},
  title           = {Development and performance of a mixed
                     {OpenMP\slash MPI} quantum {Monte Carlo} code},
  journal         = j-CPE,
  year            = {2000},
  month           = oct,
  volume          = {12},
  number          = {12},
  pages           = {1121--1129},
  coden           = {CPEXEI},
  doi             = {https://doi.org/10.1002/1096-9128(200010)12:12<1121::AID-CPE531>3.0.CO;2-N},
  issn            = {1040-3108},
  issn-l          = {1040-3108},
  bibdate         = {Sat Apr 7 06:56:10 MDT 2001},
  bibsource       = {http://www.interscience.wiley.com/jpages/1040-3108;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  url             = {http://www3.interscience.wiley.com/cgi-bin/abstract/76500350/START;
                     http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500350&PLACEBO=IE.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency, practice and experience},
}

@article{Solsona:2000:MCM,
  author          = {Francesc Solsona and
                     Francesc Gin{\'e} and
                     Josep L{\'e}rida and
                     Porfidio Hern{\'a}ndez and
                     Emilio Luque},
  title           = {{Monito}: a Communication Monitoring Tool
                     for a {PVM--Linux} Environment},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {233--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080233.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080233.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Sosa:2000:IQC,
  author =       "C. P. Sosa and G. Scalmani and R. Gomperts and M. J.
                 Frisch",
  title =        "Ab initio quantum chemistry on a {ccNUMA} architecture
                 using {OpenMP}. {III}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "26",
  number =       "7--8",
  pages =        "843--856",
  month =        jul,
  year =         "2000",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Oct 28 17:44:32 MDT 2000",
  bibsource =    "http://www.elsevier.com/locate/issn/01678191;
                 http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.nl/gej-ng/10/35/21/42/29/25/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/35/21/42/29/25/article.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@article{Sterling:2000:SCB,
  author          = {Thomas Sterling},
  title           = {Symbolic Computing with {Beowulf}-Class
                     {PC} Clusters},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {7--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080007.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080007.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Suppi:2000:IOP,
  author          = {Remo Suppi and
                     Fernando Cores and
                     Emilio Luque},
  title           = {Improving Optimistic {PDES} in {PVM} Environments},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2000},
  volume          = {1908},
  pages           = {304--??},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:30:27 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080304.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/1908/19080304.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Tanaka:2000:PEO,
  author =       "Yoshizumi Tanaka and Kenjiro Taura and Mitsuhisa Sato
                 and Akinori Yonezawa",
  title =        "Performance Evaluation of {OpenMP} Applications with
                 Nested Parallelism",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1915",
  pages =        "100--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:08:51 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1915.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1915/19150100.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1915/19150100.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Tang:2000:PTR,
  author =       "Hong Tang and Kai Shen and Tao Yang",
  title =        "Program transformation and runtime support for
                 threaded {MPI} execution on shared-memory machines",
  journal =      j-TOPLAS,
  volume =       "22",
  number =       "4",
  pages =        "673--700",
  year =         "2000",
  CODEN =        "ATPSDT",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Apr 17 10:05:24 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/toplas/2000-22-4/p673-tang/",
  abstract =     "Parallel programs written in MPI have been widely used
                 for developing high-performance applications on various
                 platforms. Because of a restriction of the MPI
                 computation model, conventional MPI implementations on
                 shared-memory machines map each MPI node to an OS
                 process, which can suffer serious performance
                 degradation in the presence of multiprogramming. This
                 paper studies compile-time and runtime techniques for
                 enhancing performance portability of MPI code running
                 on multiprogrammed shared-memory machines. The proposed
                 techniques allow MPI nodes to be executed safely and
                 efficiently as threads. Compile-time transformation
                 eliminates global and static variables in C code using
                 node-specific data. The runtime support includes an
                 efficient and provably correct communication protocol
                 that uses lock-free data structure and takes advantage
                 of address space sharing among threads. The experiments
                 on SGI Origin 2000 show that our MPI prototype called
                 TMPI using the proposed techniques is competitive with
                 SGI's native MPI implementation in a dedicated
                 environment, and that it has significant performance
                 advantages in a multiprogrammed environment.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  generalterms = "Algorithms; Design; Experimentation; Languages;
                 Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "lock-free synchronization; MPI; multiprogrammed
                 environments; program transformation; shared-memory
                 machines; threaded execution",
  subject =      "Hardware --- Memory Structures --- Design Styles
                 (B.3.2): {\bf Shared memory}; Software --- Programming
                 Techniques --- Concurrent Programming (D.1.3): {\bf
                 Parallel programming}; Software --- Programming
                 Languages --- Language Classifications (D.3.2): {\bf
                 Concurrent, distributed, and parallel languages};
                 Software --- Programming Languages --- Processors
                 (D.3.4): {\bf Preprocessors}; Software --- Programming
                 Languages --- Processors (D.3.4): {\bf Run-time
                 environments}; Software --- Operating Systems ---
                 Process Management (D.4.1): {\bf
                 Multiprocessing/multiprogramming/multitasking}; Data
                 --- Data Structures (E.1): {\bf Lists, stacks, and
                 queues}",
}

@Article{Tatebe:2000:IOO,
  author =       "Osamu Tatebe and Mitsuhisa Sato and Satoshi
                 Sekiguchi",
  title =        "Impact of {OpenMP} Optimizations for the {MGCG}
                 Method",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1940",
  pages =        "471--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:17:15 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400471.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1940/19400471.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Tavora:2000:DCM,
  author =       "V{\'\i}tor N. T{\'a}vora and Lu{\'\i}s M. Silva and
                 Jo{\~a}o Gabriel Silva",
  title =        "Distributed Checkpointing Mechanism for a Parallel
                 File System",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1908",
  pages =        "137--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:30:27 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080137.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1908/19080137.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Thiruvathukal:2000:JNW,
  author =       "George K. Thiruvathukal and Phillip M. Dickens and
                 Shahzad Bhatti",
  title =        "{Java} on networks of workstations {(JavaNOW)}: a
                 parallel computing framework inspired by {Linda} and
                 the {Message Passing Interface (MPI)}",
  journal =      j-CPE,
  volume =       "12",
  number =       "11",
  pages =        "1093--1116",
  month =        sep,
  year =         "2000",
  CODEN =        "CPEXEI",
  DOI =          "https://doi.org/10.1002/1096-9128(200009)12:11<1093::AID-CPE522>3.0.CO;2-6",
  ISSN =         "1040-3108",
  ISSN-L =       "1040-3108",
  bibdate =      "Sat Apr 7 06:56:10 MDT 2001",
  bibsource =    "http://www.interscience.wiley.com/jpages/1040-3108;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/76000187/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76000187&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency, practice and experience",
}

@Article{Tourancheau:2000:HSN,
  author =       "Bernard Tourancheau",
  title =        "High Speed Networks for Clusters, the {BIP-Myrinet}
                 Experience",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1908",
  pages =        "9--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:30:27 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080009.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1908/19080009.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Traff:2000:IMO,
  author =       "Jesper Larsson Tr{\"a}ff and Hubert Ritzdorf and Rolf
                 Hempel",
  title =        "The Implementation of {MPI-2} One-Sided Communication
                 for the {NEC SX-5}",
  crossref =     "ACM:2000:SHP",
  pages =        "45--46",
  year =         "2000",
  bibdate =      "Mon Feb 12 11:57:32 2001",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc2000.org/proceedings/techpapr/papers/pap181.pdf",
  acknowledgement = ack-nhfb,
}

@Article{Tran:2000:PPM,
  author =       "Viet D. Tran and Ladislav Hluchy and Giang T. Nguyen",
  title =        "Parallel Program Model for Distributed Systems",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1908",
  pages =        "250--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:30:27 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080250.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1908/19080250.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{VanVoorst:2000:CMI,
  author =       "Brian {Van Voorst} and Steven Seidel",
  title =        "Comparison of {MPI} Implementations on a Shared Memory
                 Machine",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1800",
  pages =        "847--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:16:18 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000847.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1800/18000847.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Varin:2000:PAL,
  author =       "E. Varin and R. Roy and G. Samba",
  title =        "Parallel Algorithms for the Least-Squares Finite
                 Element Solution of the Neutron Transport Equation",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1908",
  pages =        "121--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:30:27 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1908.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1908/19080121.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1908/19080121.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Vetter:2000:DST,
  author =       "Jeffrey S. Vetter and Bronis R. de Supinski",
  title =        "Dynamic Software Testing of {MPI} Applications with
                 {Umpire}",
  crossref =     "ACM:2000:SHP",
  pages =        "70--70",
  year =         "2000",
  bibdate =      "Mon Feb 12 11:57:45 2001",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc2000.org/proceedings/techpapr/papers/pap208.pdf",
  acknowledgement = ack-nhfb,
}

@TechReport{VidalMacia:2000:IPM,
  author =       "Antonio {Vidal Maci{\'a}} and Jos{\'e} Luis {P{\'e}rez
                 G{\'o}mez}",
  title =        "Introducci{\'o}n a la programaci{\'o}n en {MPI}.
                 ({Spanish}) [{Introduction} to programming in {MPI}]",
  type =         "Technical report",
  number =       "{SPUPV-2000.209}",
  institution =  "Departamento de Sistemas Inform{\'a}ticos y
                 Computaci{\'o}n, Facultad de Inform{\'a}tica,
                 Universidad Polit{\'e}cnica de Valencia, Servicio de
                 Publicaciones",
  address =      "Valencia, Spain",
  pages =        "78",
  year =         "2000",
  bibdate =      "Wed Aug 27 06:35:39 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  language =     "Spanish",
}

@Article{Wallcraft:2000:SOV,
  author =       "Alan J. Wallcraft",
  title =        "{SPMD} {OpenMP} versus {MPI} for ocean models",
  journal =      j-CPE,
  volume =       "12",
  number =       "12",
  pages =        "1155--1164",
  month =        oct,
  year =         "2000",
  CODEN =        "CPEXEI",
  DOI =          "https://doi.org/10.1002/1096-9128(200010)12:12<1155::AID-CPE532>3.0.CO;2-5",
  ISSN =         "1040-3108",
  ISSN-L =       "1040-3108",
  bibdate =      "Sat Apr 7 06:56:10 MDT 2001",
  bibsource =    "http://www.interscience.wiley.com/jpages/1040-3108;
                 http://www.math.utah.edu/pub/tex/bib/cpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/76500353/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76500353&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency, practice and experience",
}

@Article{Addison:2001:EOP,
  author =       "Cliff Addison",
  title =        "Exploiting {OpenMP} to Provide Scalable {SMP BLAS} and
                 {LAPACK} Routines",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2073",
  pages =        "3--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:04:28 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2073.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2073/20730003.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2073/20730003.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Al-Tawil:2001:PME,
  author =       "Khalid Al-Tawil and Csaba Andras Moritz",
  title =        "Performance Modeling and Evaluation of {MPI}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "61",
  number =       "2",
  pages =        "202--223",
  day =          "1",
  month =        feb,
  year =         "2001",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.2000.1677",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Feb 22 15:30:36 MST 2002",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1677;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1677/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1677/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{AlHaddad:2001:UNW,
  author =       "Mohammed {Al Haddad} and Jerome Robinson",
  title =        "Using a Network of Workstations to Enhance Database
                 Query Processing Performance",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "352--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310352.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310352.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Allsopp:2001:EUM,
  author =       "Nicholas K. Allsopp and John F. Hague and Jean-Pierre
                 Prost",
  title =        "Experiences in Using {MPI--IO} on Top of {GPFS} for
                 the {IFS} Weather Forecast Code",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2150",
  pages =        "380--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:53 MST 2002",
  bibsource =    "file://sunset.math.utah.edu/a/suncore0/export/home/0073/sy/beebe/tex/bib/lncs2001c.bib;
                 http://link.springer-ny.com/link/service/series/0558/tocs/t2150.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2150/21500380.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2150/21500380.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Angskun:2001:DPM,
  author =       "Thara Angskun and Putchong Uthayopas and Arnon
                 Rungsawang",
  title =        "Dynamic Process Management in {KSIX} Cluster
                 Middleware",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "209--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310209.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310209.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Anonymous:2001:AAL,
  author =       "Anonymous",
  title =        "Appendixes: Appendix {A}: {Linux}, {Windows NT},
                 {AIX}, {Solaris}; Appendix {B}: Compilers and
                 Preprocessors, {MPI} Implementations, Development
                 Environments, Debuggers, Performance Analyzers",
  journal =      j-IJHPCA,
  volume =       "15",
  number =       "2",
  pages =        "191--194",
  month =        "Summer",
  year =         "2001",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/109434200101500213",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue May 01 05:27:17 2001",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://journals.sagepub.com/doi/pdf/10.1177/109434200101500213",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
  xxmonth =      may,
}

@Article{Anonymous:2001:EDP,
  author =       "Anonymous",
  title =        "Erratum: Design and Prototype of a Performance Tool
                 Interface for {OpenMP}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "23",
  number =       "1",
  pages =        "105--128",
  month =        may,
  year =         "2001",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1023/A:1015741304337",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 6 12:13:23 MDT 2005",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=23&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=23&issue=1&spage=105",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Baiardi:2001:CRD,
  author =       "Fabrizio Baiardi and Paolo Mori and Laura Ricci",
  title =        "Collecting Remote Data in Irregular Problems with
                 Hierarchical Representation of the Domain",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "304--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310304.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310304.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Banikazemi:2001:MLE,
  author =       "Mohammad Banikazemi and Rama K. Govindaraju and Robert
                 Blackmore and Dhabaleswar K. Panda",
  title =        "{MPI-LAPI}: An Efficient Implementation of {MPI} for
                 {IBM RS\slash 6000 SP} Systems",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "12",
  number =       "10",
  pages =        "1081--1093",
  month =        oct,
  year =         "2001",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/71.963419",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Sat Feb 23 09:26:03 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://dlib.computer.org/td/books/td2001/pdf/l1081.pdf;
                 http://www.computer.org/tpds/td2001/l1081abs.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Baptista:2001:IOS,
  author =       "Tiago Baptista and Hernani Pedroso and Jo{\~a}o
                 Gabriel Silva",
  title =        "The Implementation of One-Sided Communications for
                 {WMPI II}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "61--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310061.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310061.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bencheva:2001:MPI,
  author =       "G. Bencheva",
  title =        "{MPI} Parallel Implementation of a Fast Separable
                 Solver",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2179",
  pages =        "454--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:06:22 MST 2002",
  bibsource =    "file://sunset.math.utah.edu/a/suncore0/export/home/0073/sy/beebe/tex/bib/lncs2001c.bib;
                 http://link.springer-ny.com/link/service/series/0558/tocs/t2179.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2179/21790454.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2179/21790454.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Berthou:2001:COH,
  author =       "Jean-Yves Berthou and Eric Fayolle",
  title =        "Comparing {OpenMP}, {HPF}, and {MPI} Programming: a
                 Study Case",
  journal =      j-IJHPCA,
  volume =       "15",
  number =       "3",
  pages =        "297--309",
  month =        "Fall",
  year =         "2001",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Mon Nov 05 16:09:36 2001",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Bhandarkar:2001:ALB,
  author =       "Milind Bhandarkar and L. V. Kal{\'e} and Eric de
                 Sturler and Jay Hoeflinger",
  title =        "Adaptive Load Balancing for {MPI} Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2074",
  pages =        "108--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:04:30 MST 2002",
  bibsource =    "file://sunset.math.utah.edu/a/suncore0/export/home/0073/sy/beebe/tex/bib/lncs2001b.bib;
                 http://link.springer-ny.com/link/service/series/0558/tocs/t2074.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2074/20740108.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2074/20740108.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{biewski:2001:MOS,
  author =       "Maciej Go{\l}{\k{e}}biewski and Jesper Larsson Tr{\"a}ff",
  title =        "{MPI-2} One-Sided Communications on a {Giganet SMP}
                 Cluster",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "16--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310016.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310016.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Blikberg:2001:NPA,
  author =       "Ragnhild Blikberg and Tor S{\o}revik",
  title =        "Nested parallelism: Allocation of threads to tasks and
                 {OpenMP} implementation",
  journal =      j-SCI-PROG,
  volume =       "9",
  number =       "2--3",
  pages =        "185--194",
  month =        "Spring--Summer",
  year =         "2001",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Thu Mar 28 12:27:27 MST 2002",
  bibsource =    "Compendex database;
                 http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
                 OCLC Article1st database",
  URL =          "http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=7pab6qgbaf8vxg991rwy%26referrer=parent%26backto=issue%2C11%2C11%3Bjournal%2C1%2C9%3Blinkingpublicationresults%2C1%2C1",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Booth:2001:OML,
  author =       "Stephen Booth",
  title =        "Optimising the {MPI} Library for the {T3E}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2150",
  pages =        "80--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:53 MST 2002",
  bibsource =    "file://sunset.math.utah.edu/a/suncore0/export/home/0073/sy/beebe/tex/bib/lncs2001c.bib;
                 http://link.springer-ny.com/link/service/series/0558/tocs/t2150.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2150/21500080.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2150/21500080.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bova:2001:PPM,
  author =       "Steve W. Bova and Clay P. Breshears and Henry Gabb and
                 Bob Kuhn and Bill Magro and Rudolf Eigenmann and Greg
                 Gaertner and Stefano Salvini and Howard Scott",
  title =        "Parallel Programming with Message Passing and
                 Directives",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "3",
  number =       "5",
  pages =        "22--37",
  month =        sep # "\slash " # oct,
  year =         "2001",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/5992.947105",
  ISSN =         "1521-9615 (print), 1558-366X (electronic)",
  ISSN-L =       "1521-9615",
  bibdate =      "Sat Feb 23 06:37:33 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://computer.org/cise/cs2001/c5022abs.htm;
                 http://dlib.computer.org/cs/books/cs2001/pdf/c5022.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

@Article{Brunst:2001:POL,
  author =       "Holger Brunst and Hans-Christian Hoppe and Wolfgang E.
                 Nagel and Manuela Winkler",
  title =        "Performance Optimization for Large Scale Computing:
                 The Scalable {VAMPIR} Approach",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2074",
  pages =        "751--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:04:30 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2074.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2074/20740751.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2074/20740751.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%%% Author-name repair: the first author is Libor Bus (final s with a
%%% hacek accent); the accented letter had been dropped, leaving the
%%% surname truncated to `Bu'.  The citation label is deliberately kept
%%% as Bu:2001:PAC so that existing citations continue to resolve.
@Article{Bu:2001:PAC,
  author =       "Libor Bu{\v{s}} and Pavel Tvrd{\'\i}k",
  title =        "A Parallel Algorithm for Connected Components on
                 Distributed Memory Machines",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "280--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310280.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310280.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%%% Author-name repair: the third author is Bartosz Balis (final s with
%%% an acute accent); the accented letter had been dropped, leaving the
%%% surname truncated to `Bali'.
@Article{Bubak:2001:PMS,
  author =       "Marian Bubak and W{\l}odzimierz Funika and Bartosz
                 Bali{\'s} and Roland Wism{\"u}ller",
  title =        "Performance Measurement Support for {MPI} Applications
                 with {PATOP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1947",
  pages =        "288--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:02:51 MST 2002",
  bibsource =    "file://sunset.math.utah.edu/a/suncore0/export/home/0073/sy/beebe/tex/bib/lncs2001a.bib;
                 http://link.springer-ny.com/link/service/series/0558/tocs/t1947.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1947/19470288.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1947/19470288.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@article{Bull:2001:MSO,
  author          = {J. Mark Bull and Darragh O'Neill},
  title           = {A microbenchmark suite for {OpenMP 2.0}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {29},
  number          = {5},
  pages           = {41--48},
  month           = dec,
  year            = {2001},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J89},
}

@article{Cappello:2001:UPS,
  author          = {Franck Cappello and Olivier Richard and
                     Daniel Etiemble},
  title           = {Understanding performance of {SMP} clusters
                     running {MPI} programs},
  journal         = j-FUT-GEN-COMP-SYS,
  volume          = {17},
  number          = {6},
  pages           = {711--720},
  month           = apr,
  year            = {2001},
  coden           = {FGSEVI},
  issn            = {0167-739X (print), 1872-7115 (electronic)},
  issn-l          = {0167-739X},
  bibdate         = {Wed Feb 27 12:41:21 MST 2002},
  bibsource       = {http://www.elsevier.com/locate/issn/0167739X;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.elsevier.com/gej-ng/10/19/19/45/33/30/abstract.html},
  acknowledgement = ack-nhfb,
  fjournal        = {Future Generation Computer Systems},
  journal-url     = {http://www.sciencedirect.com/science/journal/0167739X},
}

@article{Caubet:2001:DTM,
  author          = {Jordi Caubet and Judit Gimenez and
                     Jesus Labarta and Luiz DeRose},
  title           = {A Dynamic Tracing Mechanism for Performance
                     Analysis of {OpenMP} Applications},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2104},
  pages           = {53--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Sat Feb 2 13:05:04 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040053.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2104/21040053.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@book{Chandra:2001:PPO,
  author          = {Rohit Chandra and Leonardo Dagum and David Kohr
                     and Dror Maydan and Jeff McDonald and
                     Ramesh Menon},
  title           = {Parallel Programming in {OpenMP}},
  publisher       = pub-MORGAN-KAUFMANN,
  address         = pub-MORGAN-KAUFMANN:adr,
  pages           = {xvi + 230},
  year            = {2001},
  isbn            = {1-55860-671-8},
  isbn-13         = {978-1-55860-671-5},
  lccn            = {QA76.642 .P38 2001},
  bibdate         = {Thu Jul 14 11:09:17 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/unix.bib},
  price           = {US\$39.95},
  url             = {http://www.mkp.com/books_catalog/catalog.asp?ISBN=1-55860-671-8},
  abstract        = {The rapid and widespread acceptance of shared memory
                     multiprocessor architectures has created a pressing
                     demand for an efficient way to program these systems.
                     At the same time, developers of technical and
                     scientific applications in industry and in government
                     laboratories find they need to parallelize huge volumes
                     of code in a portable fashion. OpenMP, developed
                     jointly by several parallel computing vendors to
                     address these issues, is an industry-wide standard for
                     programming shared-memory and distributed shared-memory
                     multiprocessors. It consists of a set of compiler
                     directives and library routines that extend FORTRAN, C,
                     and C++ codes to express shared-memory parallelism.
                     Parallel Programming in OpenMP is the first book to
                     teach both the novice and expert parallel programmers
                     how to program using this new standard. The authors,
                     who helped design and implement OpenMP while at SGI,
                     bring a depth and breadth to the book as compiler
                     writers, application developers, and performance
                     engineers.},
  acknowledgement = ack-nhfb,
  keywords        = {parallel programming (computer science)},
  tableofcontents = {Foreword \\
                     Preface \\
                     1: Introduction \\
                     Performance with OpenMP \\
                     A first glimpse of OpenMP \\
                     The OpenMP parallel computer \\
                     Why OpenMP \\
                     History of OpenMP \\
                     Navigating the rest of the book \\
                     2: Getting started with OpenMP \\
                     3: Exploiting loop-level parallelism \\
                     Meaning of the parallel do directive \\
                     Controlling data sharing \\
                     Removing data dependences \\
                     Enhancing performance \\
                     4: Beyond loop-level parallelism, parallel regions \\
                     5: Synchronization \\
                     6: Performance},
}

@article{Chapman:2001:PDE,
  author          = {B. Chapman and O. Hernandez and A. Patil and
                     A. Prabhakar},
  title           = {Program Development Environment for {OpenMP}
                     Programs on {ccNUMA} Architectures},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2179},
  pages           = {210--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Sat Feb 2 13:06:22 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2179.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2001c.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2179/21790210.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2179/21790210.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Chen:2001:FFT,
  author          = {Qun Chen and Michael C. Ferris},
  title           = {{FATCOP}: a Fault Tolerant {Condor--PVM} Mixed
                     Integer Programming Solver},
  journal         = j-SIAM-J-OPT,
  volume          = {11},
  number          = {4},
  pages           = {1019--1036},
  month           = mar # "\slash " # may,
  year            = {2001},
  coden           = {SJOPE8},
  doi             = {https://doi.org/10.1137/S1052623499353911},
  issn            = {1052-6234 (print), 1095-7189 (electronic)},
  issn-l          = {1052-6234},
  mrclass         = {90C11 (65K05)},
  mrnumber        = {MR1855219 (2002f:90068)},
  bibdate         = {Sat Oct 4 12:16:05 MDT 2003},
  bibsource       = {http://epubs.siam.org/sam-bin/dbq/toc/SIOPT/11/4;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     MathSciNet database},
  url             = {http://epubs.siam.org/sam-bin/dbq/article/35391},
  acknowledgement = ack-nhfb,
  fjournal        = {SIAM Journal on Optimization},
  journal-url     = {http://epubs.siam.org/siopt},
}

@article{Chen:2001:TMK,
  author          = {Yu Chen and Qian Fang and Zhihui Du and Sanli Li},
  title           = {{TH-MPI}: {OS} Kernel Integrated Fault
                     Tolerant {MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {75--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310075.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310075.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Czarnul:2001:DPD,
  author          = {Pawel Czarnul and Karen Tomko and Henryk Krawczyk},
  title           = {Dynamic Partitioning of the Divide-and-Conquer
                     Scheme with Migration in {PVM} Environment},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {174--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310174.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310174.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Darema:2001:SMP,
  author          = {Frederica Darema},
  title           = {The {SPMD} Model: Past, Present and Future},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {1--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310001.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310001.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Dehne:2001:CPD,
  author          = {Frank Dehne and Todd Eavis and Andrew Rau-Chaplin},
  title           = {Computing Partial Data Cubes for Parallel
                     Data Warehousing Applications},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {319--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310319.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310319.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Demaine:2001:GCM,
  author          = {E. D. Demaine and I. Foster and C. Kesselman and
                     M. Snir},
  title           = {Generalized Communicators in the Message
                     Passing Interface},
  journal         = j-IEEE-TRANS-PAR-DIST-SYS,
  volume          = {12},
  number          = {6},
  pages           = {610--616},
  month           = jun,
  year            = {2001},
  coden           = {ITDSEO},
  doi             = {https://doi.org/10.1109/71.932714},
  issn            = {1045-9219 (print), 1558-2183 (electronic)},
  issn-l          = {1045-9219},
  bibdate         = {Fri Jul 20 11:51:59 MDT 2001},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://dlib.computer.org/td/books/td2001/pdf/l0610.pdf;
                     http://www.computer.org/tpds/td2001/l0610abs.htm},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Transactions on Parallel and Distributed
                     Systems},
  journal-url     = {http://www.computer.org/tpds/archives.htm},
}

@article{Denis:2001:THP,
  author          = {Alexandre Denis and Christian P{\'e}rez and
                     Thierry Priol},
  title           = {Towards High Performance {CORBA} and {MPI}
                     Middlewares for Grid Computing},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2242},
  pages           = {14--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Tue Sep 10 19:09:01 MDT 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2242.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2242/22420014.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2242/22420014.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{DiMartino:2001:WDS,
  author          = {Beniamino {Di Martino} and Sergio Briguglio and
                     Gregorio Vlad and Giuliana Fogaccia},
  title           = {Workload decomposition strategies for shared
                     memory parallel systems with {OpenMP}},
  journal         = j-SCI-PROG,
  volume          = {9},
  number          = {2--3},
  pages           = {109--122},
  month           = {Spring--Summer},
  year            = {2001},
  coden           = {SCIPEV},
  issn            = {1058-9244 (print), 1875-919X (electronic)},
  issn-l          = {1058-9244},
  bibdate         = {Thu Mar 28 12:27:27 MST 2002},
  bibsource       = {Compendex database;
                     http://www.iospress.nl/site/html/10589244.html;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
                     OCLC Article1st database},
  url             = {http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=7pab6qgbaf8vxg991rwy%26referrer=parent%26backto=issue%2C5%2C11%3Bjournal%2C1%2C9%3Blinkingpublicationresults%2C1%2C1},
  acknowledgement = ack-nhfb,
  fjournal        = {Scientific Programming},
  journal-url     = {http://iospress.metapress.com/content/1058-9244},
}

@article{Elwasif:2001:AMT,
  author          = {Wael R. Elwasif and David E. Bernholdt and
                     James A. Kohl and G. A. Geist},
  title           = {An Architecture for a Multi-threaded Harness Kernel},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {126--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310126.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310126.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Fagg:2001:FTM,
  author          = {Graham E. Fagg and Antonin Bukovsky and
                     Jack J. Dongarra},
  title           = {Fault Tolerant {MPI} for the {HARNESS}
                     Meta-computing System},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2073},
  pages           = {355--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Sat Feb 2 13:04:28 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2073.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2073/20730355.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2073/20730355.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Fagg:2001:HFT,
  author          = {Graham E. Fagg and Antonin Bukovsky and
                     Jack J. Dongarra},
  title           = {{HARNESS} and fault tolerant {MPI}},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {27},
  number          = {11},
  pages           = {1479--1495},
  month           = oct,
  year            = {2001},
  coden           = {PACOEJ},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  bibdate         = {Fri Feb 22 16:52:42 MST 2002},
  bibsource       = {http://www.elsevier.com/locate/issn/01678191;
                     http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.elsevier.com/gej-ng/10/35/21/47/41/32/abstract.html;
                     http://www.elsevier.nl/gej-ng/10/35/21/47/41/32/article.pdf;
                     http://www.netlib.org/utk/people/JackDongarra/PAPERS/harness-ftmpi-pc.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Fagg:2001:PIS,
  author          = {Graham E. Fagg and Edgar Gabriel and
                     Michael Resch and Jack J. Dongarra},
  title           = {Parallel {IO} Support for Meta-computing
                     Applications: {MPI\_Connect IO} Applied to
                     {PACX--MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {135--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310135.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310135.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Ferschweiler:2001:CDP,
  author          = {Ken Ferschweiler and Mariacarla Calzarossa and
                     Cherri Pancake and Daniele Tessera and Dylan Keon},
  title           = {A Community Databank for Performance Tracefiles},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {233--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310233.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310233.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Field:2001:RTF,
  author          = {Antony J. Field and Thomas L. Hansen and
                     Paul H. J. Kelly},
  title           = {Run-Time Fusion of {MPI} Calls in a Parallel
                     {C++} Library},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2017},
  pages           = {363--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Sat Feb 2 13:03:35 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2017.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2017/20170363.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2017/20170363.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Fischer:2001:DNM,
  author          = {Markus Fischer and Peter Kemper},
  title           = {Distributed Numerical {Markov} Chain Analysis},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {272--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310272.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310272.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Fischer:2001:SAN,
  author          = {Markus Fischer},
  title           = {System Area Network Extensions to the Parallel
                     Virtual Machine},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {98--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310098.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310098.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Friedel:2001:HMC,
  author          = {Peter Friedel and J{\"o}rg Bergmann and
                     Stephan Seidl and Wolfgang E. Nagel},
  title           = {An Hierarchical {MPI} Communication Model for
                     the Parallelized Solution of Multiple Integrals},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2110},
  pages           = {474--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Sat Feb 2 13:05:11 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2110.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2110/21100474.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2110/21100474.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Gaito:2001:ADC,
  author          = {A. Gaito and M. Rak and U. Villano},
  title           = {Adding Dynamic Coscheduling Support to {PVM}},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {106--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310106.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310106.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Gallud:2001:EDF,
  author          = {J. A. Gallud and J. Garc{\'\i}a-Consuegra and
                     J. M. Garc{\'\i}a and L. Orozco},
  title           = {Evaluating the {DIPORSI} Framework: Distributed
                     Processing of Remotely Sensed Imagery},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {401--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310401.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310401.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Geist:2001:BFN,
  author          = {G. Al Geist},
  title           = {Building a Foundation for the Next {PVM}:
                     {Petascale Virtual Machines}},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {2--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310002.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310002.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Gerlach:2001:IOJ,
  author          = {Jens Gerlach and Zheng-Yu Jiang and Hans-Werner Pohl},
  title           = {Integrating {OpenMP} into {Janus}},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2104},
  pages           = {101--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Sat Feb 2 13:05:04 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040101.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2104/21040101.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Gine:2001:MMM,
  author          = {Francesc Gin{\'e} and Francesc Solsona and
                     Xavi Navarro and Porfidio Hern{\'a}ndez and
                     Emilio Luque},
  title           = {{MemTo}: a Memory Monitoring Tool for a
                     {Linux} Cluster},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {225--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310225.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310225.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

%%% Author-name repair: the first author's surname contains an e with an
%%% ogonek after the barred l; that letter had been dropped.  The ogonek
%%% accent macro \k needs an output font encoding that supplies it (T1).
%%% The citation label is deliberately kept as Golbiewski:2001:MOS so
%%% that existing citations continue to resolve.
@Article{Golbiewski:2001:MOS,
  author =       "Maciej Go{\l}{\k{e}}biewski and Jesper Larsson
                 Tr{\"a}ff",
  title =        "{MPI-2} One-Sided Communications on a {Giganet SMP}
                 Cluster",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "16--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310016.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310016.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@article{Gonzalez:2001:DSP,
  author          = {M. Gonzalez and E. Ayguad{\'e} and X. Martorell
                     and J. Labarta},
  title           = {Defining and Supporting Pipelined Executions
                     in {OpenMP}},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2104},
  pages           = {155--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Sat Feb 2 13:05:04 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                     http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040155.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2104/21040155.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@article{Gonzalez:2001:MIM,
  author          = {J. A. Gonz{\'a}lez and C. Le{\'o}n and
                     C. Rodr{\'\i}guez and F. Sande},
  title           = {A Model to Integrate Message Passing and
                     Shared Memory Programming},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {2131},
  pages           = {114--??},
  year            = {2001},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l          = {0302-9743},
  bibdate         = {Fri Feb 1 08:13:55 MST 2002},
  bibsource       = {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310114.htm;
                     http://link.springer-ny.com/link/service/series/0558/papers/2131/21310114.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Gonzalez:2001:OET,
  author =       "Marc Gonzalez and Jose Oliver and Xavier Martorell and
                 Eduard Ayguad{\'e} and Jesus Labarta and Nacho Navarro",
  title =        "{OpenMP} Extensions for Thread Groups and Their
                 Run-Time Support",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2017",
  pages =        "324--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:03:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2017.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2017/20170324.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2017/20170324.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Gorzig:2001:CCP,
  author =       "Steffen G{\"o}rzig",
  title =        "{CPPvm} --- {C++} and {PVM}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "83--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310083.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310083.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Gropp:2001:CSA,
  author =       "William D. Gropp",
  title =        "Challenges and Successes in Achieving the Potential of
                 {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "7--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310007.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310007.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Gropp:2001:LSM,
  author =       "William D. Gropp",
  title =        "Learning from the Success of {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2228",
  pages =        "81--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:07:14 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2228.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2228/22280081.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2228/22280081.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Hoeflinger:2001:IPV,
  author =       "Jay Hoeflinger and Bob Kuhn and Wolfgang Nagel and
                 Paul Petersen and Hrabri Rajic and Sanjiv Shah and Jeff
                 Vetter and Michael Voss and Renee Woo",
  title =        "An Integrated Performance Visualizer for {MPI\slash
                 OpenMP} Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2104",
  pages =        "40--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:04 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040040.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2104/21040040.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Hoeflinger:2001:PSP,
  author =       "Jay Hoeflinger and Prasad Alavilli and Thomas Jackson
                 and Bob Kuhn",
  title =        "Producing scalable performance with {OpenMP}:
                 {Experiments} with two {CFD} applications",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "27",
  number =       "4",
  pages =        "391--413",
  month =        mar,
  year =         "2001",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed Jul 18 06:31:15 MDT 2001",
  bibsource =    "http://www.elsevier.com/locate/issn/01678191;
                 http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.nl/gej-ng/10/35/21/47/28/26/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/35/21/47/28/26/article.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Hu:2001:PCC,
  author =       "Hong Hu and Edward L. Turner",
  title =        "Parallel {CFD} Computing Using Shared Memory
                 {OpenMP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2073",
  pages =        "1137--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:04:28 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2073.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2073/20731137.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2073/20731137.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Huband:2001:DTB,
  author =       "Simon Huband and Chris McDonald",
  title =        "{DEPICT}: a Topology-Based Debugger for {MPI}
                 Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2026",
  pages =        "109--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:03:43 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2026.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2026/20260109.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2026/20260109.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Huse:2001:LST,
  author =       "Lars Paul Huse",
  title =        "Layering {SHMEM} on Top of {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "44--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310044.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310044.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Ilroy:2001:IMP,
  author =       "Jonathan Ilroy and Cyrille Randriamaro and Gil Utard",
  title =        "Improving {MPI-I/O} Performance on {PVFS}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2150",
  pages =        "911--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:53 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2150.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2150/21500911.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2150/21500911.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Iwama:2001:PLS,
  author =       "Kazuo Iwama and Daisuke Kawai and Shuichi Miyazaki and
                 Yasuo Okabe and Jun Umemoto",
  title =        "Parallelizing Local Search for {CNF} Satisfiability
                 Using Vectorization and {PVM}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1982",
  pages =        "123--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:03:03 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1982.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1982/19820123.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1982/19820123.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Jorba:2001:SFF,
  author =       "Josep Jorba and Tom{\`a}s Margalef and Emilio Luque",
  title =        "Simulation of Forest Fire Propagation on Parallel {\&}
                 Distributed {PVM} Platforms",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "386--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310386.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310386.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Kaiser:2001:OCC,
  author =       "Timothy H. Kaiser and Scott B. Baden",
  title =        "Overlapping communication and computation with
                 {OpenMP} and {MPI}",
  journal =      j-SCI-PROG,
  volume =       "9",
  number =       "2--3",
  pages =        "73--81",
  month =        "Spring--Summer",
  year =         "2001",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Thu Mar 28 12:27:27 MST 2002",
  bibsource =    "Compendex database;
                 http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
                 OCLC Article1st database",
  URL =          "http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=7pab6qgbaf8vxg991rwy%26referrer=parent%26backto=issue%2C2%2C11%3Bjournal%2C1%2C9%3Blinkingpublicationresults%2C1%2C1",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Kambites:2001:OLI,
  author =       "M. E. Kambites and J. Obdr{\v{z}}{\'a}lek and J. M.
                 Bull",
  title =        "An {OpenMP}-like interface for parallel programming in
                 {Java}",
  journal =      j-CCPE,
  volume =       "13",
  number =       "8--9",
  pages =        "793--814",
  month =        jul # "\slash " # aug,
  year =         "2001",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.579",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Wed Jul 25 10:55:47 MDT 2001",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/84503220/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=84503220&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Kasahara:2001:ACG,
  author =       "Hironori Kasahara and Motoki Obata and Kazuhisa
                 Ishizaka",
  title =        "Automatic Coarse Grain Task Parallel Processing on
                 {SMP} Using {OpenMP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2017",
  pages =        "189--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:03:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2017.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2017/20170189.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2017/20170189.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Kobler:2001:DOP,
  author =       "Rene Kobler and Dieter Kranzlm{\"u}ller and Jens
                 Volkert",
  title =        "Debugging {OpenMP} Programs Using Event Manipulation",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2104",
  pages =        "81--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:04 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040081.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2104/21040081.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Konstantinou:2001:TTO,
  author =       "Dimitris Konstantinou and Nectarios Koziris and George
                 Papakonstantinou",
  title =        "{TOPPER}: a Tool for Optimizing the Performance of
                 Parallel Applications",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "148--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310148.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310148.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Kranzlmuller:2001:IRM,
  author =       "Dieter Kranzlm{\"u}ller and Christian
                 Schaubschl{\"a}ger and Jens Volkert",
  title =        "An Integrated Record{\&}Replay Mechanism for
                 Nondeterministic Message Passing Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "192--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310192.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310192.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Krawczyk:2001:PIM,
  author =       "Henryk Krawczyk and Jamil Saif",
  title =        "Parallel Image Matching on {PC} Cluster",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "312--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310312.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310312.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Kucukboyaci:2001:PPT,
  author =       "Vefa Kucukboyaci and Alireza Haghighat and Glenn E.
                 Sjoden",
  title =        "Performance of {PENTRAN\texttrademark} {$3$-D} Parallel
                 Particle Transport Code on the {IBM SP2} and {PCTRAN}
                 Cluster",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "36--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310036.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310036.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Kusano:2001:OOC,
  author =       "Kazuhiro Kusano and Mitsuhisa Sato and Takeo Hosomi
                 and Yoshiki Seo",
  title =        "The {Omni OpenMP} Compiler on the Distributed Shared
                 Memory of {Cenju-4}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2104",
  pages =        "20--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:04 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040020.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2104/21040020.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Labarta:2001:NOD,
  author =       "J. Labarta and J. Oliver and D. S. Henty and Eduard
                 Ayguad{\'e}",
  title =        "New {OpenMP} directives for irregular data access
                 loops",
  journal =      j-SCI-PROG,
  volume =       "9",
  number =       "2--3",
  pages =        "175--183",
  month =        "Spring--Summer",
  year =         "2001",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Thu Mar 28 12:27:27 MST 2002",
  bibsource =    "Compendex database;
                 http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
                 OCLC Article1st database",
  URL =          "http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=7pab6qgbaf8vxg991rwy%26referrer=parent%26backto=issue%2C10%2C11%3Bjournal%2C1%2C9%3Blinkingpublicationresults%2C1%2C1",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
  xxauthor =     "J. Labarta and E. Ayguad{\'e} and J. Oliver and
                 others",
}

@Article{Laforenza:2001:PHP,
  author =       "Domenico Laforenza",
  title =        "Programming High Performance Applications in Grid
                 Environments",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "8--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310008.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310008.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Lee:2001:APT,
  author =       "D. J. Lee and T. J. Downar",
  title =        "The Application of {POSIX} Threads and {OpenMP} to the
                 {U.S. NRC} Neutron Kinetics Code {PARCS}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2104",
  pages =        "90--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:04 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040090.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2104/21040090.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Li:2001:PCS,
  author =       "Michael Na Li and A. J. Rossini",
  title =        "\pkg{RPVM}: Cluster Statistical Computing in {R}",
  journal =      j-R-NEWS,
  volume =       "1",
  number =       "3",
  pages =        "4--7",
  month =        sep,
  year =         "2001",
  CODEN =        "????",
  ISSN =         "1609-3631",
  ISSN-L =       "1609-3631",
  bibdate =      "Thu Aug 13 09:25:10 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/rjournal.bib",
  URL =          "http://CRAN.R-project.org/doc/Rnews/",
  acknowledgement = ack-r-project,
  fjournal =     "R News: the Newsletter of the R Project",
  journal-URL =  "http://journal.r-project.org/",
  pdf =          Rnews2001-3,
}

@Article{Li:2001:WMB,
  author =       "Maozhen Li and Omer F. Rana and David W. Walker",
  title =        "Wrapping {MPI}-based legacy codes as {Java\slash
                 CORBA} components",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "18",
  number =       "2",
  pages =        "213--223",
  month =        oct,
  year =         "2001",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Wed Feb 27 12:41:22 MST 2002",
  bibsource =    "http://www.elsevier.com/locate/issn/0167739X;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.com/gej-ng/10/19/19/60/31/29/abstract.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Luecke:2001:SPO,
  author =       "Glenn R. Luecke and Wei-Hua Lin",
  title =        "Scalability and performance of {OpenMP} and {MPI} on a
                 128-processor {SGI Origin 2000}",
  journal =      j-CCPE,
  volume =       "13",
  number =       "10",
  pages =        "905--928",
  day =          "25",
  month =        aug,
  year =         "2001",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.588",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Feb 25 14:51:23 MST 2002",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/85007180/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=85007180&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Luo:2001:PDE,
  author =       "Jun Luo and Sanguthevar Rajasekaran and Chenxia Qiu",
  title =        "Parallizing $1$-Dimensional Estuarine Model",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "257--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310257.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310257.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Macias:2001:PPA,
  author =       "Elsa M. Mac{\'\i}as and Alvaro Su{\'a}rez and C. N.
                 Ojeda-Guerra and E. Robayna",
  title =        "Programming Parallel Applications with {LAMGAC} in a
                 {LAN--WLAN} Environment",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "158--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310158.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310158.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Malfetti:2001:AOW,
  author =       "Paolo Malfetti",
  title =        "Application of {OpenMP} to weather, wave and ocean
                 codes",
  journal =      j-SCI-PROG,
  volume =       "9",
  number =       "2--3",
  pages =        "99--107",
  month =        "Spring--Summer",
  year =         "2001",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Thu Mar 28 12:27:27 MST 2002",
  bibsource =    "Compendex database;
                 http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
                 OCLC Article1st database",
  URL =          "http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=7pab6qgbaf8vxg991rwy%26referrer=parent%26backto=issue%2C4%2C11%3Bjournal%2C1%2C9%3Blinkingpublicationresults%2C1%2C1",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Manis:2001:PNP,
  author =       "G. Manis",
  title =        "Persistent and Non-persistent Data Objects on Top of
                 {PVM} and {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "91--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310091.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310091.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Matthey:2001:EMO,
  author =       "T. Matthey and J. P. Hansen",
  title =        "Evaluation of {MPI}'s One-Sided Communication
                 Mechanism for Short-Range Molecular Dynamics on the
                 {Origin2000}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1947",
  pages =        "356--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:02:51 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1947.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1947/19470356.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1947/19470356.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Mattson:2001:EO,
  author =       "Timothy Mattson",
  title =        "The Evolution of {OpenMP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1947",
  pages =        "19--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:02:51 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1947.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1947/19470019.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1947/19470019.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Matuszek:2001:APS,
  author =       {Mariusz R. Matuszek},
  title =        {Assessment of {PVM} Suitability to Testbed Client-Agent-Server
                 Applications},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {69--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310069.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310069.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Michailidis:2001:TSH,
  author =       {Panagiotis D. Michailidis and
                 Konstantinos G. Margaritis},
  title =        {Text Searching on a Heterogeneous Cluster of Workstations},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {378--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310378.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310378.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Min:2001:PCO,
  author =       {Seung Jai Min and Seon Wook Kim and Michael Voss
                 and Sang Ik Lee and Rudolf Eigenmann},
  title =        {Portable Compilers for {OpenMP}},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2104},
  pages =        {11--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040011.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2104/21040011.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Feb 2 13:05:04 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Moore:2001:RPA,
  author =       {Shirley Moore and David Cronk and Kevin London
                 and Jack Dongarra},
  title =        {Review of Performance Analysis Tools for {MPI} Parallel
                 Programs},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {241--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310241.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310241.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Moreno:2001:AEP,
  author =       {Luz Marina Moreno and Francisco Almeida and
                 Daniel Gonz{\'a}lez and Casiano Rodr{\'\i}guez},
  title =        {Adaptive Execution of Pipelines},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {217--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310217.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310217.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Muller:2001:SSO,
  author =       {Matthias M{\"u}ller},
  title =        {Some Simple {OpenMP} Optimization Techniques},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2104},
  pages =        {31--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040031.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2104/21040031.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Feb 2 13:05:04 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Neophytou:2001:NDW,
  author =       {Neophytos Neophytou and Paraskevas Evripidou},
  title =        {{Net-dbx}: a {Web}-Based Debugger of {MPI} Programs Over
                 Low-Bandwidth Lines},
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  fjournal =     {IEEE Transactions on Parallel and Distributed Systems},
  journal-URL =  {http://www.computer.org/tpds/archives.htm},
  year =         {2001},
  month =        sep,
  volume =       {12},
  number =       {9},
  pages =        {986--995},
  DOI =          {https://doi.org/10.1109/71.954636},
  URL =          {http://dlib.computer.org/td/books/td2001/pdf/l0986.pdf;
                 http://www.computer.org/tpds/td2001/l0986abs.htm},
  CODEN =        {ITDSEO},
  ISSN =         {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L =       {1045-9219},
  bibdate =      {Sat Feb 23 09:26:03 MST 2002},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Nicolescu:2001:DTP,
  author =       {Cristina Nicolescu and Pieter Jonker},
  title =        {A Data and Task Parallel Image Processing Environment},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {393--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310393.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310393.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Nikolopoulos:2001:EMA,
  author =       {D. S. Nikolopoulos and E. Artiaga and
                 E. Ayguad{\'e} and J. Labarta},
  title =        {Exploiting memory affinity in {OpenMP} through schedule
                 reuse},
  journal =      j-COMP-ARCH-NEWS,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J89},
  year =         {2001},
  month =        dec,
  volume =       {29},
  number =       {5},
  pages =        {49--55},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Nikolopoulos:2001:SID,
  author =       {Dimitrios S. Nikolopoulos and Eduard Ayguad{\'e}},
  title =        {A Study of Implicit Data Distribution Methods for {OpenMP}
                 Using the {SPEC} Benchmarks},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2104},
  pages =        {115--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040115.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2104/21040115.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Feb 2 13:05:04 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Okulicka-Dluzewska:2001:PFE,
  author =       "Felicja Okulicka-D{\l}u{\.z}ewska",
  title =        "Parallelization of Finite Element Package by {MPI}
                 Library",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "427--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310427.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310427.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Ong:2001:SUC,
  author =       {Emil Ong and Ewing Lusk and William Gropp},
  title =        {Scalable {Unix} Commands for Parallel Processors:
                 a High-Performance Implementation},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {410--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310410.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310410.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Pagourtzis:2001:PCT,
  author =       {Aris Pagourtzis and Igor Potapov and Wojciech Rytter},
  title =        {{PVM} Computation of the Transitive Closure:
                 The Dependency Graph Approach},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {249--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310249.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310249.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Papadopoulos:2001:NRC,
  author =       {Philip M. Papadopoulos and Mason J. Katz and
                 Greg Bruno},
  title =        {{NPACI} Rocks Clusters: Tools for Easily Deploying
                 and Maintaining Manageable High-Performance {Linux} Clusters},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {10--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310010.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310010.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Park:2001:CSL,
  author =       {So-Hee Park and Mi-Young Park and Yong-Kee Jun},
  title =        {A Comparison of Scalable Labeling Schemes for Detecting
                 Races in {OpenMP} Programs},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2104},
  pages =        {68--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040068.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2104/21040068.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Feb 2 13:05:04 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Park:2001:PPE,
  author =       {Insung Park and Michael J. Voss and Seon Wook Kim
                 and Rudolf Eigenmann},
  title =        {Parallel programming environment for {OpenMP}},
  journal =      j-SCI-PROG,
  fjournal =     {Scientific Programming},
  journal-URL =  {http://iospress.metapress.com/content/1058-9244},
  year =         {2001},
  month =        {Spring--Summer},
  volume =       {9},
  number =       {2--3},
  pages =        {143--161},
  xxpages =      {143--162},
  URL =          {http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=7pab6qgbaf8vxg991rwy%26referrer=parent%26backto=issue%2C8%2C11%3Bjournal%2C1%2C9%3Blinkingpublicationresults%2C1%2C1},
  CODEN =        {SCIPEV},
  ISSN =         {1058-9244 (print), 1875-919X (electronic)},
  ISSN-L =       {1058-9244},
  bibdate =      {Thu Mar 28 12:27:27 MST 2002},
  bibsource =    {Compendex database;
                 http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
                 OCLC Article1st database},
  acknowledgement = ack-nhfb,
}

@Article{Pears:2001:DLB,
  author =       {Arnold N. Pears and Nicola Thong},
  title =        {A Dynamic Load Balancing Architecture for {PDES}
                 Using {PVM} on Clusters},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {166--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310166.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310166.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Pedroso:2001:WLE,
  author =       {Hern{\^a}ni Pedroso and Jo{\~a}o Gabriel Silva},
  title =        {The {WMPI} Library Evolution: Experience with {MPI} Development
                 for {Windows} Environments},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {1900},
  pages =        {1157--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/1900/19001157.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1900/19001157.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Feb 2 13:02:44 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@TechReport{Petcu:2001:WMM,
  author =       "Dana Petcu",
  title =        "Working with Multiple {Maple} Kernels Connected by
                 {Distributed Maple} or {PVMaple}",
  type =         "Technical report",
  institution =  "Western University of Timisoara",
  address =      "Timisoara, Romania",
  month =        mar,
  year =         "2001",
  bibdate =      "Wed Dec 17 18:07:37 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.risc.uni-linz.ac.at/software/distmaple/index_1.html",
  URL =          "http://www.risc.uni-linz.ac.at/software/distmaple/misc/petcu2001.ps.gz",
  acknowledgement = ack-nhfb,
  keywords =     "Distributed Maple; PVMaple",
}

@Article{Plagianakos:2001:LCP,
  author =       {V. P. Plagianakos and N. K. Nousis and
                 M. N. Vrahatis},
  title =        {Locating and computing in parallel all the simple roots
                 of special functions using {PVM}},
  journal =      j-J-COMPUT-APPL-MATH,
  fjournal =     {Journal of Computational and Applied Mathematics},
  journal-URL =  {http://www.sciencedirect.com/science/journal/03770427},
  year =         {2001},
  month =        aug,
  day =          {1},
  volume =       {133},
  number =       {1--2},
  pages =        {545--554},
  DOI =          {https://doi.org/10.1016/S0377-0427(00)00675-0},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0377042700006750},
  CODEN =        {JCAMDI},
  ISSN =         {0377-0427 (print), 1879-1778 (electronic)},
  ISSN-L =       {0377-0427},
  bibdate =      {Sat Feb 25 12:45:19 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/elefunt.bib;
                 http://www.math.utah.edu/pub/tex/bib/jcomputapplmath2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Plunkett:2001:AMD,
  author =       {Craig L. Plunkett and Alfred G. Striz and
                 J. Sobieszczanski-Sobieski},
  title =        {Application of {MPI} in Displacement Based Multilevel Structural
                 Optimization},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {335--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310335.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310335.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Pringle:2001:TPF,
  author =       {Gavin J. Pringle and Steven P. Booth and
                 Hugh M. P. Couchman and Frazer R. Pearce and
                 Alan D. Simpson},
  title =        {Towards a Portable, Fast Parallel {AP$^3$M-SPH}
                 Code: {HYDRA\_MPI}},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {360--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310360.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310360.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Prost:2001:MIG,
  author =       {Jean-Pierre Prost and Richard Treumann and
                 Richard Hedges and Bin Jia and Alice Koniges},
  title =        {{MPI-IO\slash GPFS}, an Optimized Implementation
                 of {MPI-IO} on top of {GPFS}},
  crossref =     {ACM:2001:SHP},
  year =         {2001},
  pages =        {??--??},
  pagecount =    {15},
  URL =          {http://www.sc2001.org/papers/pap.pap186.pdf},
  bibdate =      {Sat Feb 10 14:28:55 2001},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Prost:2001:THP,
  author =       {Jean-Pierre Prost and Richard Treumann and
                 Richard Hedges and Alice Koniges and Alison White},
  title =        {Towards a High-Performance Implementation of
                 {MPI--IO} on Top of {GPFS}},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {1900},
  pages =        {1253--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/1900/19001253.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1900/19001253.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Feb 2 13:02:44 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Protopopov:2001:MMP,
  author =       {Boris V. Protopopov and Anthony Skjellum},
  title =        {A Multithreaded {Message Passing Interface (MPI)} Architecture:
                 Performance and Program Issues},
  journal =      j-J-PAR-DIST-COMP,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/07437315},
  year =         {2001},
  month =        apr,
  day =          {1},
  volume =       {61},
  number =       {4},
  pages =        {449--466},
  DOI =          {https://doi.org/10.1006/jpdc.2000.1674},
  URL =          {http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1674;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1674/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1674/ref},
  CODEN =        {JPDCER},
  ISSN =         {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L =       {0743-7315},
  bibdate =      {Fri Feb 22 15:30:36 MST 2002},
  bibsource =    {http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rabenseifner:2001:ECF,
  author =       {Rolf Rabenseifner and Alice E. Koniges},
  title =        {Effective Communication and File-{I/O} Bandwidth Benchmarks},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {24--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310024.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310024.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@TechReport{Rageb:2001:CEM,
  author =       {Khaled Rageb and Wolfgang Rehm},
  title =        {{CHEMPI}: efficient {MPI} for {VIA\slash SCI}},
  institution =  {Technische Universit{\"a}t Chemnitz},
  address =      {Chemnitz, Germany},
  type =         {{Preprint-Reihe des Chemnitzer}},
  number =       {{SFB 393}},
  year =         {2001},
  pages =        {12},
  bibdate =      {Wed Aug 27 06:45:29 2003},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Reinefeld:2001:CDI,
  author =       {Alexander Reinefeld},
  title =        {Clusters for Data-Intensive Applications in the
                 Grid},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {12--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310012.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310012.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Reussner:2001:APP,
  author =       {Ralf Reussner and Gunnar Hunzelmann},
  title =        {Achieving Performance Portability with {SKaMPI}
                 for High-Performance {MPI} Programs},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2074},
  pages =        {841--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2074/20740841.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2074/20740841.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Feb 2 13:04:30 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2074.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@TechReport{Reussner:2001:SSK,
  author =       {Ralf H. Reussner},
  title =        {{SKaMPI}: the special {Karlsruher} {MPI}-benchmark: user
                 manual},
  institution =  {Fakult{\"a}t f{\"u}r Informatik,
                 Universit{\"a}t Karlsruhe},
  address =      {Karlsruhe, Germany},
  type =         {{Interner Bericht}},
  number =       {99,02},
  year =         {2001},
  pages =        {78},
  bibdate =      {Wed Aug 27 06:47:26 2003},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Roig:2001:EMM,
  author =       {Concepci{\'o} Roig and Ana Ripoll and
                 Javier Borr{\'a}s and Emilio Luque},
  title =        {Efficient Mapping for Message-Passing Applications Using
                 the {TTIG} Model: a Case Study in Image Processing},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {370--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310370.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310370.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Roussos:2001:BMB,
  author =       {George Roussos and B. J. C. Baxter},
  title =        {Biharmonic Many Body Calculations for Fast Evaluation of
                 Radial Basis Function Interpolants in Cluster Environments},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {288--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310288.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310288.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rungsawang:2001:LCP,
  author =       {A. Rungsawang and A. Laohakanniyom and
                 M. Lertprasertkune},
  title =        {Low-Cost Parallel Text Retrieval Using
                 {PC}-Cluster},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2131},
  pages =        {419--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310419.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310419.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Fri Feb 1 08:13:55 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Sahimi:2001:AAS,
  author =       {Mohd Salleh Sahimi and Norma Alias and
                 Elankovan Sundararajan},
  title =        {The {AGEB} Algorithm for Solving the Heat Equation
                 in Three Space Dimensions and Its Parallelization
                 Using {PVM}},
  journal =      j-LECT-NOTES-COMP-SCI,
  fjournal =     {Lecture Notes in Computer Science},
  year =         {2001},
  volume =       {2073},
  pages =        {918--??},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2073/20730918.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2073/20730918.pdf},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Feb 2 13:04:28 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2073.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Sato:2001:CEO,
  author =       {Mitsuhisa Sato and Hiroshi Harada and
                 Atsushi Hasegawa and Yutaka Ishikawa},
  title =        {Cluster-enabled {OpenMP}: An {OpenMP} compiler for
                 the {SCASH} software distributed shared memory system},
  journal =      j-SCI-PROG,
  fjournal =     {Scientific Programming},
  journal-URL =  {http://iospress.metapress.com/content/1058-9244},
  year =         {2001},
  month =        {Spring--Summer},
  volume =       {9},
  number =       {2--3},
  pages =        {123--130},
  URL =          {http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=7pab6qgbaf8vxg991rwy%26referrer=parent%26backto=issue%2C6%2C11%3Bjournal%2C1%2C9%3Blinkingpublicationresults%2C1%2C1},
  CODEN =        {SCIPEV},
  ISSN =         {1058-9244 (print), 1875-919X (electronic)},
  ISSN-L =       {1058-9244},
  bibdate =      {Thu Mar 28 12:27:27 MST 2002},
  bibsource =    {Compendex database;
                 http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
                 OCLC Article1st database},
  acknowledgement = ack-nhfb,
}

@Article{Sato:2001:OGR,
  author =       "Mitsuhisa Sato and Motonari Hirano and Yoshio Tanaka
                 and Satoshi Sekiguchi",
  title =        "{OmniRPC}: a {Grid} {RPC} Facility for Cluster and
                 Global Computing in {OpenMP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2104",
  pages =        "130--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:04 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2001b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2104/21040130.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2104/21040130.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Satoh:2001:COT,
  author =       "Shigehisa Satoh and Kazuhiro Kusano and Mitsuhisa
                 Sato",
  title =        "Compiler optimization techniques for {OpenMP}
                 programs",
  journal =      j-SCI-PROG,
  volume =       "9",
  number =       "2--3",
  pages =        "131--142",
  month =        "Spring--Summer",
  year =         "2001",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Thu Mar 28 12:27:27 MST 2002",
  bibsource =    "Compendex database;
                 http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
                 OCLC Article1st database",
  URL =          "http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=7pab6qgbaf8vxg991rwy%26referrer=parent%26backto=issue%2C7%2C11%3Bjournal%2C1%2C9%3Blinkingpublicationresults%2C1%2C1",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Schevtschenko:2001:PAS,
  author =       "I. V. Schevtschenko",
  title =        "A Parallel {ADI} and Steepest Descent Methods",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "265--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310265.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310265.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Shan:2001:CMS,
  author =       "Hongzhang Shan and Jaswinder Pal Singh",
  title =        "A Comparison of {MPI}, {SHMEM} and Cache-Coherent
                 Shared Address Space Programming Models on a
                 Tightly-Coupled Multiprocessors",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "29",
  number =       "3",
  pages =        "283--318",
  month =        jun,
  year =         "2001",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Feb 20 09:55:15 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://ipsapp009.lwwonline.com/content/getfile/4773/21/3/abstract.htm;
                 http://ipsapp009.lwwonline.com/content/getfile/4773/21/3/fulltext.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Skjellum:2001:OOA,
  author =       "Anthony Skjellum and Diane G. Wooley and Ziyang Lu and
                 Michael Wolf and Purushotham V. Bangalore and Andrew
                 Lumsdaine and Jeffrey M. Squyres and Brian McCandless",
  title =        "Object-oriented analysis and design of the {Message
                 Passing Interface}",
  journal =      j-CCPE,
  volume =       "13",
  number =       "4",
  pages =        "245--292",
  day =          "10",
  month =        apr,
  year =         "2001",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.556",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Wed Jul 25 10:55:46 MDT 2001",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/78502300/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=78502300&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Smith:2001:DMM,
  author =       "Lorna Smith and Mark Bull",
  title =        "Development of mixed mode {MPI\slash OpenMP}
                 applications",
  journal =      j-SCI-PROG,
  volume =       "9",
  number =       "2--3",
  pages =        "83--98",
  month =        "Spring--Summer",
  year =         "2001",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Thu Mar 28 12:27:27 MST 2002",
  bibsource =    "Compendex database;
                 http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib;
                 OCLC Article1st database",
  URL =          "http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=7pab6qgbaf8vxg991rwy%26referrer=parent%26backto=issue%2C3%2C11%3Bjournal%2C1%2C9%3Blinkingpublicationresults%2C1%2C1",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Solsona:2001:IEI,
  author =       "Francesc Solsona and Francesc Gin{\'e} and Porfidio
                 Hern{\'a}ndez and Emilio Luque",
  title =        "Implementing Explicit and Implicit Coscheduling in a
                 {PVM} Environment (Research Note)",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1900",
  pages =        "1165--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:02:44 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19001165.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1900/19001165.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{SousaPinto:2001:PEI,
  author =       "Jorge {Sousa Pinto}",
  title =        "Parallel Evaluation of Interaction Nets with {MPINE}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2051",
  pages =        "353--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:04:07 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2051.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2051/20510353.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2051/20510353.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Sunderam:2001:CAP,
  author =       "Vaidy Sunderam and Zsolt N{\'e}meth",
  title =        "A Comparative Analysis of {PVM\slash MPI} and
                 Computational {Grids}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "14--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310014.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310014.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Suppi:2001:PCS,
  author =       "Remo Suppi and Fernando Cores and Emilio Luque",
  title =        "{PDES}: a Case Study Using the Switch Time Warp",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "327--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310327.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310327.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Swann:2001:SPC,
  author =       "Christopher A. Swann",
  title =        "Software for parallel computing: the {LAM}
                 implementation of {MPI}",
  journal =      j-J-APPL-ECONOMETRICS,
  volume =       "16",
  number =       "2",
  pages =        "185--194",
  month =        mar # "--" # apr,
  year =         "2001",
  CODEN =        "JAECET",
  DOI =          "https://doi.org/10.1002/jae.595",
  ISSN =         "0883-7252 (print), 1099-1255 (electronic)",
  ISSN-L =       "0883-7252",
  bibdate =      "Sat Mar 9 10:20:01 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jappleconometrics.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Applied Econometrics",
  journal-URL =  "https://onlinelibrary.wiley.com/journal/10991255;
                 https://www.jstor.org/journal/japplecon",
  onlinedate =   "23 April 2001",
}

@Article{Takeda:2001:AME,
  author =       "K. Takeda and N. K. Allsopp and J. C. Hardwick and P.
                 C. Macey and D. A. Nicole and S. J. Cox and D. J.
                 Lancaster",
  title =        "An Assessment of {MPI} Environments for {Windows NT}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "19",
  number =       "3",
  pages =        "315--323",
  month =        jul,
  year =         "2001",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 25 09:05:33 MDT 2001",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.wkap.nl/issuetoc.htm/0920-8542+19+3+2001",
  URL =          "http://www.wkap.nl/oasis.htm/338207",
  abstract =     "In this paper we evaluate the MPI environments
                 currently available for Windows NT on the Intel IA32
                 and Compaq DEC Alpha architectures. We present
                 benchmark results for low-level communication and for
                 the NAS Parallel Benchmarks to allow comparison with
                 other systems, but our primary interest is determining
                 real application performance and robustness in
                 production cluster environments. For this we use
                 PAFEC-FE, a large FORTRAN code for finite-element
                 analysis. We present results from three MPI
                 implementations, two architectures, and three
                 networking technologies (10 and 100 Mbit/s Ethernet and
                 1 Gbit/s Myrinet).",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Tinetti:2001:HNW,
  author =       "Fernando Tinetti and Antonio Quijano and Armando {De
                 Giusti} and Emilio Luque",
  title =        "Heterogeneous Networks of Workstations and the
                 Parallel Matrix Multiplication",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "296--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310296.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310296.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Tourancheau:2001:SMN,
  author =       "Bernard Tourancheau and Roland Westrelin",
  title =        "Support for {MPI} at the Network Interface Level",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "52--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310052.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310052.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Trobec:2001:IEM,
  author =       "R. Trobec and M. {\v{S}}terk and M. Praprotnik and D.
                 Jane{\v{z}}i{\v{c}}",
  title =        "Implementation and evaluation of {MPI}-based parallel
                 {MD} program",
  journal =      j-IJQC,
  volume =       "84",
  number =       "1",
  pages =        "23--31",
  month =        "????",
  year =         "2001",
  CODEN =        "IJQCB2",
  DOI =          "https://doi.org/10.1002/qua.1303",
  ISSN =         "0020-7608 (print), 1097-461X (electronic)",
  ISSN-L =       "0020-7608",
  bibdate =      "Wed Jul 25 09:32:26 MDT 2001",
  bibsource =    "http://www.interscience.wiley.com/jpages/0020-7608;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/84002438/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext/84002438/FILE?TPL=ftx_start;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=84002438&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Quantum Chemistry",
  journal-URL =  "http://www.interscience.wiley.com/jpages/0020-7608/",
}

@Article{Uthayopas:2001:FSR,
  author =       "Putchong Uthayopas and Sugree Phatanapherom",
  title =        "Fast and Scalable Real-Time Monitoring System for
                 {Beowulf} Clusters",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "201--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310201.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310201.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Walker:2001:DLB,
  author =       "Reginald L. Walker",
  title =        "Dynamic Load Balancing Model: Preliminary Results for
                 Parallel Pseudo-search Engine Indexers\slash Crawler
                 Mechanisms Using {MPI} and Genetic Programming",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1981",
  pages =        "61--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:03:02 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1981.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1981/19810061.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1981/19810061.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Walker:2001:SEC,
  author =       "Reginald L. Walker",
  title =        "Search engine case study: searching the {Web} using
                 genetic programming and {MPI}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "27",
  number =       "1--2",
  pages =        "71--89",
  month =        jan,
  year =         "2001",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed Jul 18 06:31:14 MDT 2001",
  bibsource =    "http://www.elsevier.com/locate/issn/01678191;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.nl/gej-ng/10/35/21/47/25/25/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/35/21/47/25/25/article.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Wismuller:2001:UMT,
  author =       "Roland Wism{\"u}ller",
  title =        "Using Monitoring Techniques to Support the Cooperation
                 of Software Components",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "183--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310183.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310183.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Wolf:2001:APA,
  author =       "Felix Wolf and Bernd Mohr",
  title =        "Automatic Performance Analysis of {MPI} Applications
                 Based on Event Traces",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1900",
  pages =        "123--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:02:44 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19000123.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1900/19000123.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Wu:2001:PCS,
  author =       "Guang Jun Wu and Robert Roy",
  title =        "Parallelization of Characteristics Solvers for {$3$D}
                 Neutron Transport",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "344--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310344.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310344.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Yero:2001:JOO,
  author =       "Eduardo J. H. Yero and Marco A. A. Henriques and
                 Javier R. Garc{\'\i}a and Alina C. Leyva",
  title =        "{JOINT}: An Object Oriented Message Passing Interface
                 for Parallel Programming in {Java}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2110",
  pages =        "637--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:11 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2110.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2110/21100637.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2110/21100637.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Zhang:2001:PPV,
  author =       "Xin Zhang and Lingli Ding and Elke A. Rundensteiner",
  title =        "{PVM}: {Parallel View Maintenance} under Concurrent
                 Data Updates of Distributed Sources",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2114",
  pages =        "230--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:16 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2114.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2114/21140230.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2114/21140230.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Zoltani:2001:EPO,
  author =       "Csaba K. Zoltani and Punyam Satya-narayana and Dixie
                 Hisley",
  title =        "Evaluating Performance of {OpenMP} and {MPI} on the
                 {SGI Origin 2000} with Benchmarks of Realistic Problem
                 Sizes",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "4",
  number =       "4",
  pages =        "??--??",
  month =        dec,
  year =         "2001",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Thu Sep 2 12:08:56 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/4/4.toc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{Acacio:2002:MDM,
  author =       "M. Acacio and O. C{\'a}novas and J. M. Garc{\'\i}a and
                 P. E. L{\'o}pez-de-Teruel",
  title =        "{MPI-Delphi}: an {MPI} implementation for visual
                 programming environments and heterogeneous computing",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "18",
  number =       "3",
  pages =        "317--333",
  month =        jan,
  year =         "2002",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Wed Feb 27 12:41:22 MST 2002",
  bibsource =    "http://www.elsevier.com/locate/issn/0167739X;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.elsevier.com/gej-ng/10/19/19/60/32/28/abstract.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Bane:2002:EOA,
  author =       "M. K. Bane and G. D. Riley",
  title =        "Extended Overhead Analysis for {OpenMP} (Research
                 Note)",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2400",
  pages =        "162--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:10:14 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2400.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002c.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2400/24000162.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2400/24000162.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Basumallik:2002:TOE,
  author =       "Ayon Basumallik and Seung-Jai Min and Rudolf
                 Eigenmann",
  title =        "Towards {OpenMP} Execution on Software Distributed
                 Shared Memory Systems",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2327",
  pages =        "457--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:32 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2327.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2327/23270457.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2327/23270457.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bekas:2002:PCP,
  author =       "Constantine Bekas and Efrosini Kokiopoulou and
                 Efstratios Gallopoulos and Valeria Simoncini",
  title =        "Parallel Computation of Pseudospectra Using Transfer
                 Functions on a {MATLAB-MPI} Cluster Platform",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "199--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740199.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740199.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bisseling:2002:FMF,
  author =       "Georg Bi{\ss}eling and Hans-Christian Hoppe and
                 Alexander Supalov and Pierre Lagier and Jean Latour",
  title =        "{Fujitsu MPI-2}: Fast Locally, Reaching Globally",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "401--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740401.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740401.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Blanco:2002:PMA,
  author =       "V. Blanco and L. Garc{\'\i}a and J. A. Gonz{\'a}lez
                 and C. Rodr{\'\i}guez and G. Rodr{\'\i}guez",
  title =        "A Performance Model for the Analysis of {OpenMP}
                 Programs",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "5",
  number =       "2",
  pages =        "139--151",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Thu Sep 2 12:08:56 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/5/2.toc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@InProceedings{Bosilca:2002:MVT,
  author =       "George Bosilca and Aurelien Bouteiller and Franck
                 Cappello and Samir Djilali and Gilles Fedak and Cecile
                 Germain and Thomas Herault and Pierre Lemarinier and
                 Oleg Lodygensky and Frederic Magniette and Vincent Neri
                 and Anton Selikhov",
  title =        "{MPICH-V}: Toward a Scalable Fault Tolerant {MPI} for
                 Volatile Nodes",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap298.pdf",
  abstract =     "Global Computing platforms, large scale clusters and
                 future TeraGRID systems gather thousands of nodes for
                 computing parallel scientific applications. At this
                 scale, node failures or disconnections are frequent
                 events. This Volatility reduces the MTBF of the whole
                 system in the range of hours or minutes. We present
                 MPICH-V, an automatic Volatility tolerant MPI
                 environment based on uncoordinated checkpoint/rollback
                 and distributed message logging. MPICH-V architecture
                 relies on Channel Memories, Checkpoint servers and
                 theoretically proven protocols to execute existing or
                 new, SPMD and Master-Worker MPI applications on
                 volatile nodes. To evaluate its capabilities, we run
                 MPICH-V within a framework for which the number of
                 nodes, Channels Memories and Checkpoint Servers can be
                 completely configured as well as the node Volatility.
                 We present a detailed performance evaluation of every
                 component of MPICH-V and its global performance for
                 non-trivial parallel applications. Experimental results
                 demonstrate good scalability and high tolerance to node
                 volatility.",
  acknowledgement = ack-nhfb,
}

@Article{Brightwell:2002:DIM,
  author =       "Ron Brightwell and Arthur B. Maccabe and Rolf Riesen",
  title =        "Design and Implementation of {MPI} on {Portals 3.0}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "331--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740331.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740331.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Brightwell:2002:RMR,
  author =       "Ron Brightwell",
  title =        "Ready-Mode Receive: An Optimized Receive Function for
                 {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "385--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740385.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740385.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Calderon:2002:IMI,
  author =       "Alejandro Calder{\'o}n and F{\'e}lix Garc{\'\i}a and
                 Jes{\'u}s Carretero and Jose M. P{\'e}rez and Javier
                 Fern{\'a}ndez",
  title =        "An Implementation of {MPI-IO} on Expand: a Parallel
                 File System Based on {NFS} Servers",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "306--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740306.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740306.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Chapman:2002:APU,
  author =       "B. Chapman and F. Bregier and A. Patil and A.
                 Prabhakar",
  title =        "Achieving performance under {OpenMP} on {ccNUMA} and
                 software distributed shared memory systems",
  journal =      j-CCPE,
  volume =       "14",
  number =       "8--9",
  pages =        "713--739",
  month =        jul # "\slash " # aug,
  year =         "2002",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.646",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Nov 9 12:24:19 MST 2002",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/95016122/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=95016122{\&}PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Chapman:2002:PAD,
  author =       "Barbara Chapman",
  title =        "Parallel Application Development with the Hybrid {MPI
                 $+$ OpenMP} Programming Model",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "13--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002e.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740013.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740013.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Cotronis:2002:MMP,
  author =       "Yiannis Cotronis and Zacharias Tsiatsoulis",
  title =        "Modular {MPI} and {PVM} Components",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "252--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740252.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740252.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Czarnul:2002:DTI,
  author =       "Pawel Czarnul",
  title =        "Development and Tuning of Irregular Divide-and-Conquer
                 Applications in {DAMPVM\slash DAC}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "208--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740208.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740208.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{DeRose:2002:CCG,
  author =       "L. DeRose and F. Wolf",
  title =        "{CATCH} --- a Call-Graph Based Automatic Tool for
                 Capture of Hardware Performance Metrics for {MPI} and
                 {OpenMP} Applications",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2400",
  pages =        "167--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:10:14 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2400.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002c.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2400/24000167.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2400/24000167.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Ding:2002:MOP,
  author =       "Yun He and Chris H. Q. Ding",
  keywords =     "multidimensional arrays; index reshuffle; vacancy
                 tracking cycles; global exchange; dynamical remapping;
                 MPI; OpenMP; hybrid MPI/OpenMP; SMP cluster",
  title =        "{MPI} and {OpenMP} Paradigms on Cluster of {SMP}
                 Architectures: The Vacancy Tracking Algorithm for
                 Multi-Dimensional Array Transposition",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2002.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap325.pdf",
  abstract =     "We investigate remapping multi-dimensional arrays on
                 cluster of SMP architectures under OpenMP, MPI, and
                 hybrid paradigms. Traditional method of array transpose
                 needs an auxiliary array of the same size and a copy
                 back stage. We recently developed an in-place method
                 using vacancy tracking cycles. The vacancy tracking
                 algorithm outperforms the traditional 2-array method as
                 demonstrated by extensive comparisons. The independence
                 of vacancy tracking cycles allows efficient
                 parallelization of the in-place method on SMP
                 architectures at node level. Performance of
                 multi-threaded parallelism using OpenMP are tested with
                 different scheduling methods and different number of
                 threads. The vacancy tracking method is parallelized
                 using several parallel paradigms. At node level, pure
                 OpenMP outperforms pure MPI by a factor of 2.76. Across
                 entire cluster of SMP nodes, the hybrid MPI/OpenMP
                 implementation outperforms pure MPI by a factor of
                 4.44, demonstrating the validity of the parallel
                 paradigm of mixing MPI with OpenMP.",
  acknowledgement = ack-nhfb,
}

@Article{DiSerio:2002:ENN,
  author =       "Angela {Di Serio} and Mar{\'\i}a B. Ib{\'a}{\~n}ez",
  title =        "Evaluation of a Nearest-Neighbor Load Balancing
                 Strategy for Parallel Molecular Simulations in {MPI}
                 Environment",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "226--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740226.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740226.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Dow:2002:CMA,
  author =       "Chyi-Ren Dow and Jong-Shin Chen and Min-Chang Hsieh",
  title =        "Checkpointing {MPI} applications on symmetric
                 multi-processor machines using {SMPCkpt}",
  journal =      j-J-SYST-SOFTW,
  volume =       "63",
  number =       "2",
  pages =        "137--150",
  day =          "15",
  month =        aug,
  year =         "2002",
  CODEN =        "JSSODM",
  ISSN =         "0164-1212 (print), 1873-1228 (electronic)",
  ISSN-L =       "0164-1212",
  bibdate =      "Sat Oct 25 07:14:09 MDT 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of systems and software",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01641212",
}

@InProceedings{El-Ghazawi:2002:UPP,
  author =       "Tarek El-Ghazawi and Fran{\c{c}}ois Cantonnet",
  title =        "{UPC} Performance and Potential: a {NPB} Experimental
                 Study",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap316.pdf",
  abstract =     "UPC, or Unified Parallel C, is a parallel extension of
                 ANSI C. UPC follows a distributed shared memory
                 programming model aimed at leveraging the ease of
                 programming of the shared memory paradigm, while
                 enabling the exploitation of data locality. UPC
                 incorporates constructs that allow placing data near
                 the threads that manipulate them to minimize remote
                 accesses. This paper gives an overview of the concepts
                 and features of UPC and establishes, through extensive
                 performance measurements of NPB workloads, the
                 viability of the UPC programming language compared to
                 the other popular paradigms. Further, through
                 performance measurements we identify the challenges,
                 the remaining steps and the priorities for UPC. It will
                 be shown that with proper hand tuning and optimized
                 collective operations libraries, UPC performance will
                 be comparable to that of MPI. Furthermore, by
                 incorporating such improvements into automatic compiler
                 optimizations, UPC will compare quite favorably to
                 message passing in ease of programming.",
  acknowledgement = ack-nhfb,
  keywords =     "NPB (NAS Parallel Benchmark)",
}

@Article{Espenica:2002:PPA,
  author =       "Roberto Espenica and Pedro Medeiros",
  title =        "Porting {PVM} to the {VIA} Architecture Using a Fast
                 Communication Library",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "341--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740341.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740341.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@TechReport{Fagg:2002:FTM,
  author =       "Graham E. Fagg and Antonin Bukovsky and Sathish
                 Vadhiyar and Jack J. Dongarra",
  title =        "Fault Tolerant {MPI} for the {HARNESS MetaComputing}
                 System",
  type =         "Technical report",
  number =       "????",
  institution =  inst-UTK,
  address =      inst-UTK:adr,
  pages =        "14",
  year =         "2002",
  bibdate =      "Tue Jan 13 18:41:26 2004",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.netlib.org/netlib/utk/people/JackDongarra/PAPERS/ft-mpi-iccs-gef.pdf",
  acknowledgement = ack-nhfb,
}

@TechReport{Fagg:2002:HFTa,
  author =       "Graham E. Fagg and Jack J. Dongarra",
  title =        "{HARNESS} Fault Tolerant {MPI} Design, Usage and
                 Performance Issues",
  type =         "Technical report",
  number =       "????",
  institution =  inst-UTK,
  address =      inst-UTK:adr,
  year =         "2002",
  bibdate =      "Tue Jan 13 18:42:49 2004",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.netlib.org/netlib/utk/people/JackDongarra/PAPERS/ft-mpi-fgcs-grid-se.pdf",
  acknowledgement = ack-nhfb,
}

@Article{Fagg:2002:HFTb,
  author =       "Graham E. Fagg and Jack J. Dongarra",
  title =        "{HARNESS} fault tolerant {MPI} design, usage and
                 performance issues",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "18",
  number =       "8",
  pages =        "1127--1142",
  month =        oct,
  year =         "2002",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Sat Jan 10 10:03:29 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Field:2002:OSR,
  author =       "A. J. Field and P. H. J. Kelly and T. L. Hansen",
  title =        "Optimising Shared Reduction Variables in {MPI}
                 Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2400",
  pages =        "630--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:10:14 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2400.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2400/24000630.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2400/24000630.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Book{Garg:2002:TOA,
  author =       "Rajat P. Garg and Ilya Sharapov",
  title =        "Techniques for optimizing applications: high
                 performance computing",
  publisher =    pub-SUN-MICROSYSTEMS-PRESS,
  address =      pub-SUN-MICROSYSTEMS-PRESS:adr,
  pages =        "xliii + 616",
  year =         "2002",
  ISBN =         "0-13-093476-3",
  ISBN-13 =      "978-0-13-093476-5",
  LCCN =         "QA76.88 .G37 2002",
  bibdate =      "Fri Apr 11 08:26:42 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/unix.bib;
                 http://www.sun.com/blueprints/",
  series =       "Sun BluePrints Program",
  URL =          "http://www.sun.com/books/catalog/garg.html/index.html;
                 http://www.sun.com/solutions/blueprints/tools/",
  acknowledgement = ack-nhfb,
  annote =       "From the Web site: The \verb=HPC_code_examples.tar.Z=
                 tar-file contains the source code, makefiles, and shell
                 scripts required to compile, link, and run the example
                 programs discussed in the book.",
  keywords =     "Forte Developer; MPI; OpenMP; Sun ClusterTools; Sun
                 Solaris",
}

@Article{Gine:2002:ALT,
  author =       "Francesc Gin{\'e} and Francesc Solsona and Porfidio
                 Hern{\'a}ndez and Emilio Luque",
  title =        "Adjusting the Lengths of Time Slices when Scheduling
                 {PVM} Jobs with High Memory Requirements",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "156--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740156.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740156.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Goedecker:2002:OPF,
  author =       "Stefan Goedecker",
  title =        "Optimization and parallelization of a force field for
                 silicon using {OpenMP}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "148",
  number =       "1",
  pages =        "124--135",
  day =          "1",
  month =        oct,
  year =         "2002",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/S0010-4655(02)00466-6",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Mon Feb 13 23:41:24 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465502004666",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Gonzalez:2002:DLP,
  author =       "Marc Gonz{\'a}lez and Eduard Ayguad{\'e} and Xavier
                 Martorell and Jes{\'u}s Labarta and Phu V. Luong",
  title =        "Dual-Level Parallelism Exploitation with {OpenMP} in
                 Coastal Ocean Circulation Modeling",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2327",
  pages =        "469--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:32 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2327.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2327/23270469.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2327/23270469.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Gropp:2002:BLC,
  author =       "William Gropp",
  title =        "Building Library Components that Can Use Any {MPI}
                 Implementation",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "280--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740280.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740280.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Gropp:2002:MG,
  author =       "William Gropp and Ewing Lusk",
  title =        "{MPI} on the {Grid}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "12--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740012.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740012.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Gropp:2002:MNS,
  author =       "William Gropp",
  title =        "{MPICH2}: a New Start for {MPI} Implementations",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "7--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740007.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740007.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Hadjidoukas:2002:MOI,
  author =       "Panagiotis E. Hadjidoukas and Eleftherios D.
                 Polychronopoulos and Theodore S. Papatheodorou",
  title =        "A Modular {OpenMP} Implementation for Clusters of
                 Multiprocessors",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "5",
  number =       "2",
  pages =        "153--168",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Thu Sep 2 12:08:56 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/5/2.toc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{He:2002:MOP,
  author =       "Yun He and Chris H. Q. Ding",
  title =        "{MPI} and {OpenMP} Paradigms on Cluster of {SMP}
                 Architectures: The Vacancy Tracking Algorithm for
                 Multi-Dimensional Array Transposition",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "5",
  number =       "2",
  pages =        "117--128",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Thu Sep 2 12:08:56 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/5/2.toc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{Heikonen:2002:ILB,
  author =       "Jussi Heikonen and Kalle Eerola",
  title =        "Improving Load Balance in a Weather Code: Asynchronous
                 Output in {HIRLAM} with {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2367",
  pages =        "567--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:54 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2367.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2367/23670567.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2367/23670567.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Huang:2002:DDD,
  author =       "Wei Huang and Zhe Wang and Jie Ma",
  title =        "Design of {DMPI} on {DAWNING-3000}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "314--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740314.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740314.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Huttunen:2002:MCC,
  author =       "Pentti Huttunen and Jouni Ikonen and Jari Porras",
  title =        "{MPIT} --- Communication\slash Computation Paradigm
                 for Networks of {SMP} Workstations",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2367",
  pages =        "160--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Sep 12 08:36:35 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2367.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2367/23670160.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2367/23670160.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Isabel:2002:CMO,
  author =       "Isabel Dorta and Coromoto Le{\'o}n and Casiano
                 Rodr{\'\i}guez",
  title =        "Comparing {MPI} and {OpenMP} implementations of the
                 $0$-$1$ Knapsack Problem",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "5",
  number =       "2",
  pages =        "129--137",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Thu Sep 2 12:08:56 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/5/2.toc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{Islam:2002:IAC,
  author =       "Mohammad Towhidul Islam and Parimala Thulasiraman and
                 Ruppa K. Thulasiram",
  title =        "Implementation of Ant Colony Optimization Algorithm
                 for Mobile Ad hoc Network Applications: {OpenMP}
                 Experiences",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "5",
  number =       "2",
  pages =        "177--191",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Thu Sep 2 12:08:56 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/5/2.toc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{Iwama:2002:PLS,
  author =       "Kazuo Iwama and Daisuke Kawai and Shuichi Miyazaki and
                 Yasuo Okabe and Jun Umemoto",
  title =        "Parallelizing local search for {CNF} satisfiability
                 using vectorization and {PVM}",
  journal =      j-ACM-J-EXP-ALGORITHMICS,
  volume =       "7",
  pages =        "2--2",
  month =        "????",
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/944618.944620",
  ISSN =         "1084-6654",
  bibdate =      "Mon Oct 6 16:04:20 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "The purpose of this paper is to speed up the local
                 search algorithm for the CNF Satisfiability problem.
                 Our basic strategy is to run some 10$^5$ independent
                 search paths simultaneously using PVM on a vector
                 supercomputer VPP800, which consists of 40 vector
                 processors. Using the above parallelization and
                 vectorization together with some improvement of data
                 structure, we obtained 600-times speedup in terms of
                 the number of flips the local search can make per
                 second, compared to the original GSAT by Selman and
                 Kautz. We ran our parallel GSAT for benchmark instances
                 and compared the running time with those of existing
                 SAT programs. We could observe an apparent benefit of
                 parallelization: Especially, we were able to solve two
                 instances that have never been solved before this
                 paper. We also tested parallel local search for the SAT
                 encoding of the class scheduling problem. Again we were
                 able to get almost the best answer in reasonable
                 time.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal of Experimental Algorithmics",
  keywords =     "algorithms; CNF Satisfiability; distributed computing;
                 experimentation; local search algorithms;
                 parallelization; PVM; vector supercomputer;
                 vectorization",
}

@Article{Kabir:2002:DIS,
  author =       "Yacine Kabir and A. Belhadj-Aissa",
  title =        "Distributed Image Segmentation System by a
                 Multi-agents Approach (Under {PVM} Environment)",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "138--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740138.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740138.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Karniadakis:2002:DLP,
  author =       "Suchuan Dong and George Em Karniadakis",
  title =        "Dual-Level Parallelism for Deterministic and
                 Stochastic {CFD} Problems",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2002.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap137.pdf",
  abstract =     "A hybrid two-level parallelism using MPI/OpenMP is
                 implemented in the general-purpose spectral/hp element
                 CFD code NekTar to take advantage of the hierarchical
                 structures arising in deterministic and stochastic CFD
                 problems. We take a coarse grain approach to
                 shared-memory parallelism with OpenMP and employ a
                 workload-splitting scheme that can reduce the OpenMP
                 synchronizations to the minimum. The hybrid
                 implementation shows good scalability with respect to
                 both the problem size and the number of processors in
                 case of a fixed problem size. With the same number of
                 processors, the hybrid model with 2 (or 4) OpenMP
                 threads per MPI process is observed to perform better
                 than pure MPI and pure OpenMP on the NCSA SGI Origin
                 2000, while the pure MPI model performs the best on the
                 IBM SP3 at SDSC and on the Compaq Alpha cluster at PSC.
                 A key new result is that the use of threads facilitates
                 effectively p-refinement, which is crucial to adaptive
                 discretization using high-order methods.",
  acknowledgement = ack-nhfb,
}

@Book{Karniadakis:2002:PSC,
  author =       "George Em Karniadakis and Robert M. Kirby",
  title =        "Parallel Scientific Computing in {C++} and {MPI}: a
                 Seamless Approach to Parallel Algorithms",
  publisher =    pub-CAMBRIDGE,
  address =      pub-CAMBRIDGE:adr,
  pages =        "xi + 616",
  year =         "2002",
  ISBN =         "0-521-52080-0 (paperback), 0-521-81754-4 (hardcover)",
  ISBN-13 =      "978-0-521-52080-5 (paperback), 978-0-521-81754-7
                 (hardcover)",
  LCCN =         "QA76.58 .K37 2003",
  bibdate =      "Wed Aug 27 06:43:56 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  price =        "US\$50.00 (paperback), US\$130.00 (hardcover)",
  URL =          "http://www.loc.gov/catdir/description/cam031/2002034805.html;
                 http://www.loc.gov/catdir/samples/cam033/2002034805.html;
                 http://www.loc.gov/catdir/toc/cam031/2002034805.html",
  acknowledgement = ack-nhfb,
  subject =      "Parallel processing (Electronic computers); C++
                 (Computer program language); Data transmission
                 systems",
}

@Article{Kasprzyk:2002:APV,
  author =       "Leszek Kasprzyk and Ryszard Nawrowski and Andrzej
                 Tomczewski",
  title =        "Application of a Parallel Virtual Machine for the
                 Analysis of a Luminous Field",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "122--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740122.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740122.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Keppens:2002:OPM,
  author =       "R. Keppens and G. T{\'o}th",
  title =        "{OpenMP} Parallelism for Multi-dimensional
                 Grid-Adaptive Magnetohydrodynamic Simulations",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2329",
  pages =        "940--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:34 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2329.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2329/23290940.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2329/23290940.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Krawezik:2002:SOV,
  author =       "G{\'e}raud Krawezik and Guillaume All{\'e}on and
                 Franck Cappello",
  title =        "{SPMD OpenMP} versus {MPI} on a {IBM SMP} for 3
                 Kernels of the {NAS} Benchmarks",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2327",
  pages =        "425--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:32 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2327.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2327/23270425.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2327/23270425.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Krysztop:2002:IFP,
  author =       "Bartosz Krysztop and Henryk Krawczyk",
  title =        "Improving Flexibility and Performance of {PVM}
                 Applications by Distributed Partial Evaluation",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "376--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740376.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740376.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Langlais:2002:SSM,
  author =       "M. Langlais and G. Latu and J. Roman and P. Silan",
  title =        "Stochastic Simulation of a Marine Host-Parasite System
                 Using a Hybrid {MPI\slash OpenMP} Programming",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2400",
  pages =        "436--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:10:14 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2400.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002c.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2400/24000436.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2400/24000436.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Lazzarino:2002:PBP,
  author =       "Oscar Lazzarino and Andrea Sanna and Claudio Zunino
                 and Fabrizio Lamberti",
  title =        "A {PVM}-Based Parallel Implementation of the {REYES}
                 Image Rendering Architecture",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "165--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740165.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740165.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Lee:2002:IPC,
  author =       "Nung Kion Lee and David Taniar and J. Wenny Rahayu and
                 Mafruz Zaman Ashrafi",
  title =        "Implementation of Parallel Collection Equi-Join Using
                 {MPI}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2367",
  pages =        "217--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:54 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2367.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2367/23670217.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2367/23670217.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Lopez:2002:ESM,
  author =       "F{\'e}lix C{\'e}sar Garc{\'\i}a L{\'o}pez and Nieves
                 Luz Fr{\'\i}as Arrocha",
  title =        "Expanding the Synchronization Model for {OpenMP}",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "5",
  number =       "2",
  pages =        "169--175",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Thu Sep 2 12:08:56 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/5/2.toc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{Luecke:2002:DDM,
  author =       "Glenn R. Luecke and Yan Zou and James Coyle and Jim
                 Hoekstra and Marina Kraeva",
  title =        "Deadlock detection in {MPI} programs",
  journal =      j-CCPE,
  volume =       "14",
  number =       "11",
  pages =        "911--932",
  day =          "25",
  month =        aug,
  year =         "2002",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.701",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Nov 9 12:24:19 MST 2002",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/97519209/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=97519209{\&}PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Macias:2002:SEA,
  author =       "Elsa M. Mac{\'\i}as and Alvaro Su{\'a}rez",
  title =        "Solving Engineering Applications with {LAMGAC} over
                 {MPI-2}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "130--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740130.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740130.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Mahinthakumar:2002:HMO,
  author =       "G. Mahinthakumar and F. Saied",
  title =        "A Hybrid {MPI-OpenMP} Implementation of an Implicit
                 Finite-Element Code on Parallel Architectures",
  journal =      j-IJHPCA,
  volume =       "16",
  number =       "4",
  pages =        "371--393",
  month =        "Winter",
  year =         "2002",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Nov 28 06:52:13 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Marcos:2002:DDP,
  author =       "Carlos de la Fuente Marcos and Pierre Barge and
                 Ra{\'u}l de la Fuente Marcos",
  title =        "Dust Dynamics in Protoplanetary Disks: Parallel
                 Computing with {PVM}",
  journal =      j-J-COMPUT-PHYS,
  volume =       "176",
  number =       "2",
  pages =        "276--294",
  day =          "1",
  month =        mar,
  year =         "2002",
  CODEN =        "JCTPAH",
  DOI =          "https://doi.org/10.1006/jcph.2001.6978",
  ISSN =         "0021-9991 (print), 1090-2716 (electronic)",
  ISSN-L =       "0021-9991",
  bibdate =      "Mon Jan 2 22:12:14 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcomputphys2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0021999101969785",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Physics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00219991",
}

@Article{Marowka:2002:ISI,
  author =       "Ami Marowka",
  title =        "Introduction to the special issue: {OpenMP}:
                 Experiences, Implementations and Applications",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "5",
  number =       "2",
  pages =        "v--v",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Thu Sep 2 12:08:56 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/5/2.toc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{Michailidis:2002:PSL,
  author =       "Panagiotis D. Michailidis and Konstantinos G.
                 Margaritis",
  title =        "A Performance Study of Load Balancing Strategies for
                 Approximate String Matching on an {MPI} Heterogeneous
                 System Environment",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "432--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740432.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740432.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Mohr:2002:DPP,
  author =       "Bernd Mohr and Allen D. Malony and Sameer Shende and
                 Felix Wolf",
  title =        "Design and Prototype of a Performance Tool Interface
                 for {OpenMP}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "23",
  number =       "1",
  pages =        "105--128",
  month =        aug,
  year =         "2002",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jan 14 07:25:20 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.wkap.nl/journalhome.htm/0920-8542",
  URL =          "http://ipsapp008.kluweronline.com/content/getfile/5189/37/8/abstract.htm;
                 http://ipsapp008.kluweronline.com/content/getfile/5189/37/8/fulltext.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Muller:2002:SMB,
  author =       "Matthias S. M{\"u}ller",
  title =        "A Shared Memory Benchmark in {OpenMP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2327",
  pages =        "380--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:32 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2327.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2327/23270380.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2327/23270380.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Nakajima:2002:PISa,
  author =       "Kengo Nakajima and Hiroshi Okuda",
  title =        "Parallel Iterative Solvers for Unstructured Grids
                 Using an {OpenMP\slash MPI} Hybrid Programming Model
                 for the {GeoFEM} Platform on {SMP} Cluster
                 Architectures",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2327",
  pages =        "437--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:32 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2327.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2327/23270437.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2327/23270437.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Nakajima:2002:PISb,
  author =       "Kengo Nakajima and Hiroshi Okuda",
  title =        "Parallel iterative solvers for unstructured grids
                 using a directive\slash {MPI} hybrid programming model
                 for the {GeoFEM} platform on {SMP} cluster
                 architectures",
  journal =      j-CCPE,
  volume =       "14",
  number =       "6--7",
  pages =        "411--429",
  month =        may # "\slash " # jun,
  year =         "2002",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.622",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Nov 9 12:24:19 MST 2002",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/94515747/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=94515747{\&}PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Nakano:2002:SCG,
  author =       "Hirofumi Nakano and Kazuhisa Ishizaka and Motoki Obata
                 and Keiji Kimura and Hironori Kasahara",
  title =        "Static Coarse Grain Task Scheduling with Cache
                 Optimization Using {OpenMP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2327",
  pages =        "479--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:32 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2327.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2327/23270479.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2327/23270479.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Norden:2002:OVM,
  author =       "M. Nord{\'e}n and S. Holmgren and M. Thun{\'e}",
  title =        "{OpenMP} versus {MPI} for {PDE} Solvers Based on
                 Regular Sparse Numerical Operators",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2331",
  pages =        "681--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:36 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2331.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002b.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2331/23310681.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2331/23310681.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Ong:2002:MRS,
  author =       "Emil Ong",
  title =        "{MPI Ruby}: Scripting in a Parallel Environment",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "4",
  number =       "4",
  pages =        "78--82",
  month =        jul # "\slash " # aug,
  year =         "2002",
  CODEN =        "CSENFA",
  ISSN =         "1521-9615 (print), 1558-366X (electronic)",
  ISSN-L =       "1521-9615",
  bibdate =      "Sat Jan 3 18:25:00 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://csdl.computer.org/comp/mags/cs/2002/04/c4078abs.htm;
                 http://csdl.computer.org/dl/mags/cs/2002/04/c4078.htm;
                 http://csdl.computer.org/dl/mags/cs/2002/04/c4078.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

@InProceedings{Phillips:2002:NBS,
  author =       "James C. Phillips and Gengbin Zheng and Sameer Kumar
                 and Laxmikant V. Kal{\'e}",
  title =        "{NAMD}: Biomolecular Simulation on Thousands of
                 Processors",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap277.pdf",
  abstract =     "NAMD is a fully featured, production molecular
                 dynamics program for high performance simulation of
                 large biomolecular systems. We have previously, at
                 SC2000, presented scaling results for simulations with
                 cutoff electrostatics on up to 2048 processors of the
                 ASCI Red machine, achieved with an object-based hybrid
                 force and spatial decomposition scheme and an
                 aggressive measurement-based predictive load balancing
                 framework. We extend this work by demonstrating similar
                 scaling on the much faster processors of the PSC
                 Lemieux Alpha cluster, and for simulations employing
                 efficient (order N log N) particle mesh Ewald full
                 electrostatics. This unprecedented scalability in a
                 biomolecular simulation code has been attained through
                 latency tolerance, adaptation to multiprocessor nodes,
                 and the direct use of the Quadrics Elan library in
                 place of MPI by the Charm++/Converse parallel runtime
                 system.",
  acknowledgement = ack-nhfb,
}

@Article{Piriyakumar:2002:EFI,
  author =       "Douglas Antony Louis Piriyakumar and Paul Levi and
                 Rolf Rabenseifner",
  title =        "Enhanced File Interoperability with Parallel {MPI}
                 File-{I/O} in Image Processing",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "174--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740174.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740174.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Plachetka:2002:QTS,
  author =       "Tomas Plachetka",
  title =        "(Quasi-) Thread-Safe {PVM} and (Quasi-) Thread-Safe
                 {MPI} without Active Polling",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "296--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740296.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740296.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Prabhakar:2002:PCB,
  author =       "Achal Prabhakar and Vladimir Getov and Barbara
                 Chapman",
  title =        "Performance Comparisons of Basic {OpenMP} Constructs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2327",
  pages =        "413--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:32 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2327.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2327/23270413.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2327/23270413.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Rauber:2002:LSH,
  author =       "Thomas Rauber and Gudula R{\"u}nger",
  title =        "Library Support for Hierarchical Multi-Processor
                 Tasks",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap176.pdf",
  abstract =     "The paper considers the modular programming with
                 hierarchically structured multi-processor tasks on top
                 of SPMD tasks for distributed memory machines. The
                 parallel execution requires a corresponding
                 decomposition of the set of processors into a
                 hierarchical group structure onto which the tasks are
                 mapped. This results in a multi-level group SPMD
                 computation model with varying processor group
                 structures. The advantage of this kind of mixed task
                 and data parallelism is a potential to reduce the
                 communication overhead and to increase scalability. We
                 present a runtime library to support the coordination
                 of hierarchically structured multi-processor tasks. The
                 library exploits an extended parallel group SPMD
                 programming model and manages the entire task execution
                 including the dynamic hierarchy of processor groups.
                 The library is built on top of MPI, has an easy-to-use
                 interface, and leads to only a marginal overhead while
                 allowing static planning and dynamic restructuring.
                 Keywords: mixed task and data parallelism,
                 multiprocessor tasks, multilevel group SPMD,
                 hierarchical decomposition of processor sets, library
                 support, distributed memory",
  acknowledgement = ack-nhfb,
}

@Article{Reussner:2002:SCB,
  author =       "Ralf Reussner and Peter Sanders and Jesper Larsson
                 Tr{\"a}ff",
  title =        "{SKaMPI}: a comprehensive benchmark for public
                 benchmarking of {MPI}",
  journal =      j-SCI-PROG,
  volume =       "10",
  number =       "1",
  pages =        "55--65",
  year =         "2002",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Sat Oct 26 14:52:27 MDT 2002",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=9ejnuvwuvby9737jte27%26referrer=parent%26backto=issue%2C6%2C9%3Bjournal%2C2%2C12%3Blinkingpublicationresults%2C1%2C1",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Sack:2002:FMB,
  author =       {Paul Sack and Anne C. Elster},
  title =        {Fast {MPI} Broadcasts through Reliable Multicasting},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {2367},
  pages =        {445--??},
  year =         {2002},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Sep 10 19:09:54 MDT 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2367.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2367/23670445.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2367/23670445.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Selikhov:2002:MCC,
  author =       {Anton Selikhov and George Bosilca and Cecile Germain
                 and Gilles Fedak and Franck Cappello},
  title =        {{MPICH-CM}: a Communication Library Design for a {P2P
                 MPI} Implementation},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {2474},
  pages =        {323--??},
  year =         {2002},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Nov 30 20:57:35 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer.de/link/service/series/0558/bibs/2474/24740323.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740323.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Shires:2002:EHM,
  author =       {D. Shires and R. Mohan},
  title =        {An Evaluation of {HPF} and {MPI} Approaches and
                 Performance in Unstructured Finite Element
                 Simulations},
  journal =      {Journal of Mathematical Modelling and Algorithms},
  volume =       {1},
  number =       {3},
  publisher =    {Kluwer Academic Publishers, Dordrecht, The
                 Netherlands},
  pages =        {153--167},
  year =         {2002},
  CODEN =        {????},
  ISSN =         {1570-1166},
  bibdate =      {Sat Dec 7 09:42:43 MST 2002},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib; Ingenta
                 database},
  acknowledgement = ack-nhfb,
  pagecount =    {15},
}

@InProceedings{Sistare:2002:UHP,
  author =       {Steven J. Sistare and Christopher J. Jackson},
  title =        {Ultra-High Performance Communication with {MPI} and
                 the {Sun Fire(\TM)} Link Interconnect},
  crossref =     {IEEE:2002:STI},
  pages =        {??--??},
  year =         {2002},
  bibdate =      {Wed Nov 26 07:34:20 2003},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sc-2002.org/paperpdfs/pap.pap142.pdf},
  abstract =     {We present a new low-latency system area network that
                 provides the ultra-high bandwidth needed to fuse a
                 collection of large SMP servers into a capability
                 cluster. The network adapter exports a remote shared
                 memory (RSM) model that supports low latency kernel
                 bypass messaging. The Sun\TM{} MPI library uses the RSM
                 interface to implement a highly efficient
                 memory-to-memory messaging protocol in which the
                 library directly manages buffers and data structures in
                 remote memory. This allows flexible allocation of
                 buffer space to active connections, while avoiding
                 resource contention that could otherwise increase
                 latencies. We discuss the characteristics of the
                 interconnect, describe the MPI protocols, and measure
                 the performance of a number of MPI benchmarks. Our
                 results include MPI inter-node bandwidths of almost 3
                 Gigabytes per second and MPI ping-pong latencies as low
                 as 3.7 microseconds.},
  acknowledgement = ack-nhfb,
  keywords =     {interconnects; kernel bypass; MPI; performance
                 evaluation; remote shared memory; SAN},
}

@Article{Smyk:2002:AMM,
  author =       {Adam Smyk and Marek Tudruj},
  title =        {Application of Mixed {{\em MPI OpenMP\/}} Programming
                 in a Multi {SMP} Cluster Computer},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {2328},
  pages =        {288--??},
  year =         {2002},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Thu Sep 12 08:34:49 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2328.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2328/23280288.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2328/23280288.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

%%% NB: the emphasized names in the title below carry an extra outer
%%% brace level on purpose.  A depth-0 brace group that starts with a
%%% backslash is a BibTeX ``special character'', and title-lowercasing
%%% styles convert its text (MPI would be typeset as mpi).
@Article{Smyk:2002:OMP,
  author =       "Adam Smyk and Marek Tudruj",
  title =        "{{\em OpenMP\/}} / {{\em MPI\/}} Programming in a
                 Multi-cluster System Based on Shared Memory\slash
                 Message Passing Communication",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2326",
  pages =        "241--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:32 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2326.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2326/23260241.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2326/23260241.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Stpiczynski:2002:PPO,
  author =       {Przemyslaw Stpiczynski},
  title =        {{Parallel Programming in OpenMP} Helps Novices: a
                 review of {Parallel Programming in OpenMP} by {Rohit
                 Chandra}, {Leonardo Dagum}, {Dave Kohr}, {Dror Maydan},
                 {Jeff McDonald}, and {Ramesh Menon}},
  journal =      j-IEEE-DISTRIB-SYST-ONLINE,
  volume =       {3},
  number =       {8},
  year =         {2002},
  ISSN =         {1541-4922 (print), 1558-1683 (electronic)},
  ISSN-L =       {1541-4922},
  bibdate =      {Wed Oct 23 17:47:56 2002},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://dsonline.computer.org/0208/d/bks_a.htm},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Distributed Systems Online},
}

@Article{Takahashi:2002:PEH,
  author =       {Daisuke Takahashi and Mitsuhisa Sato and Taisuke
                 Boku},
  title =        {Performance Evaluation of the {Hitachi SR8000} Using
                 {OpenMP} Benchmarks},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {2327},
  pages =        {390--??},
  year =         {2002},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Sep 10 19:09:32 MDT 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2327.htm;
                 http://www.math.utah.edu/pub/tex/bib/lncs2002a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2327/23270390.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2327/23270390.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Thakur:2002:ONA,
  author =       {Rajeev Thakur and William Gropp and Ewing Lusk},
  title =        {Optimizing noncontiguous accesses in {MPI-IO}},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {28},
  number =       {1},
  pages =        {83--105},
  month =        jan,
  year =         {2002},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Fri Feb 22 16:52:43 MST 2002},
  bibsource =    {http://www.elsevier.com/locate/issn/01678191;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.elsevier.com/gej-ng/10/35/21/60/27/32/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/35/21/60/27/32/00001686.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Tian:2002:IOC,
  author =       "Xinmin Tian and Aart Bik and Milind Girkar and Paul
                 Grey and Hideki Saito and Ernesto Su",
  title =        "{Intel\reg{}} {OpenMP C++\slash Fortran} Compiler for
                 Hyper-Threading Technology: Implementation and
                 Performance",
  journal =      j-INTEL-TECH-J,
  volume =       "6",
  number =       "1",
  pages =        "36--46",
  month =        feb,
  year =         "2002",
  ISSN =         "1535-766X",
  bibdate =      "Thu Feb 28 15:24:21 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/intel-tech-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://developer.intel.com/technology/itj/2002/volume06issue01/vol6iss1_hyper_threading_technology.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Intel Technology Journal",
}

@Article{Traff:2002:IMA,
  author =       {Jesper Larsson Tr{\"a}ff},
  title =        {Improved {MPI} All-to-all Communication on a {Giganet
                 SMP} Cluster},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {2474},
  pages =        {392--??},
  year =         {2002},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Nov 30 20:57:35 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer.de/link/service/series/0558/bibs/2474/24740392.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740392.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@InProceedings{Traff:2002:IMP,
  author =       "Jesper Larsson Tr{\"a}ff",
  title =        "Implementing the {MPI} Process Topology Mechanism",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap122.pdf",
  abstract =     "The topology functionality of the Message Passing
                 Interface (MPI) provides a portable,
                 architecture-independent means for adapting application
                 programs to the communication architecture of the
                 target hardware. However, current MPI implementations
                 rarely go beyond the most trivial implementation, and
                 simply performs no process remapping. We discuss the
                 potential of the topology mechanism for systems with a
                 hierarchical communication architecture like clusters
                 of SMP nodes. The MPI topology functionality is a weak
                 mechanism, and we argue about some of its shortcomings.
                 We formulate the topology optimization problem as a
                 graph embedding problem, and show that for hierarchical
                 systems it can be solved by graph partitioning. We
                 state the properties of a new heuristic for solving
                 both the embedding problem and the ``easier'' graph
                 partitioning problem. The graph partitioning based
                 framework has been fully implemented in MPI/SX for the
                 NEC SX-series of parallel vector computers. MPI/SX is
                 thus one of very few MPI implementations with a
                 non-trivial topology functionality. On a 4 node NEC
                 SX-6 significant communication performance improvements
                 are achieved with synthetic MPI benchmarks.",
  acknowledgement = ack-nhfb,
}

@Article{Truong:2002:PAM,
  author =       {Hong-Linh Truong and Thomas Fahringer and Michael
                 Geissler and Georg Madsen},
  title =        {Performance Analysis for {MPI} Applications with
                 {SCALEA}},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {2474},
  pages =        {421--??},
  year =         {2002},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Nov 30 20:57:35 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer.de/link/service/series/0558/bibs/2474/24740421.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740421.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Uehara:2002:MBP,
  author =       {Hitoshi Uehara and Masanori Tamura and Mitsuo
                 Yokokawa},
  title =        {An {MPI} Benchmark Program Library and Its Application
                 to the {Earth} Simulator},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {2327},
  pages =        {219--??},
  year =         {2002},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Tue Sep 10 19:09:32 MDT 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2327.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer-ny.com/link/service/series/0558/bibs/2327/23270219.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2327/23270219.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@InProceedings{Vadhiyar:2002:PMS,
  author =       "Sathish S. Vadhiyar and Graham E. Fagg and Jack J.
                 Dongarra",
  title =        "Performance Modeling for Self Adapting Collective
                 Communications for {MPI}",
  crossref =     "Oldehoeft:2002:SIS",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Tue Feb 26 06:44:44 2002",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.netlib.org/utk/people/JackDongarra/PAPERS/coll-lacsi-2001.pdf",
  acknowledgement = ack-nhfb,
  keywords =     "Los Alamos Computer Science Institute (LACSI)",
  xxbooktitle =  "LACSI Symposium 2001, October 15--18, Eldorado Hotel,
                 Santa Fe, NM",
}

@Article{Vetter:2002:DSP,
  author =       {Jeffrey Vetter},
  title =        {Dynamic statistical profiling of communication
                 activity in distributed applications},
  journal =      j-SIGMETRICS,
  volume =       {30},
  number =       {1},
  pages =        {240--250},
  month =        jun,
  year =         {2002},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/511334.511364},
  ISSN =         {0163-5999 (print), 1557-9484 (electronic)},
  ISSN-L =       {0163-5999},
  bibdate =      {Thu Jun 26 11:38:22 MDT 2008},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {Performance analysis of communication activity for a
                 terascale application with traditional message tracing
                 can be overwhelming in terms of overhead, perturbation,
                 and storage. We propose a novel alternative that
                 enables dynamic statistical profiling of an
                 application's communication activity using message
                 sampling. We have implemented an operational prototype,
                 named PHOTON, and our evidence shows that this new
                 approach can provide an accurate, low-overhead,
                 tractable alternative for performance analysis of
                 communication activity. PHOTON consists of two
                 components: a Message Passing Interface (MPI) profiling
                 layer that implements sampling and analysis, and a
                 modified MPI runtime that appends a small but necessary
                 amount of information to individual messages. More
                 importantly, this alternative enables an assortment of
                 runtime analysis techniques so that, in contrast to
                 post-mortem, trace-based techniques, the raw
                 performance data can be jettisoned immediately after
                 analysis. Our investigation shows that message sampling
                 can reduce overhead to imperceptible levels for many
                 applications. Experiments on several applications
                 demonstrate the viability of this approach. For
                 example, with one application, our technique reduced
                 the analysis overhead from 154\% for traditional
                 tracing to 6\% for statistical profiling. We also
                 evaluate different sampling techniques in this
                 framework. The coverage of the sample space provided by
                 purely random sampling is superior to counter- and
                 timer-based sampling. Also, PHOTON's design reveals
                 that frugal modifications to the MPI runtime system
                 could facilitate such techniques on production
                 computing systems, and it suggests that this sampling
                 technique could execute continuously for long-running
                 applications.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGMETRICS Performance Evaluation Review},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J618},
}

@InProceedings{Vetter:2002:EPE,
  author =       {Jeffrey S. Vetter and Andy Yoo},
  title =        {An Empirical Performance Evaluation of Scalable
                 Scientific Applications},
  crossref =     {IEEE:2002:STI},
  pages =        {??--??},
  year =         {2002},
  bibdate =      {Wed Nov 26 07:34:20 2003},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sc-2002.org/paperpdfs/pap.pap222.pdf},
  abstract =     {We investigate the scalability, architectural
                 requirements, and performance characteristics of eight
                 scalable scientific applications. Our analysis is
                 driven by empirical measurements using statistical and
                 tracing instrumentation for both communication and
                 computation. Based on these measurements, we refine our
                 analysis into precise explanations of the factors that
                 influence performance and scalability for each
                 application; we distill these factors into common
                 traits and overall recommendations for both users and
                 designers of scalable platforms. Our experiments
                 demonstrate that some traits, such as improvements in
                 the scaling and performance of MPI's collective
                 operations, will benefit most applications. We also
                 find specific characteristics of some applications that
                 limit performance. For example, one application's
                 intensive use of a 64-bit, floating-point divide
                 instruction, which has high latency and is not
                 pipelined on the POWER3, limits the performance of the
                 application's primary computation.},
  acknowledgement = ack-nhfb,
}

@Article{Wallcraft:2002:CCA,
  author =       {Alan J. Wallcraft},
  title =        {A Comparison of {Co-Array Fortran} and {OpenMP
                 Fortran} for {SPMD} Programming},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {22},
  number =       {3},
  pages =        {231--250},
  month =        jul,
  year =         {2002},
  CODEN =        {JOSUED},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Wed Jan 14 07:25:19 MST 2004},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.wkap.nl/journalhome.htm/0920-8542},
  URL =          {http://ipsapp008.kluweronline.com/content/getfile/5189/36/1/abstract.htm;
                 http://ipsapp008.kluweronline.com/content/getfile/5189/36/1/fulltext.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
}

@Article{Wang:2002:OPG,
  author =       {Ping Wang},
  title =        {{OpenMP} programming for a global inverse model},
  journal =      j-SCI-PROG,
  volume =       {10},
  number =       {3},
  pages =        {253--261},
  year =         {2002},
  CODEN =        {SCIPEV},
  ISSN =         {1058-9244 (print), 1875-919X (electronic)},
  ISSN-L =       {1058-9244},
  bibdate =      {Sat Oct 26 15:08:19 2002},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Scientific Programming},
  journal-URL =  {http://iospress.metapress.com/content/1058-9244},
}

@Article{Worsch:2002:BCM,
  author =       {Thomas Worsch and Ralf Reussner and Werner Augustin},
  title =        {On Benchmarking Collective {MPI} Operations},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {2474},
  pages =        {271--??},
  year =         {2002},
  CODEN =        {LNCSD9},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  bibdate =      {Sat Nov 30 20:57:35 MST 2002},
  bibsource =    {http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer.de/link/service/series/0558/bibs/2474/24740271.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740271.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Lecture Notes in Computer Science},
}

@Article{Addison:2003:OIA,
  author =       {C. Addison and Y. Ren and M. van Waveren},
  title =        {{OpenMP} issues arising in the development of parallel
                 {BLAS} and {LAPACK} libraries},
  journal =      j-SCI-PROG,
  volume =       {11},
  number =       {2},
  pages =        {95--104},
  year =         {2003},
  CODEN =        {SCIPEV},
  ISSN =         {1058-9244 (print), 1875-919X (electronic)},
  ISSN-L =       {1058-9244},
  bibdate =      {Mon Jan 12 06:28:15 MST 2004},
  bibsource =    {http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Scientific Programming},
  journal-URL =  {http://iospress.metapress.com/content/1058-9244},
}

@TechReport{Amestoy:2003:IIMa,
  author =       {Patrick R. Amestoy and Iain S. Duff and Jean-Yves
                 L'Excellent and Xiaoye S. Li},
  title =        {Impact of the implementation of {MPI} point-to-point
                 communications on the performance of two general sparse
                 solvers},
  type =         {Report},
  number =       {TR/PA/03/14 and RR-4372 and LBNL-48968 and
                 RT/APO/01/4},
  institution =  inst-CERFACS,
  address =      inst-CERFACS:adr,
  pages =        {????},
  year =         {2003},
  bibdate =      {Tue Jan 03 06:25:11 2006},
  bibsource =    {http://www.math.utah.edu/pub/bibnet/authors/d/duff-iain-s.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Amestoy:2003:IIMb,
  author =       {Patrick R. Amestoy and Iain S. Duff and Jean-Yves
                 L'Excellent and Xiaoye S. Li},
  title =        {Impact of the implementation of {MPI} point-to-point
                 communications on the performance of two general sparse
                 solvers},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {29},
  number =       {7},
  pages =        {833--849},
  month =        jul,
  year =         {2003},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Wed Dec 24 09:07:26 MST 2003},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Anonymous:2003:MNIc,
  author =       {Anonymous},
  title =        {Micro News: {IBM} ups the ante in silicon transistor
                 speed; New benchmark suite based on high-performance
                 computing applications, {MPI} and {OpenMP} [{SPEC
                 HPC2002}]; {EU} {OKs} {Hitachi}, {Mitsubishi Electric}
                 semiconductor joint venture; {Intel} launches {Pentium
                 4} at {3.06 GHz}; {TSMC} unveils viable 25nm
                 transistors},
  journal =      j-IEEE-MICRO,
  volume =       {23},
  number =       {1},
  pages =        {6--6, 87},
  month =        jan # {\slash } # feb,
  year =         {2003},
  CODEN =        {IEMIDZ},
  ISSN =         {0272-1732 (print), 1937-4143 (electronic)},
  ISSN-L =       {0272-1732},
  bibdate =      {Wed Apr 23 18:57:10 MDT 2003},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://dlib.computer.org/mi/books/mi2003/pdf/m1006.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Micro},
  journal-URL =  {http://www.computer.org/csdl/mags/mi/index.html},
}

@Article{Barekas:2003:MAO,
  author =       {Vasileios K. Barekas and Panagiotis E. Hadjidoukas and
                 Eleftherios D. Polychronopoulos and others},
  title =        {A Multiprogramming Aware {OpenMP} Implementation},
  journal =      j-SCI-PROG,
  volume =       {11},
  number =       {2},
  pages =        {133--141},
  year =         {2003},
  CODEN =        {SCIPEV},
  ISSN =         {1058-9244 (print), 1875-919X (electronic)},
  ISSN-L =       {1058-9244},
  bibdate =      {Mon Jan 12 06:28:15 MST 2004},
  bibsource =    {http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Scientific Programming},
  journal-URL =  {http://iospress.metapress.com/content/1058-9244},
}

@InProceedings{Bouteiller:2003:MVF,
  author =       "Aurelien Bouteiller and Franck Cappello and Thomas
                 Herault and Geraud Krawezik and Pierre Lemarinier and
                 Frederic Magniette",
  title =        "{MPICH-V2}: a Fault Tolerant {MPI} for Volatile Nodes
                 based on Pessimistic Sender Based Message Logging",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#1;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap209.pdf",
  abstract =     "Execution of MPI applications on clusters and Grid
                 deployments suffering from node and network failures
                 motivates the use of fault tolerant MPI
                 implementations. We present MPICH-V2 (the second
                 protocol of MPICH-V project), an automatic fault
                 tolerant MPI implementation using an innovative
                 protocol that removes the most limiting factor of the
                 pessimistic message logging approach: reliable logging
                 of in transit messages. MPICH-V2 relies on
                 uncoordinated checkpointing, sender based message
                 logging and remote reliable logging of message logical
                 clocks. This paper presents the architecture of
                 MPICH-V2, its theoretical foundation and the
                 performance of the implementation. We compare MPICH-V2
                 to MPICH-V1 and MPICH-P4 evaluating (a) its
                 point-to-point performance, (b) the performance for the
                 NAS benchmarks, (c) the application performance when
                 many faults occur during the execution. Experimental
                 results demonstrate that MPICH-V2 provides performance
                 close to MPICH-P4 for applications using large messages
                 while reducing dramatically the number of reliable
                 nodes compared to MPICH-V1.",
  acknowledgement = ack-nhfb,
}

@Article{Brightwell:2003:DIP,
  author =       "Ron Brightwell and Rolf Riesen and Arthur B. Maccabe",
  title =        "Design, Implementation, and Performance of {MPI} on
                 {Portals 3.0}",
  journal =      j-IJHPCA,
  volume =       "17",
  number =       "1",
  pages =        "7--20",
  month =        "Spring",
  year =         "2003",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Nov 28 06:52:13 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Briguglio:2003:PPM,
  author =       {Sergio Briguglio and Beniamino {Di Martino} and
                 Gregorio Vlad},
  title =        {A performance-prediction model for {PIC} applications
                 on clusters of Symmetric MultiProcessors: Validation
                 with hierarchical {HPF $+$ OpenMP} implementation},
  journal =      j-SCI-PROG,
  volume =       {11},
  number =       {2},
  pages =        {159--176},
  year =         {2003},
  CODEN =        {SCIPEV},
  ISSN =         {1058-9244 (print), 1875-919X (electronic)},
  ISSN-L =       {1058-9244},
  bibdate =      {Mon Jan 12 06:28:15 MST 2004},
  bibsource =    {http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/hpfortran.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Scientific Programming},
  journal-URL =  {http://iospress.metapress.com/content/1058-9244},
}

@Article{Bronevetsky:2003:AAL,
  author =       "Greg Bronevetsky and Daniel Marques and Keshav Pingali
                 and Paul Stodghill",
  title =        "Automated application-level checkpointing of {MPI}
                 programs",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "10",
  pages =        "84--94",
  month =        oct,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 22 16:52:42 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Carson:2003:CGU,
  author = {Brett Carson and Robert Murison and Ian A. Mason},
  title = {Computational Gains Using {RPVM} on a {Beowulf} Cluster},
  journal = j-R-NEWS,
  fjournal = {R News: the Newsletter of the R Project},
  year = {2003},
  month = jun,
  volume = {3},
  number = {1},
  pages = {21--26},
  url = {http://CRAN.R-project.org/doc/Rnews/},
  pdf = Rnews2003-1,
  issn = {1609-3631},
  issn-l = {1609-3631},
  coden = {????},
  journal-url = {http://journal.r-project.org/},
  bibdate = {Thu Aug 13 09:25:10 2015},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/rjournal.bib},
  acknowledgement = ack-r-project,
}

@Article{Chen:2003:GMD,
  author =       "Lin Chen and Cho-Li Wang and Francis C. M. Lau",
  title =        "A Grid Middleware for Distributed {Java} Computing
                 with {MPI} Binding and Process Migration Supports",
  journal =      j-J-COMP-SCI-TECH,
  volume =       "18",
  number =       "4",
  pages =        "505--514",
  year =         "2003",
  CODEN =        "JCTEEM",
  ISSN =         "1000-9000",
  ISSN-L =       "1000-9000",
  bibdate =      "Wed Aug 27 05:49:07 MDT 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib; Ingenta
                 database",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computer Science and Technology",
}

@InProceedings{Coll:2003:SHB,
  author = {Salvador Coll and Jose Duato and Fabrizio Petrini and
    Francisco J. Mora},
  title = {Scalable Hardware-Based Multicast Trees},
  crossref = {ACM:2003:SII},
  year = {2003},
  pages = {??--??},
  url = {http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10702#2;
    http://www.sc-conference.org/sc2003/paperpdfs/pap300.pdf},
  abstract = {This paper presents an algorithm for implementing
    optimal hardware-based multicast trees, on networks
    that provide hardware support for collective
    communication. Although the proposed methodology can be
    generalized to a wide class of networks, we apply our
    methodology to the Quadrics network, a state-of-the-art
    network that provides hardware-based multicast
    communication. The proposed mechanism is intended to
    improve the performance of the collective communication
    patterns on the network, in those cases where the
    hardware support can not be directly used, for
    instance, due to some faulty nodes. This scheme
    provides significant reduction on multicast latencies
    compared to the original system primitives, which use
    multicast trees based on unicast communication. A
    backtracking algorithm to find the optimal solution to
    the problem is presented. In addition, a greedy
    algorithm is presented and shown to provide near
    optimal solutions. Finally, our experimental results
    show the good performance and scalability of the
    proposed multicast tree in comparison to the
    traditional unicast-based multicast trees. Our
    multicast mechanism doubles barrier synchronization and
    broadcasts performance when compared to the
    production-level MPI library.},
  bibdate = {Wed Nov 26 07:34:20 2003},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Cooperman:2003:UTC,
  author = {Gene Cooperman and Henri Casanova and Jim Hayes and
    Thomas Witzel},
  title = {Using {TOP-C} and {AMPIC} to port large parallel applications
    to the {Computational Grid}},
  journal = j-FUT-GEN-COMP-SYS,
  fjournal = {Future Generation Computer Systems},
  year = {2003},
  month = may,
  volume = {19},
  number = {4},
  pages = {587--596},
  issn = {0167-739X (print), 1872-7115 (electronic)},
  issn-l = {0167-739X},
  coden = {FGSEVI},
  journal-url = {http://www.sciencedirect.com/science/journal/0167739X},
  remark = {Selected papers from the IEEE/ACM International Symposium on
    Cluster Computing and the Grid, Berlin-Brandenburg Academy of
    Sciences and Humanities, Berlin, Germany, 21--24 May 2002.},
  bibdate = {Sat Jan 10 10:03:33 MST 2004},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Czarnul:2003:PTA,
  author =       "Pawel Czarnul",
  title =        "Programming, Tuning and Automatic Parallelization of
                 Irregular Divide-and-Conquer Applications in
                 {DAMPVM\slash DAC}",
  journal =      j-IJHPCA,
  volume =       "17",
  number =       "1",
  pages =        "77--93",
  month =        "Spring",
  year =         "2003",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Nov 28 06:52:13 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{DePasquale:2003:UJU,
  author =       "C. J. DePasquale",
  title =        "Using the {JVMPI} to Understand the Behavior of {Java}
                 Classes During the Development Process",
  journal =      "CMG",
  volume =       "2",
  number =       "??",
  publisher =    "Computer Measurement Group",
  pages =        "821--832",
  year =         "2003",
  CODEN =        "????",
  bibdate =      "Sat Apr 3 08:12:24 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib; Ingenta
                 database",
  acknowledgement = ack-nhfb,
}

@InProceedings{Fernandez:2003:BMN,
  author =       "Juan Fernandez and Eitan Frachtenberg and Fabrizio
                 Petrini",
  title =        "{BCS-MPI}: a New Approach in the System Software
                 Design for Large-Scale Parallel Computers",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#1;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap306.pdf",
  abstract =     "Buffered CoScheduled MPI (BCS-MPI) introduces a new
                 approach to design the communication layer for
                 large-scale parallel machines. The emphasis of BCS-MPI
                 is on the global coordination of a large number of
                 communicating processes rather than on the traditional
                 optimization of the point-to-point performance. BCS-MPI
                 delays the interprocessor communication in order to
                 schedule globally the communication pattern and it is
                 designed on top of a minimal set of collective
                 communication primitives. In this paper we describe a
                 prototype implementation of BCS-MPI and its
                 communication protocols. Several experimental results,
                 executed on a set of scientific applications, show that
                 BCS-MPI can compete with a production-level MPI
                 implementation, but is much simpler to implement, debug
                 and model.",
  acknowledgement = ack-nhfb,
  keywords =     "buffered coscheduling; cluster computing;
                 communication protocols; large-scale parallel
                 computers; MPI; Quadrics; STORM; system software",
}

@InProceedings{Gabriel:2003:EPM,
  author = {Edgar Gabriel and Graham Fagg and Jack Dongarra},
  title = {Evaluating the Performance of {MPI-2} Dynamic Communicators
    and One-Sided Communication},
  crossref = {Dongarra:2003:RAP},
  year = {2003},
  pages = {??--??},
  url = {http://www.netlib.org/netlib/utk/people/JackDongarra/PAPERS/europvm-mpi-2003-mpi2.pdf},
  bibdate = {Tue Jan 13 18:15:48 2004},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Gabriel:2003:FTC,
  author = {Edgar Gabriel and Graham E. Fagg and Antonin Bukovsky and
    Thara Angskun and Jack J. Dongarra},
  title = {A Fault-Tolerant Communication Library for {Grid}
    Environments},
  editor = {????},
  booktitle = {{17th Annual ACM International Conference on
    Supercomputing (ICS'03) International Workshop on Grid Computing
    and e-Science, June 21, 2003, San Francisco}},
  publisher = {????},
  address = {????},
  year = {2003},
  pages = {??--??},
  isbn = {????},
  isbn-13 = {????},
  lccn = {????},
  url = {http://www.netlib.org/netlib/utk/people/JackDongarra/PAPERS/FTMPI-SF-gabriel.pdf},
  xxcrossref = {ACM:2003:CPI},
  bibdate = {Tue Jan 13 18:14:32 2004},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gao:2003:LSP,
  author = {Shiwu Gao},
  title = {Linear-scaling parallelization of the {WIEN} package with
    {MPI}},
  journal = j-COMP-PHYS-COMM,
  fjournal = {Computer Physics Communications},
  year = {2003},
  month = jun,
  day = {15},
  volume = {153},
  number = {2},
  pages = {190--198},
  doi = {https://doi.org/10.1016/S0010-4655(03)00224-8},
  url = {http://www.sciencedirect.com/science/article/pii/S0010465503002248},
  issn = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l = {0010-4655},
  coden = {CPHCBZ},
  journal-url = {http://www.sciencedirect.com/science/journal/00104655},
  bibdate = {Mon Feb 13 23:41:30 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Jin:2003:AMP,
  author = {Haoqiang Jin and Gabriele Jost and Jerry Yan and others},
  title = {Automatic multilevel parallelization using {OpenMP}},
  journal = j-SCI-PROG,
  fjournal = {Scientific Programming},
  year = {2003},
  volume = {11},
  number = {2},
  pages = {177--190},
  issn = {1058-9244 (print), 1875-919X (electronic)},
  issn-l = {1058-9244},
  coden = {SCIPEV},
  journal-url = {http://iospress.metapress.com/content/1058-9244},
  bibdate = {Mon Jan 12 06:28:15 MST 2004},
  bibsource = {http://www.iospress.nl/site/html/10589244.html;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  acknowledgement = ack-nhfb,
}

@Article{Karonis:2003:MGG,
  author = {Nicholas T. Karonis and Brian Toonen and Ian Foster},
  title = {{MPICH-G2}: a {Grid}-enabled implementation of the
    {Message Passing Interface}},
  journal = j-J-PAR-DIST-COMP,
  fjournal = {Journal of Parallel and Distributed Computing},
  year = {2003},
  month = may,
  volume = {63},
  number = {5},
  pages = {551--563},
  issn = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l = {0743-7315},
  coden = {JPDCER},
  journal-url = {http://www.sciencedirect.com/science/journal/07437315},
  bibdate = {Tue Dec 16 16:10:41 MST 2003},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Karwande:2003:CMC,
  author = {Amit Karwande and Xin Yuan and David K. Lowenthal},
  title = {{CC--MPI}: a compiled communication capable {MPI} prototype
    for {Ethernet} switched clusters},
  journal = j-SIGPLAN,
  fjournal = {ACM SIG{\-}PLAN Notices},
  year = {2003},
  month = oct,
  volume = {38},
  number = {10},
  pages = {95--106},
  issn = {0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)},
  issn-l = {0362-1340},
  coden = {SINODQ},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  bibdate = {Mon Dec 22 16:52:42 MST 2003},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Kee:2003:POP,
  author = {Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha},
  title = {{ParADE}: An {OpenMP} Programming Environment for {SMP}
    Cluster Systems},
  crossref = {ACM:2003:SII},
  year = {2003},
  pages = {??--??},
  url = {http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#0;
    http://www.sc-conference.org/sc2003/paperpdfs/pap130.pdf},
  keywords = {hybrid programming; MPI; OpenMP; programming environment;
    SMP cluster; software distributed shared memory},
  abstract = {Demand for programming environments to exploit
    clusters of symmetric multiprocessors (SMPs) is
    increasing. In this paper, we present a new programming
    environment, called ParADE, to enable easy, portable,
    and high-performance programming on SMP clusters. It is
    an OpenMP programming environment on top of a
    multi-threaded software distributed shared memory
    (SDSM) system with a variant of home-based lazy release
    consistency protocol. To boost performance, the runtime
    system provides explicit message-passing primitives to
    make it a hybrid-programming environment. Collective
    communication primitives are used for the
    synchronization and work-sharing directives associated
    with small data structures, lessening the
    synchronization overhead and avoiding the implicit
    barriers of work-sharing directives. The OpenMP
    translator bridges the gap between the OpenMP
    abstraction and the hybrid programming interfaces of
    the runtime system. The experiments with several NAS
    benchmarks and applications on a Linux-based cluster
    show promising results that ParADE overcomes the
    performance problem of the conventional SDSM-based
    OpenMP environment.},
  bibdate = {Wed Nov 26 07:34:20 2003},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/linux.bib;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/supercomputing2003.bib},
  acknowledgement = ack-nhfb,
}

@Article{Keller:2003:TEE,
  author = {Rainer Keller and Edgar Gabriel and Bettina Krammer and
    Matthias S. M{\"u}ller and Michael M. Resch},
  title = {Towards Efficient Execution of {MPI} Applications on the
    {Grid}: Porting and Optimization Issues},
  journal = j-J-GRID-COMP,
  fjournal = {Journal of Grid Computing},
  year = {2003},
  month = {????},
  volume = {1},
  number = {2},
  pages = {133--149},
  url = {http://ipsapp008.kluweronline.com/IPS/content/ext/x/J/6160/I/4/A/4/abstract.htm},
  issn = {1570-7873 (print), 1572-9184 (electronic)},
  issn-l = {1570-7873},
  coden = {????},
  journal-url = {http://link.springer.com/journal/10723},
  bibdate = {Sat Dec 4 11:39:31 MST 2004},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.wkap.nl/jrnltoc.htm/1570-7873},
  acknowledgement = ack-nhfb,
}

@InProceedings{Komatitsch:2003:BDF,
  author = {Dimitri Komatitsch and Seiji Tsuboi and Chen Ji and
    Jeroen Tromp},
  title = {A 14.6 billion degrees of freedom, 5 teraflops, 2.5 terabyte
    earthquake simulation on the {Earth Simulator}},
  crossref = {ACM:2003:SII},
  year = {2003},
  pages = {??--??},
  url = {http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10711#1;
    http://www.sc-conference.org/sc2003/paperpdfs/pap124.pdf},
  abstract = {We use 1944 processors of the Earth Simulator to model
    seismic wave propagation resulting from large
    earthquakes. Simulations are conducted based upon the
    spectral-element method, a high-degree finite-element
    technique with an exactly diagonal mass matrix. We use
    a very large mesh with 5.5 billion grid points (14.6
    billion degrees of freedom). We include the full
    complexity of the Earth, i.e., a three-dimensional
    wave-speed and density structure, a 3-D crustal model,
    ellipticity as well as topography and bathymetry. A
    total of 2.5 terabytes of memory is needed. Our
    implementation is purely based upon MPI, with loop
    vectorization on each processor. We obtain an excellent
    vectorization ratio of 99.3\%, and we reach a
    performance of 5 teraflops (30\% of the peak
    performance) on 38\% of the machine. The very high
    resolution of the mesh allows us to perform fully
    three-dimensional calculations at seismic periods as
    low as 5 seconds.},
  bibdate = {Wed Nov 26 07:34:20 2003},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Kranzlmuller:2003:RAP,
  author =       "Dieter Kranzlm{\"u}ller and Peter Kacsuk and Jack
                 Dongarra and Jens Volkert",
  title =        "Recent Advances in {Parallel Virtual Machine} and
                 {Message Passing Interface} (Select papers from the
                 {EuroPVMMPI 2002 Conference})",
  journal =      j-IJHPCA,
  volume =       "17",
  number =       "1",
  pages =        "3--5",
  month =        "Spring",
  year =         "2003",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Nov 28 06:52:13 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@InProceedings{Li:2003:PNH,
  author = {Jianwei Li and Wei-keng Liao and Alok Choudhary and
    Robert Ross and Rajeev Thakur and William Gropp and Rob Latham and
    Andrew Siegel and Brad Gallagher and Michael Zingale},
  title = {{Parallel netCDF}: a High-Performance Scientific {I/O}
    Interface},
  crossref = {ACM:2003:SII},
  year = {2003},
  pages = {??--??},
  url = {http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#1;
    http://www.sc-conference.org/sc2003/paperpdfs/pap258.pdf},
  abstract = {Dataset storage, exchange, and access play a critical
    role in scientific applications. For such purposes
    netCDF serves as a portable, efficient file format and
    programming interface, which is popular in numerous
    scientific application domains. However, the original
    interface does not provide an efficient mechanism for
    parallel data storage and access. In this work, we
    present a new parallel interface for writing and
    reading netCDF datasets. This interface is derived with
    minimal changes from the serial netCDF interface but
    defines semantics for parallel access and is tailored
    for high performance. The underlying parallel I/O is
    achieved through MPI-IO, allowing for substantial
    performance gains through the use of collective I/O
    optimizations. We compare the implementation strategies
    and performance with HDF5. Our tests indicate
    programming convenience and significant I/O performance
    improvement with this parallel netCDF (PnetCDF)
    interface.},
  bibdate = {Wed Nov 26 07:34:20 2003},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Liu:2003:PCM,
  author = {Jiuxing Liu and Balasubramanian Chandrasekaran and
    Jiesheng Wu and Weihang Jiang and Sushmitha Kini and Weikuan Yu and
    Darius Buntinas and Pete Wyckoff and D. K. Panda},
  title = {Performance Comparison of {MPI} Implementations over
    {InfiniBand}, {Myrinet} and {Quadrics}},
  crossref = {ACM:2003:SII},
  year = {2003},
  pages = {??--??},
  url = {http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#0;
    http://www.sc-conference.org/sc2003/paperpdfs/pap310.pdf},
  abstract = {In this paper, we present a comprehensive performance
    comparison of MPI implementations over InfiniBand,
    Myrinet and Quadrics. Our performance evaluation
    consists of two major parts. The first part consists of
    a set of MPI level micro-benchmarks that characterize
    different aspects of MPI implementations. The second
    part of the performance evaluation consists of
    application level benchmarks. We have used the NAS
    Parallel Benchmarks and the sweep3D benchmark. We not
    only present the overall performance results, but also
    relate application communication characteristics to the
    information we acquired from the micro-benchmarks. Our
    results show that the three MPI implementations all
    have their advantages and disadvantages. For our 8-node
    cluster, InfiniBand can offer significant performance
    improvements for a number of applications compared with
    Myrinet and Quadrics when using the PCI-X bus. Even
    with just the PCI bus, InfiniBand can still perform
    better if the applications are bandwidth-bound.},
  bibdate = {Wed Nov 26 07:34:20 2003},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Luecke:2003:CPM,
  author = {Glenn R. Luecke and Marina Kraeva and Lili Ju},
  title = {Comparing the performance of {MPICH} with {Cray}'s {MPI} and
    with {SGI}'s {MPI}},
  journal = j-CCPE,
  fjournal = {Concurrency and Computation: Prac\-tice and Experience},
  year = {2003},
  month = aug,
  day = {10},
  onlinedate = {14 Jul 2003},
  volume = {15},
  number = {9},
  pages = {779--802},
  doi = {https://doi.org/10.1002/cpe.719},
  issn = {1532-0626 (print), 1532-0634 (electronic)},
  issn-l = {1532-0626},
  coden = {CCPEBO},
  journal-url = {http://www.interscience.wiley.com/jpages/1532-0626},
  bibdate = {Tue Jan 13 09:28:12 MST 2004},
  bibsource = {http://www.interscience.wiley.com/jpages/1532-0626;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  acknowledgement = ack-nhfb,
}

@Article{Luecke:2003:MCT,
  author = {Glenn Luecke and Hua Chen and James Coyle and Jim Hoekstra and
    Marina Kraeva and Yan Zou},
  title = {{MPI-CHECK}: a tool for checking {Fortran 90 MPI} programs},
  journal = j-CCPE,
  fjournal = {Concurrency and Computation: Prac\-tice and Experience},
  year = {2003},
  month = feb,
  onlinedate = {6 Jan 2003},
  volume = {15},
  number = {2},
  pages = {93--100},
  doi = {https://doi.org/10.1002/cpe.705},
  issn = {1532-0626 (print), 1532-0634 (electronic)},
  issn-l = {1532-0626},
  coden = {CCPEBO},
  journal-url = {http://www.interscience.wiley.com/jpages/1532-0626},
  bibdate = {Tue Jan 13 09:28:06 MST 2004},
  bibsource = {http://www.interscience.wiley.com/jpages/1532-0626;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www3.interscience.wiley.com/journalfinder.html},
  acknowledgement = ack-nhfb,
}

@Article{Marowka:2003:EOT,
  author =       "Ami Marowka",
  title =        "Extending {OpenMP} for Task Parallelism",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "13",
  number =       "3",
  pages =        "341--??",
  month =        sep,
  year =         "2003",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Sat Nov 6 18:06:31 MST 2004",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Mattson:2003:HGO,
  author = {Timothy G. Mattson},
  title = {How good is {OpenMP}?},
  journal = j-SCI-PROG,
  fjournal = {Scientific Programming},
  year = {2003},
  volume = {11},
  number = {2},
  pages = {81--93},
  issn = {1058-9244 (print), 1875-919X (electronic)},
  issn-l = {1058-9244},
  coden = {SCIPEV},
  journal-url = {http://iospress.metapress.com/content/1058-9244},
  bibdate = {Mon Jan 12 06:28:15 MST 2004},
  bibsource = {http://www.iospress.nl/site/html/10589244.html;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  acknowledgement = ack-nhfb,
}

@Article{Michailidis:2003:PEL,
  author = {Panagiotis D. Michailidis and Konstantinos G. Margaritis},
  title = {Performance evaluation of load balancing strategies for
    approximate string matching application on an {MPI} cluster of
    heterogeneous workstations},
  journal = j-FUT-GEN-COMP-SYS,
  fjournal = {Future Generation Computer Systems},
  year = {2003},
  month = oct,
  volume = {19},
  number = {7},
  pages = {1075--1104},
  issn = {0167-739X (print), 1872-7115 (electronic)},
  issn-l = {0167-739X},
  coden = {FGSEVI},
  journal-url = {http://www.sciencedirect.com/science/journal/0167739X},
  remark = {Selected papers on Theoretical and Computational Aspects of
    Structural Dynamical Systems in Linear Algebra and Control.},
  bibdate = {Sat Jan 10 10:03:37 MST 2004},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Min:2003:OOP,
  author =       "Seung-Jai Min and Ayon Basumallik and Rudolf
                 Eigenmann",
  title =        "Optimizing {OpenMP} Programs on Software Distributed
                 Shared Memory Systems",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "31",
  number =       "3",
  pages =        "225--249",
  month =        jun,
  year =         "2003",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Jan 24 14:51:21 MST 2004",
  bibsource =    "http://www.kluweronline.com/issn/0885-7458;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://ipsapp007.kluweronline.com/content/getfile/4773/33/5/abstract.htm;
                 http://ipsapp007.kluweronline.com/content/getfile/4773/33/5/fulltext.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@InProceedings{Moody:2003:SNB,
  author =       "Adam Moody and Juan Fernandez and Fabrizio Petrini and
                 Dhabaleswar K. Panda",
  title =        "Scalable {NIC}-based Reduction on Large-Scale
                 Clusters",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#2;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap316.pdf",
  abstract =     "Many parallel algorithms require efficient reduction
                 collectives. In response, researchers have designed
                 algorithms considering a range of parameters including
                 data size, system size, and communication
                 characteristics. Throughout this past work, however,
                 processing was limited to the host CPU. Today, modern
                 Network Interface Cards (NICs) sport programmable
                 processors with substantial memory, and thus introduce
                 a fresh variable into the equation. In this paper, we
                 investigate this new option in the context of
                 large-scale clusters. Through experiments on the
                 960-node, 1920-processor ASCI Linux Cluster (ALC) at
                 Lawrence Livermore National Laboratory, we show that
                 NIC-based reductions outperform host-based algorithms
                 in terms of reduced latency and increased consistency.
                 In particular, in the largest configuration tested ---
                 1812 processors --- our NIC-based algorithm summed
                 single-element vectors of 32-bit integers and 64-bit
                 floating-point numbers in 73 $ \mu $s and 118 $ \mu $s,
                 respectively. These results represent respective
                 improvements of 121\% and 39\% over the
                 production-level MPI library.",
  acknowledgement = ack-nhfb,
}

@Article{Muller:2003:OCB,
  author =       "Matthias S. M{\"u}ller",
  title =        "An {OpenMP} compiler benchmark",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "125--131",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@InProceedings{Nakajima:2003:PIS,
  author =       "Kengo Nakajima",
  title =        "Parallel Iterative Solvers of {GeoFEM} with Selective
                 Blocking Preconditioning for Nonlinear Contact Problems
                 on the {Earth Simulator}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2003.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#1;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap155.pdf",
  abstract =     "An efficient parallel iterative method with selective
                 blocking preconditioning has been developed for
                 symmetric multiprocessor (SMP) cluster architectures
                 with vector processors such as the Earth Simulator.
                 This method is based on a three-level hybrid parallel
                 programming model, which includes message passing for
                 inter-SMP node communication, loop directives by OpenMP
                 for intra-SMP node parallelization and vectorization
                 for each processing element (PE). This method provides
                 robust and smooth convergence and excellent vector and
                 parallel performance in 3D geophysical simulations with
                 contact conditions performed on the Earth Simulator.
                 The selective blocking preconditioning is much more
                 efficient than ILU(1) and ILU(2). Performance for the
                 complicated Southwest Japan model with more than 23 M
                 DOF on 10 SMP nodes (80 PEs) of the Earth Simulator was
                 161.7 GFLOPS, corresponding to 25.3\% of the peak
                 performance for hybrid programming model, and 190.4
                 GFLOPS (29.8\% of the peak performance) for flat MPI,
                 respectively.",
  acknowledgement = ack-nhfb,
}

@Article{Nakano:2003:SCG,
  author =       "Hirofumi Nakano and Kazuhisa Ishizaka and Motoki Obata
                 and Keiji Kimura and Hironori Kasahara",
  title =        "Static Coarse Grain Task Scheduling with Cache
                 Optimization Using {OpenMP}",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "31",
  number =       "3",
  pages =        "211--223",
  month =        jun,
  year =         "2003",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Jan 24 14:51:21 MST 2004",
  bibsource =    "http://www.kluweronline.com/issn/0885-7458;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "/ips/frames/Refs/referenceskapmain.asp?J=4773&I=33&A=4&LK=NM;
                 http://ipsapp007.kluweronline.com/content/getfile/4773/33/4/abstract.htm;
                 http://ipsapp007.kluweronline.com/content/getfile/4773/33/4/fulltext.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Book{Quinn:2003:PPC,
  author =       "Michael J. (Michael Jay) Quinn",
  title =        "Parallel programming in {C} with {MPI} and {OpenMP}",
  publisher =    pub-MCGRAW-HILL,
  address =      pub-MCGRAW-HILL:adr,
  pages =        "xiv + 529",
  year =         "2003",
  ISBN =         "0-07-123265-6, 0-07-282256-2",
  ISBN-13 =      "978-0-07-123265-4, 978-0-07-282256-4",
  LCCN =         "QA76.73.C15 Q55 2003; QA76.73 .C15 Q55 2003",
  bibdate =      "Thu Jun 2 07:26:02 MDT 2005",
  bibsource =    "clavis.ucalgary.ca:2200/UNICORN;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "The era of practical parallel programming has arrived,
                 marked by the popularity of the MPI and OpenMP software
                 standards and the emergence of commodity clusters as
                 the hardware platform of choice for an increasing
                 number of organizations. This exciting new book,
                 ``Parallel Programming in C with MPI and OpenMP''
                 addresses the needs of students and professionals who
                 want to learn how to design, analyze, implement, and
                 benchmark parallel programs in C using MPI and/or
                 OpenMP. It introduces a rock-solid design methodology
                 with coverage of the most important MPI functions and
                 OpenMP directives. It also demonstrates, through a wide
                 range of examples, how to develop parallel programs
                 that will execute efficiently on today's parallel
                 platforms.",
  acknowledgement = ack-nhfb,
  subject =      "C (Computer program language); Parallel programming
                 (Computer science)",
  tableofcontents = "Motivation and history \\
                 Parallel architectures \\
                 Parallel algorithm design \\
                 Message-passing programming \\
                 The sieve of Eratosthenes \\
                 Floyd's algorithm \\
                 Performance analysis \\
                 Matrix--vector multiplication \\
                 Document classification \\
                 Monte Carlo methods \\
                 Matrix multiplication \\
                 Solving linear systems \\
                 Finite difference methods \\
                 Sorting \\
                 The Fast Fourier Transform \\
                 Combinatorial search \\
                 Shared-memory programming \\
                 Combining MPI and OpenMP",
}

@Article{Reussner:2003:USD,
  author =       "Ralf H. Reussner",
  title =        "Using {SKaMPI} for developing high-performance {MPI}
                 programs with performance portability",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "19",
  number =       "5",
  pages =        "749--759",
  month =        jul,
  year =         "2003",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Sat Jan 10 10:03:34 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
  remark =       "Tools for Program Development and Analysis. Best
                 papers from two Technical Sessions, at ICCS2001, San
                 Francisco, CA, USA, and ICCS2002, Amsterdam, The
                 Netherlands.",
}

@Article{Saito:2003:LSP,
  author =       "Hideki Saito and Greg Gaertner and Wesley Jones and
                 Rudolf Eigenmann and Hidetoshi Iwashita and Ron
                 Lieberman and Matthijs van Waveren and Brian Whitney",
  title =        "Large System Performance of {SPEC OMP} Benchmark
                 Suites",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "31",
  number =       "3",
  pages =        "197--209",
  month =        jun,
  year =         "2003",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Jan 24 14:51:21 MST 2004",
  bibsource =    "http://www.kluweronline.com/issn/0885-7458;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "/ips/frames/Refs/referenceskapmain.asp?J=4773&I=33&A=3&LK=NM;
                 http://ipsapp007.kluweronline.com/content/getfile/4773/33/3/abstract.htm;
                 http://ipsapp007.kluweronline.com/content/getfile/4773/33/3/fulltext.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Shires:2003:OPF,
  author =       "Dale Shires and Ram Mohan",
  title =        "Optimization and Performance of a {Fortran 90}
                 {MPI}-Based Unstructured Code on Large-Scale Parallel
                 Systems",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "25",
  number =       "2",
  pages =        "131--141",
  month =        jun,
  year =         "2003",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Tue Dec 16 08:27:09 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.wkap.nl/journalhome.htm/0920-8542",
  URL =          "http://ipsapp009.kluweronline.com/content/getfile/5189/44/4/abstract.htm;
                 http://ipsapp009.kluweronline.com/content/getfile/5189/44/4/fulltext.pdf",
  abstract =     "The message-passing interface (MPI) has become the
                 standard in achieving effective results when using the
                 message passing paradigm of parallelization. Codes
                 written using MPI are extremely portable and are
                 applicable to both clusters and massively parallel
                 computing platforms. Since MPI uses the single program,
                 multiple data (SPMD) approach to parallelism, good
                 performance requires careful tuning of the serial code
                 as well as careful data and control flow analysis to
                 limit communication. We discuss optimization strategies
                 used and their degree of success to increase
                 performance of an MPI-based unstructured finite element
                 simulation code written in Fortran 90. We discuss
                 performance results based on implementations using
                 several modern massively parallel computing platforms
                 including the SGI Origin 3800, IBM Nighthawk 2 SMP, and
                 Cray T3E-1200.",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Squyres:2003:CAL,
  author =       "Jeffrey M. Squyres",
  title =        "A component architecture for {LAM\slash MPI} (citation
                 only)",
  journal =      j-SIGPLAN,
  pages =        "??--??",
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 22 16:52:42 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Takahashi:2003:PEH,
  author =       "Daisuke Takahashi and Mitsuhisa Sato and Taisuke
                 Boku",
  title =        "Performance Evaluation of the {Hitachi SR8000} Using
                 {SPEC OMP2001} Benchmarks",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "31",
  number =       "3",
  pages =        "185--196",
  month =        jun,
  year =         "2003",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Jan 24 14:51:21 MST 2004",
  bibsource =    "http://www.kluweronline.com/issn/0885-7458;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "/ips/frames/Refs/referenceskapmain.asp?J=4773&I=33&A=2&LK=NM;
                 http://ipsapp007.kluweronline.com/content/getfile/4773/33/2/abstract.htm;
                 http://ipsapp007.kluweronline.com/content/getfile/4773/33/2/fulltext.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@InProceedings{Weatherly:2003:DMS,
  author =       "D. Brent Weatherly and David K. Lowenthal and Mario
                 Nakazawa and Franklin Lowenthal",
  title =        "{Dyn-MPI}: Supporting {MPI} on Non Dedicated
                 Clusters",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#1;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap126.pdf",
  abstract =     "Distributing data is a fundamental problem in
                 implementing efficient distributed-memory parallel
                 programs. The problem becomes more difficult in
                 environments where the participating nodes are not
                 dedicated to a parallel application. We are
                 investigating the data distribution problem in non
                 dedicated environments in the context of explicit
                 message-passing programs.\par

                 To address this problem, we have designed and
                 implemented an extension to MPI called Dynamic MPI
                 (Dyn-MPI). The key component of Dyn-MPI is its run-time
                 system, which efficiently and automatically
                 redistributes data on the fly when there are changes in
                 the application or the underlying environment. Dyn-MPI
                 supports efficient memory allocation, precise
                 measurement of system load and computation time, and
                 node removal. Performance results show that programs
                 that use Dyn-MPI execute efficiently in non dedicated
                 environments, including up to almost a three-fold
                 improvement compared to programs that do not
                 redistribute data and a 25\% improvement over standard
                 adaptive load balancing techniques.",
  acknowledgement = ack-nhfb,
}

@InProceedings{Worringen:2003:FPN,
  author =       "Joachim Worringen and Jesper Larsson Tr{\"a}ff and Hubert
                 Ritzdorf",
  title =        "Fast Parallel Non-Contiguous File Access",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#0;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap319.pdf",
  abstract =     "Many applications of parallel I/O perform
                 non-contiguous file accesses: instead of accessing a
                 single (large) block of data in a file, a number of
                 (smaller) blocks of data scattered throughout the file
                 needs to be accessed in each logical I/O operation.
                 However, only few file system interfaces directly
                 support this kind of non-contiguous file access. In
                 contrast, the most commonly used parallel programming
                 interface, MPI, incorporates a flexible model of
                 parallel I/O through its MPI-IO interface. With MPI-IO,
                 arbitrary non-contiguous file accesses are supported in
                 a uniform fashion by the use of derived MPI datatypes
                 set up by the user to reflect the desired I/O
                 pattern.\par

                 Despite a considerable amount of recent work in this
                 area, current MPI-IO implementations suffer from low
                 performance of such non-contiguous accesses when
                 compared to the performance of the storage system for
                 contiguous accesses. In this paper we analyze an
                 important bottleneck in the efficient handling of
                 non-contiguous access patterns in current
                 implementations of MPI-IO. We present a new technique,
                 termed listless I/O, that can be incorporated into
                 MPI-IO implementations like the well-known ROMIO
                 implementation, and completely eliminates this
                 bottleneck. We have implemented the technique in
                 MPI/SX, the MPI implementation for the NEC SX-series of
                 parallel vector computers. Results with a synthetic
                 benchmark and an application kernel show that listless
                 I/O is able to increase the bandwidth for
                 non-contiguous file access by sometimes more than a
                 factor of 500 when compared to the traditional
                 approach.",
  acknowledgement = ack-nhfb,
}

@InProceedings{Ying:2003:NPK,
  author =       "Lexing Ying and George Biros and Denis Zorin and
                 Harper Langston",
  title =        "A new parallel kernel-independent fast multipole
                 method",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10707#2;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap166.pdf",
  abstract =     "We present a new adaptive fast multipole algorithm and
                 its parallel implementation. The algorithm is
                 kernel-independent in the sense that the evaluation of
                 pairwise interactions does not rely on any analytic
                 expansions, but only utilizes kernel evaluations. The
                 new method provides the enabling technology for many
                 important problems in computational science and
                 engineering. Examples include viscous flows, fracture
                 mechanics and screened Coulombic interactions. Our
                 MPI-based parallel implementation logically separates
                 the computation and communication phases to avoid
                 synchronization in the upward and downward computation
                 passes, and thus allows us to fully exploit computation
                 and communication overlapping. We measure isogranular
                 and fixed-size scalability for a variety of kernels on
                 the Pittsburgh Supercomputing Center's TCS-1
                 AlphaServer on up to 3000 processors. We have solved
                 viscous flow problems with up to 2.1 billion unknowns
                 and we have achieved 1.6 Tflops/s peak performance and
                 1.13 Tflops/s sustained performance.",
  acknowledgement = ack-nhfb,
  keywords =     "adaptive algorithms; boundary integral equations; Fast
                 multipole methods; massively parallel computing; N-body
                 problems; viscous flows",
}

@Book{Bisseling:2004:PSC,
  author =       "Rob H. Bisseling",
  title =        "Parallel scientific computation: a structured approach
                 using {BSP} and {MPI}",
  publisher =    pub-OXFORD,
  address =      pub-OXFORD:adr,
  pages =        "xviii + 305",
  year =         "2004",
  ISBN =         "0-19-852939-2",
  ISBN-13 =      "978-0-19-852939-2",
  LCCN =         "QA76.58 .B57 2004",
  bibdate =      "Tue Mar 13 14:00:12 MDT 2007",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  URL =          "http://www.loc.gov/catdir/enhancements/fy0617/2004046141-d.html;
                 http://www.loc.gov/catdir/enhancements/fy0617/2004046141-t.html",
  acknowledgement = ack-nhfb,
  subject =      "Bulk Synchronous Parallel (BSP) model; Message Passing
                 Interface (MPI); Parallel processing (Electronic
                 computers); Scientific applications; Supercomputers;
                 Parallel computers",
}

@Article{Boeres:2004:ETF,
  author =       "Cristina Boeres and Vinod E. F. Rebello",
  title =        "{EasyGrid}: towards a framework for the automatic
                 {Grid} enabling of legacy {MPI} applications",
  journal =      j-CCPE,
  volume =       "16",
  number =       "5",
  pages =        "425--432",
  day =          "25",
  month =        apr,
  year =         "2004",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.821",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat May 14 11:30:53 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "26 Mar 2004",
}

@Article{Corbalan:2004:PMD,
  author =       "Julita Corbalan and Xavier Martorell and Jesus
                 Labarta",
  title =        "Page Migration with Dynamic Space-Sharing Scheduling
                 Policies: The Case of the {SGI O2000}",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "32",
  number =       "4",
  pages =        "263--288",
  month =        aug,
  year =         "2004",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1023/B:IJPP.0000035815.13969.ec",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:05:14 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=32&issue=4;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=32&issue=4&spage=263",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "CC-NUMA; dynamic processor allocation policy; memory
                 page migration; multiprogrammed workload; OpenMP",
}

@Article{Cotronis:2004:CMP,
  author =       "Yiannis Cotronis",
  title =        "Composition of {Message Passing Interface}
                 Applications over {MPICH-G2}",
  journal =      j-IJHPCA,
  volume =       "18",
  number =       "3",
  pages =        "327--339",
  month =        "Fall",
  year =         "2004",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342004046047",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/18/3.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/18/3/327.full.pdf+html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Fagg:2004:BUF,
  author =       "Graham E. Fagg and Jack J. Dongarra",
  title =        "Building and Using a Fault-Tolerant {MPI}
                 Implementation",
  journal =      j-IJHPCA,
  volume =       "18",
  number =       "3",
  pages =        "353--361",
  month =        "Fall",
  year =         "2004",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342004046052",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/18/3.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/18/3/353.full.pdf+html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Book{Fernando:2004:GGP,
  editor =       "Randima Fernando",
  title =        "{GPU} gems: programming techniques, tips, and tricks
                 for real-time graphics",
  volume =       "1",
  publisher =    pub-AW,
  address =      pub-AW:adr,
  pages =        "xvv + 765",
  year =         "2004",
  ISBN =         "0-321-22832-4",
  ISBN-13 =      "978-0-321-22832-1",
  LCCN =         "T385 .G6879 2004",
  bibdate =      "Thu Jul 29 13:36:54 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/unix.bib;
                 z3950.loc.gov:7090/Voyager",
  price =        "US\$45.99",
  series =       "GPU gems",
  acknowledgement = ack-nhfb,
  keywords =     "CUDA; nVIDIA",
  subject =      "Computer graphics; Real-time programming",
}

@Article{Gropp:2004:FTM,
  author =       "William Gropp and Ewing Lusk",
  title =        "Fault Tolerance in {Message Passing Interface}
                 Programs",
  journal =      j-IJHPCA,
  volume =       "18",
  number =       "3",
  pages =        "363--372",
  month =        "Fall",
  year =         "2004",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342004046045",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/18/3.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/18/3/363.full.pdf+html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Iwasaki:2004:NPS,
  author =       "Hideya Iwasaki and Zhenjiang Hu",
  title =        "A New Parallel Skeleton for General Accumulative
                 Computations",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "32",
  number =       "5",
  pages =        "389--414",
  month =        oct,
  year =         "2004",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1023/B:IJPP.0000038069.80050.74",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:05:18 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=32&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=32&issue=5&spage=389",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Bird data parallel skeleton; Meertens formalism; MPI;
                 program transformation; Skeletal parallel programming",
}

@InProceedings{Ke:2004:RCM,
  author =       "Jian Ke and Martin Burtscher and Evan Speight",
  title =        "Runtime Compression of {MPI} Messages to Improve the
                 Performance and Scalability of Parallel Applications",
  crossref =     "ACM:2004:SHP",
  pages =        "59--59",
  year =         "2004",
  bibdate =      "Tue Dec 27 07:57:20 MST 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Kepner:2004:M,
  author =       "Jeremy Kepner and Stan Ahalt",
  title =        "{MatlabMPI}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "64",
  number =       "8",
  pages =        "997--1005",
  month =        aug,
  year =         "2004",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1016/j.jpdc.2004.03.018",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Dec 4 15:15:10 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  abstract =     "In many projects the true costs of high performance
                 computing are currently dominated by software.
                 Addressing these costs may require shifting to higher
                 level languages such as Matlab. MatlabMPI is a Matlab
                 implementation of the Message Passing Interface (MPI)
                 standard and allows any Matlab program to exploit
                 multiple processors. MatlabMPI currently implements the
                 basic six functions that are the core of the MPI
                 point-to-point communications standard. The key
                 technical innovation of MatlabMPI is that it implements
                 the widely used MPI ``look and feel'' on top of
                 standard Matlab file I/O, resulting in an extremely
                 compact ($ \approx 350 $ lines of code) and ``pure''
                 implementation which runs anywhere Matlab runs, and on
                 any heterogeneous combination of computers. The
                 performance has been tested on both shared and
                 distributed memory parallel computers (e.g. Sun, SGI,
                 HP, IBM, Linux, MacOSX and Windows). MatlabMPI can
                 match the bandwidth of C based MPI at large message
                 sizes. A test image filtering application using
                 MatlabMPI achieved a speedup of $ \approx 300 $ using
                 304 CPUs and $ \approx 15 \% $ of the theoretical peak
                 (450 Gigaflops) on an IBM SP2 at the Maui High
                 Performance Computing Center. In addition, this entire
                 parallel benchmark application was implemented in 70
                 software-lines-of-code, illustrating the high
                 productivity of this approach. MatlabMPI is available
                 for download on the web (www.ll.mit.edu/MatlabMPI).",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Book{Ladd:2004:GPP,
  author =       "Scott Ladd",
  title =        "Guide to Parallel Programming",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "465 (est.)",
  year =         "2004",
  ISBN =         "0-387-40577-1",
  ISBN-13 =      "978-0-387-40577-3",
  LCCN =         "????",
  bibdate =      "Wed Aug 27 06:31:34 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Includes CD-ROM.",
  acknowledgement = ack-nhfb,
  tableofcontents = "Introduction; Supercomputing; Tools for Parallel
                 Programming; Introducing OpenMP; Parallel Loops with
                 OpenMP; Advanced OpenMP; Message passing with MPI;
                 Deeper MPI; Design of data and algorithms;
                 Optimization; Debugging the hydra; Parallel in
                 parallel--MPI and OpenMP together; Elaborations;
                 Resources; Index",
}

@InProceedings{Liu:2004:BMI,
  author =       "Jiuxing Liu and Abhinav Vishnu and Dhabaleswar K.
                 Panda",
  title =        "Building Multirail {InfiniBand} Clusters: {MPI}-Level
                 Design and Performance Evaluation",
  crossref =     "ACM:2004:SHP",
  pages =        "33--33",
  year =         "2004",
  bibdate =      "Tue Dec 27 07:57:20 MST 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Liu:2004:HPR,
  author =       "Jiuxing Liu and Jiesheng Wu and Dhabaleswar K. Panda",
  title =        "High Performance {RDMA}-Based {MPI} Implementation
                 over {InfiniBand}",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "32",
  number =       "3",
  pages =        "167--198",
  month =        jun,
  year =         "2004",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1023/B:IJPP.0000029272.69895.c1",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 6 16:40:03 MDT 2005",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=32&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=32&issue=3&spage=167",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@InProceedings{Lu:2004:AFS,
  author =       "Charng-da Lu and Daniel A. Reed",
  title =        "Assessing Fault Sensitivity in {MPI} Applications",
  crossref =     "ACM:2004:SHP",
  pages =        "37--37",
  year =         "2004",
  bibdate =      "Tue Dec 27 07:57:20 MST 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Luecke:2004:PSM,
  author =       "Glenn R. Luecke and Marina Kraeva and Jing Yuan and
                 Silvia Spanoyannis",
  title =        "Performance and scalability of {MPI} on {PC}
                 clusters",
  journal =      j-CCPE,
  volume =       "16",
  number =       "1",
  pages =        "79--107",
  month =        jan,
  year =         "2004",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.749",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Jan 13 09:28:19 MST 2004",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "4 Dec 2003",
}

@Article{Luecke:2004:PSS,
  author =       "Glenn R. Luecke and Silvia Spanoyannis and Marina
                 Kraeva",
  title =        "The performance and scalability of {SHMEM} and {MPI-2}
                 one-sided routines on a {SGI Origin 2000} and a {Cray
                 T3E-600}",
  journal =      j-CCPE,
  volume =       "16",
  number =       "10",
  pages =        "1037--1060",
  day =          "25",
  month =        aug,
  year =         "2004",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.796",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat May 14 11:30:55 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "10 Jun 2004",
}

@Article{Marowka:2004:OOA,
  author =       "Ami Marowka and Zhenying Liu and Barbara Chapman",
  title =        "{OpenMP-oriented} applications for distributed shared
                 memory architectures",
  journal =      j-CCPE,
  volume =       "16",
  number =       "4",
  pages =        "371--384",
  day =          "10",
  month =        apr,
  year =         "2004",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.752",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat May 14 11:30:53 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "19 Jan 2004",
}

@Article{Martin:2004:HPA,
  author =       "Mar{\'\i}a J. Mart{\'\i}n and Marta Parada and
                 Ram{\'o}n Doallo",
  title =        "High Performance Air Pollution Simulation Using
                 {OpenMP}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "28",
  number =       "3",
  pages =        "311--321",
  month =        jun,
  year =         "2004",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Dec 4 12:39:13 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.wkap.nl/journalhome.htm/0920-8542",
  URL =          "http://ipsapp008.kluweronline.com/IPS/content/ext/x/J/5189/I/54/A/5/abstract.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Book{Mertens:2004:CCP,
  author =       "Stephan Mertens and Alexander Schinner",
  title =        "{Cluster Computing: Praktische Einf{\"u}hrung in das
                 wissenschaftliche Rechnen auf Workstation-Clustern}",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "300 (est.)",
  year =         "2004",
  ISBN =         "3-540-42299-4",
  ISBN-13 =      "978-3-540-42299-0",
  LCCN =         "????",
  bibdate =      "Wed Aug 27 06:33:33 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Includes CD-ROM.",
  acknowledgement = ack-nhfb,
}

@InProceedings{Mohror:2004:PTS,
  author =       "Kathryn Mohror and Karen L. Karavanic",
  title =        "Performance Tool Support for {MPI-2} on {Linux}",
  crossref =     "ACM:2004:SHP",
  pages =        "28--28",
  year =         "2004",
  bibdate =      "Tue Dec 27 07:57:20 MST 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Park:2004:DID,
  author =       "K.-L. Park and H.-J. Lee and O.-Y. Kwon and S.-Y. Park
                 and H.-W. Park and S.-D. Kim",
  title =        "Design and Implementation of a Dynamic Communication
                 {MPI} Library for the Grid",
  journal =      j-INT-J-COMPUT-APPL,
  volume =       "26",
  number =       "3",
  pages =        "1--8",
  year =         "2004",
  DOI =          "https://doi.org/10.1080/1206212X.2004.11441738",
  ISSN =         "1206-212X (print), 1925-7074 (electronic)",
  ISSN-L =       "1206-212X",
  bibdate =      "Sat Apr 21 17:21:44 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ijca.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.tandfonline.com/doi/full/10.1080/1206212X.2004.11441738",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Computer Applications",
  journal-URL =  "https://www.tandfonline.com/loi/tjca20",
  online-date =  "11 Jul 2015",
}

@InProceedings{Schulz:2004:IES,
  author =       "Martin Schulz and Greg Bronevetsky and Rohit Fernandes
                 and Daniel Marques and Keshav Pingali and Paul
                 Stodghill",
  title =        "Implementation and Evaluation of a Scalable
                 Application-Level Checkpoint-Recovery Scheme for {MPI}
                 Programs",
  crossref =     "ACM:2004:SHP",
  pages =        "38--38",
  year =         "2004",
  bibdate =      "Tue Dec 27 07:57:20 MST 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Sievert:2004:SMP,
  author =       "Otto Sievert and Henri Casanova",
  title =        "A Simple {MPI} Process Swapping Architecture for
                 Iterative Applications",
  journal =      j-IJHPCA,
  volume =       "18",
  number =       "3",
  pages =        "341--352",
  month =        "Fall",
  year =         "2004",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342004047430",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/18/3.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/18/3/341.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Skjellum:2004:RTM,
  author =       "Anthony Skjellum and Arkady Kanevsky and Yoginder S.
                 Dandass and Jerrell Watts and Steve Paavola and Dennis
                 Cottel and Greg Henley and L. Shane Hebert and Zhenqian
                 Cui and Anna Rounbehler and {The Real-Time Message
                 Passing Interface (MPI\slash RT) Forum}",
  title =        "The {Real-Time Message Passing Interface Standard
                 (MPI\slash RT-1.1)}",
  journal =      j-CCPE,
  volume =       "16",
  number =       "S1",
  pages =        "Si--S322",
  day =          "25",
  month =        dec,
  year =         "2004",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.744",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat May 14 11:30:56 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "22 Nov 2004",
}

@Article{Smith:2004:SIP,
  author =       "Kevin B. Smith and Aart J. C. Bik and Xinmin Tian",
  title =        "Support for the {Intel{\reg} Pentium{\reg} 4}
                 Processor with Hyper-Threading Technology in
                 {Intel{\reg}} 8.0 Compilers",
  journal =      j-INTEL-TECH-J,
  volume =       "8",
  number =       "1",
  pages =        "19--31",
  month =        feb,
  year =         "2004",
  ISSN =         "1535-766X",
  bibdate =      "Mon Jul 11 08:46:53 2005",
  bibsource =    "http://developer.intel.com/technology/itj/archive/2004.htm;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://developer.intel.com/technology/itj/2004/volume08issue01/art02_compilers/p01_abstract.htm",
  acknowledgement = ack-nhfb,
  keywords =     "Compilers; Hyper-Threading Technology; Intel Pentium 4
                 processor; OpenMP; Optimization; Vectorization",
}

@Article{Vrenios:2004:PPC,
  author =       "A. Vrenios",
  title =        "{Parallel Programming in C with MPI and OpenMP} [Book
                 Review]",
  journal =      j-IEEE-DISTRIB-SYST-ONLINE,
  volume =       "5",
  number =       "1",
  pages =        "7.1--7.3",
  month =        "????",
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1541-4922 (print), 1558-1683 (electronic)",
  ISSN-L =       "1541-4922",
  bibdate =      "Fri Jul 15 17:50:13 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://ieeexplore.ieee.org/iel5/8968/28452/01270716.pdf?isnumber=28452&prod=JNL&arnumber=1270716&arSt=+7.1&ared=+7.3&arAuthor=Vrenios%2C+A.;
                 http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=28452&arnumber=1270716&count=8&index=5",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Distributed Systems Online",
}

@Book{White:2004:CMM,
  author =       "R. E. (Robert E.) White",
  title =        "Computational Mathematics: Models, Methods, and
                 Analysis with {MATLAB} and {MPI}",
  publisher =    pub-CHAPMAN-HALL-CRC,
  address =      pub-CHAPMAN-HALL-CRC:adr,
  pages =        "xvi + 385",
  year =         "2004",
  ISBN =         "1-58488-364-2",
  ISBN-13 =      "978-1-58488-364-7",
  LCCN =         "QA297 .W495 2004",
  bibdate =      "Tue Apr 26 09:31:54 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  acknowledgement = ack-nhfb,
  subject =      "Numerical analysis; MATLAB; Computer interfaces;
                 Parallel programming (Computer science)",
}

@Article{Zeyao:2004:AMI,
  author =       "Mo Zeyao and Huang Zhengfeng",
  title =        "Application of {MPI-IO} in Parallel Particle Transport
                 {Monte--Carlo} Simulation",
  journal =      j-PARALLEL-ALGORITHMS-APPL,
  volume =       "19",
  number =       "4",
  pages =        "227--236",
  month =        "????",
  year =         "2004",
  CODEN =        "PAAPEC",
  DOI =          "https://doi.org/10.1080/10637190412331295166",
  ISSN =         "1063-7192",
  ISSN-L =       "1026-7689",
  bibdate =      "Thu Jul 10 21:46:37 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.informaworld.com/smpp/content~content=a714592658",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Algorithms and Applications",
  journal-URL =  "http://www.tandfonline.com/loi/gpaa20",
}

@Article{Zhang:2004:PMV,
  author =       "Xin Zhang and Lingli Ding and Elke A. Rundensteiner",
  title =        "Parallel multisource view maintenance",
  journal =      j-VLDB-J,
  volume =       "13",
  number =       "1",
  pages =        "22--48",
  month =        jan,
  year =         "2004",
  CODEN =        "VLDBFR",
  DOI =          "https://doi.org/10.1007/s00778-003-0086-0",
  ISSN =         "1066-8888 (print), 0949-877X (electronic)",
  ISSN-L =       "1066-8888",
  bibdate =      "Mon Jun 23 10:51:09 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "In a distributed environment, materialized views are
                 used to integrate data from different information
                 sources and then store them in some centralized
                 location. In order to maintain such materialized views,
                 maintenance queries need to be sent to information
                 sources by the data warehouse management system. Due to
                 the independence of the information sources and the
                 data warehouse, concurrency issues are raised between
                 the maintenance queries and the local update
                 transactions at each information source. Recent
                 solutions such as ECA and Strobe tackle such concurrent
                 maintenance, however with the requirement of quiescence
                 of the information sources. SWEEP and POSSE overcome
                 this limitation by decomposing the global maintenance
                 query into smaller subqueries to be sent to every
                 information source and then performing conflict
                 correction locally at the data warehouse. Note that all
                 these previous approaches handle the data updates {\em
                 one at a time}. Hence either some of the information
                 sources or the data warehouse is likely to be idle
                 during most of the maintenance process. In this paper,
                 we propose that a set of updates should be maintained
                 in parallel by several concurrent maintenance processes
                 so that both the information sources as well as the
                 warehouse would be utilized more fully throughout the
                 maintenance process. This parallelism should then
                 improve the overall maintenance performance. For this
                 we have developed a parallel view maintenance
                 algorithm, called PVM, that substantially improves upon
                 the performance of previous maintenance approaches by
                 handling a set of data updates at the same time. The
                 parallel handling of a set of updates is orthogonal to
                 the particular maintenance algorithm applied to the
                 handling of each individual update. In order to perform
                 parallel view maintenance, we have identified two
                 critical issues that must be overcome: (1) detecting
                 maintenance-concurrent data updates in a parallel mode
                 and (2) correcting the problem that the data warehouse
                 commit order may not correspond to the data warehouse
                 update processing order due to parallel maintenance
                 handling. In this work, we provide solutions to both
                 issues. For the former, we insert a middle-layer
                 timestamp assignment module for detecting
                 maintenance-concurrent data updates without requiring
                 any global clock synchronization. For the latter, we
                 introduce the negative counter concept to solve the
                 problem of variant orders of committing effects of data
                 updates to the data warehouse. We provide a proof of
                 the correctness of PVM that guarantees that our
                 strategy indeed generates the correct final data
                 warehouse state. We have implemented both SWEEP and PVM
                 in our EVE data warehousing system. Our performance
                 study demonstrates that a manyfold performance
                 improvement is achieved by PVM over SWEEP.",
  acknowledgement = ack-nhfb,
  fjournal =     "VLDB Journal: Very Large Data Bases",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J869",
  keywords =     "concurrent data updates; data warehousing; parallel
                 view maintenance; performance evaluation",
}

@Article{Almasi:2005:DIM,
  author =       "G. Alm{\'a}si and C. Archer and J. G. Casta{\~n}os and
                 J. A. Gunnels and C. C. Erway and P. Heidelberger and
                 X. Martorell and J. E. Moreira and K. Pinnow and J.
                 Ratterman and B. D. Steinmacher-Burow and W. Gropp and
                 B. Toonen",
  title =        "Design and implementation of message-passing services
                 for the {Blue Gene/L} supercomputer",
  journal =      j-IBM-JRD,
  volume =       "49",
  number =       "2/3",
  pages =        "393--406",
  month =        "????",
  year =         "2005",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Wed Jun 1 08:14:41 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/492/almasi.pdf",
  abstract =     "The Blue Gene/L (BG/L) supercomputer, with 65,536
                 dual-processor compute nodes, was designed from the
                 ground up to support efficient execution of massively
                 parallel message-passing programs. Part of this support
                 is an optimized implementation of the Message Passing
                 Interface (MPI), which leverages the hardware features
                 of BG/L. MPI for BG/L is implemented on top of a more
                 basic message-passing infrastructure called the message
                 layer. This message layer can be used both to implement
                 other higher-level libraries and directly by
                 applications. MPI and the message layer are used in the
                 two BG/L modes of operation: the coprocessor mode and
                 the virtual node mode. Performance measurements show
                 that our message-passing services deliver performance
                 close to the hardware limits of the machine. They also
                 show that dedicating one of the processors of a node to
                 communication functions (coprocessor mode) greatly
                 improves the message-passing bandwidth, whereas running
                 two processes per compute node (virtual node mode) can
                 have a positive impact on application performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  ordernumber =  "G322-0240",
}

@Article{Aversa:2005:HDS,
  author =       "Rocco Aversa and Beniamino {Di Martino} and Nicola
                 Mazzocca and Salvatore Venticinque",
  title =        "A hierarchical distributed-shared memory parallel
                 {Branch \& Bound} application with {PVM} and {OpenMP}
                 for multiprocessor clusters",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "31",
  number =       "10--12",
  pages =        "1034--1047",
  month =        oct # "\slash " # dec,
  year =         "2005",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:04 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Aversa:2005:PPT,
  author =       "Rocco Aversa and Beniamino {Di Martino} and
                 Massimiliano Rak and Salvatore Venticinque and Umberto
                 Villano",
  title =        "Performance prediction through simulation of a hybrid
                 {MPI\slash OpenMP} application",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "31",
  number =       "10--12",
  pages =        "1013--1033",
  month =        oct # "\slash " # dec,
  year =         "2005",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:04 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Bernaschi:2005:ERA,
  author =       "Massimo Bernaschi and Giulio Iannello and Saverio
                 Crea",
  title =        "Experimental Results About {MPI} Collective
                 Communication Operations",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "15",
  number =       "1/2",
  pages =        "223--236",
  month =        mar # "\slash " # jun,
  year =         "2005",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626405002179",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Bhanot:2005:OTL,
  author =       "G. Bhanot and A. Gara and P. Heidelberger and E.
                 Lawless and J. C. Sexton and R. Walkup",
  title =        "Optimizing task layout on the {Blue Gene/L}
                 supercomputer",
  journal =      j-IBM-JRD,
  volume =       "49",
  number =       "2/3",
  pages =        "489--500",
  month =        "????",
  year =         "2005",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Wed Jun 1 08:14:41 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/492/bhanot.pdf",
  abstract =     "A general method for optimizing problem layout on the
                 Blue Gene/L (BG/L) supercomputer is described. The
                 method takes as input the communication matrix of an
                 arbitrary problem as an array with entries $ C(i, j) $,
                 which represents the data communicated from domain $i$
                 to domain $j$. Given $ C(i, j) $, we implement a
                 heuristic map that attempts to sequentially map a
                 domain and its communication neighbors either to the
                 same BG/L node or to near-neighbor nodes on the BG/L
                 torus, while keeping the number of domains mapped to a
                 BG/L node constant. We then generate a Markov chain of
                 maps using Monte Carlo simulation with free energy $ F
                 = \sum_{i, j} C(i, j)H(i, j) $, where $ H(i, j) $ is
                 the smallest number of hops on the BG/L torus between
                 domain $i$ and domain $j$. For two large parallel
                 applications, SAGE and UMT2000, the method was tested
                 against the default Message Passing Interface rank
                 order layout on up to 2,048 BG/L nodes. It produced
                 maps that improved communication efficiency by up to
                 45\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  ordernumber =  "G322-0240",
}

@Article{Blikberg:2005:LBO,
  author =       "R. Blikberg and T. S{\o}revik",
  title =        "Load balancing and {OpenMP} implementation of nested
                 parallelism",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "31",
  number =       "10--12",
  pages =        "984--998",
  month =        oct # "\slash " # dec,
  year =         "2005",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:04 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Brightwell:2005:AIO,
  author =       "Ron Brightwell and Rolf Riesen and Keith D.
                 Underwood",
  title =        "Analyzing the Impact of Overlap, Offload, and
                 Independent Progress for {Message Passing Interface}
                 Applications",
  journal =      j-IJHPCA,
  volume =       "19",
  number =       "2",
  pages =        "103--117",
  month =        "Summer",
  year =         "2005",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342005054257",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/19/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/19/2/103.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Chan:2005:CCI,
  author =       "Albert Chan and Frank Dehne and Ryan Taylor",
  title =        "{CGMGRAPH\slash CGMLIB}: Implementing and Testing
                 {CGM} Graph Algorithms on {PC} Clusters and Shared
                 Memory Machines",
  journal =      j-IJHPCA,
  volume =       "19",
  number =       "1",
  pages =        "81--97",
  month =        "Spring",
  year =         "2005",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342005051196",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/19/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/19/1/81.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "https://journals.sagepub.com/home/hpc",
}

@Article{Chapman:2005:O,
  author =       "Barbara M. Chapman and Federico Massaioli",
  title =        "{OpenMP}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "31",
  number =       "10--12",
  pages =        "957--959",
  month =        oct # "\slash " # dec,
  year =         "2005",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:04 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Dalcin:2005:MP,
  author =       "Lisandro Dalc{\'\i}n and Rodrigo Paz and Mario
                 Storti",
  title =        "{MPI} for {Python}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "65",
  number =       "9",
  pages =        "1108--1115",
  month =        sep,
  year =         "2005",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:33 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Duran:2005:RAP,
  author =       "A. Duran and R. Silvera and J. Corbalan and J.
                 Labarta",
  booktitle =    "Shared Memory Parallel Programming with {OpenMP}",
  title =        "Runtime Adjustment of Parallel Nested Loops",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "3349",
  pages =        "137--??",
  year =         "2005",
  bibdate =      "Mon Oct 07 09:29:01 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Eleftheriou:2005:SFF,
  author =       "M. Eleftheriou and B. G. Fitch and A. Rayshubskiy and
                 T. J. C. Ward and R. S. Germain",
  title =        "Scalable framework for {$3$D} {FFTs} on the {Blue
                 Gene/L} supercomputer: Implementation and early
                 performance measurements",
  journal =      j-IBM-JRD,
  volume =       "49",
  number =       "2/3",
  pages =        "457--464",
  month =        "????",
  year =         "2005",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Wed Jun 1 08:14:41 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/492/eleftheriou.pdf",
  abstract =     "This paper presents results on a
                 communications-intensive kernel, the three-dimensional
                 fast Fourier transform (3D FFT), running on the
                 2,048-node Blue Gene/L (BG/L) prototype. Two
                 implementations of the volumetric FFT algorithm were
                 characterized, one built on the Message Passing
                 Interface library and another built on an active packet
                 Application Program Interface supported by the hardware
                 bring-up environment, the BG/L advanced diagnostics
                 environment. Preliminary performance experiments on the
                 BG/L prototype indicate that both of our
                 implementations scale well up to 1,024 nodes for $3$D
                 FFTs of size $ 128 \times 128 \times 128 $. The
                 performance of the volumetric FFT is also compared with
                 that of the Fastest Fourier Transform in the West
                 (FFTW) library. In general, the volumetric FFT
                 outperforms a port of the FFTW Version 2.1.5 library on
                 large-node-count partitions.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  ordernumber =  "G322-0240",
}

@Article{Florez:2005:LMM,
  author =       "German Florez and Zhen Liu and Susan M. Bridges and
                 Anthony Skjellum and Rayford B. Vaughn",
  title =        "Lightweight monitoring of {MPI} programs in real
                 time",
  journal =      j-CCPE,
  volume =       "17",
  number =       "13",
  pages =        "1547--1578",
  month =        nov,
  year =         "2005",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.889",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Oct 4 06:07:02 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "24 Jun 2005",
}

@Article{Floros:2005:TGS,
  author =       "Evangelos Floros and Yiannis Cotronis",
  title =        "Towards a {Grid} Services Based Framework for the
                 Virtualization, Execution and Composition of {MPI}
                 Applications",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "15",
  number =       "1/2",
  pages =        "85--98",
  month =        mar # "\slash " # jun,
  year =         "2005",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626405002076",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Gabriel:2005:EDC,
  author =       "Edgar Gabriel and Graham E. Fagg and Jack J.
                 Dongarra",
  title =        "Evaluating Dynamic Communicators and One-Sided
                 Operations for Current {MPI} Libraries",
  journal =      j-IJHPCA,
  volume =       "19",
  number =       "1",
  pages =        "67--79",
  month =        "Spring",
  year =         "2005",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342005051197",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/19/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/19/1/67.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "https://journals.sagepub.com/home/hpc",
}

@Article{Grove:2005:CBP,
  author =       "D. A. Grove and P. D. Coddington",
  title =        "Communication Benchmarking and Performance Modelling
                 of {MPI} Programs on Cluster Computers",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "34",
  number =       "2",
  pages =        "201--217",
  month =        nov,
  year =         "2005",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-005-2340-2",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 9 17:32:26 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=34&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=34&issue=2&spage=201",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  keywords =     "cluster computing; parallel computing; performance
                 modelling",
}

@Article{Hadjidoukas:2005:OEM,
  author =       "P. E. Hadjidoukas and T. S. Papatheodorou",
  title =        "{OpenMP} extensions for master-slave message passing
                 computing",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "31",
  number =       "10--12",
  pages =        "1155--1167",
  month =        oct # "\slash " # dec,
  year =         "2005",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:04 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Huang:2005:TME,
  author =       "Lei Huang and Barbara Chapman and Zhenying Liu",
  title =        "Towards a more efficient implementation of {OpenMP}
                 for clusters via translation to global arrays",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "31",
  number =       "10--12",
  pages =        "1114--1139",
  month =        oct # "\slash " # dec,
  year =         "2005",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:04 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Hurwitz:2005:AMP,
  author =       "Justin (Gus) Hurwitz and Wu-chun Feng",
  title =        "Analyzing {MPI} performance over 10-Gigabit
                 {Ethernet}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "65",
  number =       "10",
  pages =        "1253--1260",
  month =        oct,
  year =         "2005",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:34 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Ierotheou:2005:GOC,
  author =       "C. S. Ierotheou and H. Jin and G. Matthews and S. P.
                 Johnson and R. Hood",
  title =        "Generating {OpenMP} code using an interactive
                 parallelization environment",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "31",
  number =       "10--12",
  pages =        "999--1012",
  month =        oct # "\slash " # dec,
  year =         "2005",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:04 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Izaguirre:2005:PMS,
  author =       "Jes{\'u}s A. Izaguirre and Scott S. Hampton and
                 Thierry Matthey",
  title =        "Parallel multigrid summation for the {$N$}-body
                 problem",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "65",
  number =       "8",
  pages =        "949--962",
  month =        aug,
  year =         "2005",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:33 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  abstract =     "An $ \Theta (n) $ parallel multigrid summation method
                 (MG) for the N-body problem is presented. The method
                 was originally devised for vacuum boundary conditions.
                 Here, it is extended to periodic boundary conditions
                 and implemented in parallel using force decomposition
                 and MPI. MG is based on a hierarchical decomposition of
                 computational kernels on multiple grids. For low
                 accuracy calculations, appropriate for molecular
                 dynamics, a sequential implementation is as fast or
                 faster than particle mesh Ewald (PME). Our parallel
                 implementation is more scalable than PME. The method
                 can be combined with multiple time stepping integrators
                 to produce a powerful simulation protocol for
                 simulation of biological molecules and other materials.
                 The parallel implementation is tested on both a Linux
                 cluster with Myrinet interconnect and a shared memory
                 computer. It is available as open-source at
                 http://protomol.sourceforge.net. An auxiliary tool
                 allows the automatic selection of optimal parameters
                 for MG, and is available at
                 http://mdsimaid.cse.nd.edu.",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Jost:2005:WMP,
  author =       "G. Jost and J. Labarta and J. Gimenez",
  editor =       "????",
  booktitle =    "Shared Memory Parallel Programming with {OpenMP}",
  title =        "What Multilevel Parallel Programs do when you are not
                 watching: a Performance analysis case study comparing
                 {MPI\slash OpenMP}, {MLP}, and {Nested OpenMP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "3349",
  pages =        "29--??",
  year =         "2005",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Oct 07 09:04:25 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Jung:2005:DIM,
  author =       "Hyungsoo Jung and Dongin Shin and Hyuck Han and Jai W.
                 Kim and Heon Y. Yeom and Jongsuk Lee",
  title =        "Design and Implementation of Multiple Fault-Tolerant
                 {MPI} over {Myrinet} ({$ M^3 $})",
  crossref =     "ACM:2005:PAI",
  pages =        "32--32",
  year =         "2005",
  bibdate =      "Tue Dec 27 07:58:16 MST 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Kamal:2005:SVT,
  author =       "Humaira Kamal and Brad Penoff and Alan Wagner",
  title =        "{SCTP} versus {TCP} for {MPI}",
  crossref =     "ACM:2005:PAI",
  pages =        "30--30",
  year =         "2005",
  bibdate =      "Tue Dec 27 07:58:16 MST 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Kappiah:2005:JTD,
  author =       "Nandini Kappiah and Vincent W. Freeh and David K.
                 Lowenthal",
  title =        "Just In Time Dynamic Voltage Scaling: Exploiting
                 Inter-Node Slack to Save Energy in {MPI} Programs",
  crossref =     "ACM:2005:PAI",
  pages =        "33--33",
  year =         "2005",
  bibdate =      "Tue Dec 27 07:58:16 MST 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Karwande:2005:MPC,
  author =       "Amit Karwande and Xin Yuan and David K. Lowenthal",
  title =        "An {MPI} prototype for compiled communication on
                 {Ethernet} switched clusters",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "65",
  number =       "10",
  pages =        "1123--1133",
  month =        oct,
  year =         "2005",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:34 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Misc{Kepner:2005:PPM,
  author =       "Jeremy Kepner",
  title =        "Parallel Programming with {MatlabMPI}",
  howpublished = "World-Wide Web site.",
  year =         "2005",
  bibdate =      "Mon Dec 05 08:36:15 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.ll.mit.edu/MatlabMPI/",
  abstract =     "MatlabMPI is a set of Matlab scripts that implement a
                 subset of MPI and allow any Matlab program to be run on
                 a parallel computer. The key innovation of MatlabMPI is
                 that it implements the widely used MPI ``look and
                 feel'' on top of standard Matlab file i/o, resulting in
                 a ``pure'' Matlab implementation that is exceedingly
                 small (about 300 lines of code). Thus, MatlabMPI will
                 run on any combination of computers that Matlab
                 supports. In addition, because of its small size, it is
                 simple to download and use (and modify if you like).",
  acknowledgement = ack-nhfb,
  keywords =     "Matlab; MatlabMPI; MPI; parallel processing",
}

@Article{Kranzlmuller:2005:RAP,
  author =       "Dieter Kranzlm{\"u}ller and Peter Kacsuk and Jack
                 Dongarra",
  title =        "Recent Advances in {Parallel Virtual Machine} and
                 {Message Passing Interface}",
  journal =      j-IJHPCA,
  volume =       "19",
  number =       "2",
  pages =        "99--101",
  month =        "Summer",
  year =         "2005",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342005054256",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/19/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/19/2/99.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "https://journals.sagepub.com/home/hpc",
}

@Article{Liu:2005:EIO,
  author =       "Z. Liu and L. Huang and B. Chapman and T. Weng",
  booktitle =    "Shared Memory Parallel Programming with {OpenMP}",
  title =        "Efficient Implementation of {OpenMP} for Clusters with
                 Implicit Data Distribution",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "3349",
  pages =        "121--??",
  year =         "2005",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Oct 07 09:16:10 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Marowka:2005:EMT,
  author =       "Ami Marowka",
  title =        "Execution model of three parallel languages: {OpenMP},
                 {UPC} and {CAF}",
  journal =      j-SCI-PROG,
  volume =       "13",
  number =       "2",
  pages =        "127--135",
  month =        "????",
  year =         "2005",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Wed Sep 1 14:50:28 MDT 2010",
  bibsource =    "http://www.iospress.nl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Martorell:2005:BGP,
  author =       "X. Martorell and N. Smeds and R. Walkup and J. R.
                 Brunheroto and G. Alm{\'a}si and J. A. Gunnels and L.
                 DeRose and J. Labarta and F. Escal{\'e} and J.
                 Gim{\'e}nez and H. Servat and J. E. Moreira",
  title =        "{Blue Gene/L} performance tools",
  journal =      j-IBM-JRD,
  volume =       "49",
  number =       "2/3",
  pages =        "407--424",
  month =        "????",
  year =         "2005",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Wed Jun 1 08:14:41 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/492/martorell.pdf",
  abstract =     "Good performance monitoring is the basis of modern
                 performance analysis tools for application
                 optimization. We are providing a variety of such
                 performance analysis tools for the new Blue Gene/L
                 supercomputer. Those tools can be divided into two
                 categories: single-node performance tools and multinode
                 performance tools. From a single-node perspective, we
                 provide standard interfaces and libraries, such as PAPI
                 and libHPM, that provide access to the hardware
                 performance counters for applications running on the
                 Blue Gene/L compute nodes. From a multinode
                 perspective, we focus on tools that analyze Message
                 Passing Interface (MPI) behavior. Those tools work by
                 first collecting message-passing trace data when a
                 program runs. The trace data is then used by graphical
                 interface tools that analyze the behavior of
                 applications. Using the current prototype tools, we
                 demonstrate their usefulness and applicability with
                 case studies of application optimization.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  ordernumber =  "G322-0240",
}

@Article{Massaioli:2005:OPA,
  author =       "Federico Massaioli and Filippo Castiglione and Massimo
                 Bernaschi",
  title =        "{OpenMP} parallelization of agent-based models",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "31",
  number =       "10--12",
  pages =        "1066--1081",
  month =        oct # "\slash " # dec,
  year =         "2005",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:04 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Book{Mattson:2005:PPP,
  author =       "Timothy G. Mattson and Beverly A. Sanders and Berna
                 Massingill",
  title =        "Patterns for Parallel Programming",
  publisher =    pub-AW,
  address =      pub-AW:adr,
  pages =        "xiii + 355",
  year =         "2005",
  ISBN =         "0-321-22811-1 (hardcover)",
  ISBN-13 =      "978-0-321-22811-6 (hardcover)",
  LCCN =         "QA76.642 .M38 2005",
  bibdate =      "Sat Oct 5 10:09:33 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  URL =          "http://www.loc.gov/catdir/toc/ecip0418/2004013240.html",
  abstract =     "The Parallel Programming Guide for Every Software
                 Developer. From grids and clusters to next-generation
                 game consoles, parallel computing is going mainstream.
                 Innovations such as Hyper-Threading Technology,
                 HyperTransport Technology, and multicore
                 microprocessors from IBM, Intel, and Sun are
                 accelerating the movement's growth. Only one thing is
                 missing: programmers with the skills to meet the
                 soaring demand for parallel software. That's where
                 Patterns for Parallel Programming comes in. It's the
                 first parallel programming guide written specifically
                 to serve working software developers, not just computer
                 scientists. The authors introduce a complete, highly
                 accessible pattern language that will help any
                 experienced developer ``think parallel''---and start
                 writing effective parallel code almost immediately.
                 Instead of formal theory, they deliver proven solutions
                 to the challenges faced by parallel programmers, and
                 pragmatic guidance for using today's parallel APIs in
                 the real world. Coverage includes: Understanding the
                 parallel computing landscape and the challenges faced
                 by parallel developers; Finding the concurrency in a
                 software design problem and decomposing it into
                 concurrent tasks; Managing the use of data across tasks;
                 Creating an algorithm structure that effectively
                 exploits the concurrency you've identified; Connecting
                 your algorithmic structures to the APIs needed to
                 implement them; Specific software constructs for
                 implementing parallel programs; Working with today's
                 leading parallel programming environments: OpenMP, MPI,
                 and Java. Patterns have helped thousands of programmers
                 master object-oriented development and other complex
                 programming technologies. With this book, you will
                 learn that they're the best way to master parallel
                 programming too.",
  acknowledgement = ack-nhfb,
  author-dates = "1958--",
  subject =      "Parallel programming (Computer science)",
  tableofcontents = "1: A pattern language for parallel programming \\
                 2: Background and jargon of parallel computing \\
                 3: The finding concurrency design space \\
                 4: The algorithm structure design space \\
                 5: The supporting structures design space \\
                 6: The implementation mechanisms design space \\
                 Appendix A: A brief introduction to OpenMP \\
                 Appendix B: A brief introduction to MPI \\
                 Appendix C: A brief introduction to concurrent
                 programming in Java",
}

@InProceedings{Mavriplis:2005:HRAa,
  author =       "Dimitri J. Mavriplis and Michael J. Aftosmis and
                 Marsha Berger",
  title =        "High Resolution Aerospace Applications using the {NASA
                 Columbia Supercomputer}",
  crossref =     "ACM:2005:PAI",
  pages =        "61--61",
  year =         "2005",
  bibdate =      "Tue Dec 27 07:58:16 MST 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "This paper focuses on the parallel performance of two
                 high-performance aerodynamic simulation packages on the
                 newly installed NASA Columbia supercomputer. These
                 packages include both a high-fidelity, unstructured,
                 Reynolds-averaged Navier--Stokes solver, and a
                 fully-automated inviscid flow package for cut-cell
                 Cartesian grids. The complementary combination of these
                 two simulation codes enables high-fidelity
                 characterization of aerospace vehicle design
                 performance over the entire flight envelope through
                 extensive parametric analysis and detailed simulation
                 of critical regions of the flight envelope. Both
                 packages are industrial-level codes designed for
                 complex geometry and incorporate customized multigrid
                 solution algorithms. The performance of these codes on
                 Columbia is examined using both MPI and OpenMP and
                 using both the NUMAlink and InfiniBand interconnect
                 fabrics. Numerical results demonstrate good scalability
                 on up to 2016 cpus using the NUMAlink4 interconnect,
                 with measured computational rates in the vicinity of 3
                 TFLOP/s, while InfiniBand showed some performance
                 degradation at high CPU counts, particularly with
                 multigrid. Nonetheless, the results are encouraging
                 enough to indicate that larger test cases using
                 combined MPI/OpenMP communication should scale well on
                 even more processors.",
  acknowledgement = ack-nhfb,
  remark =       "Co-winner of best paper award. Also published in
                 \cite{Mavriplis:2007:HRAb}.",
}

@Article{Medvedev:2005:OMA,
  author =       "Dmitry M. Medvedev and Evelyn M. Goldfield and Stephen
                 K. Gray",
  title =        "An {OpenMP\slash MPI} approach to the parallelization
                 of iterative four-atom quantum mechanics",
  journal =      j-COMP-PHYS-COMM,
  volume =       "166",
  number =       "2",
  pages =        "94--108",
  day =          "1",
  month =        mar,
  year =         "2005",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2004.11.001",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Mon Feb 13 23:41:51 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465504005260",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Midorikawa:2005:PNM,
  author =       "Edson Toshimi Midorikawa and Helio Marci Oliveira and
                 Jean Marcos Laine",
  title =        "{PEMPIs}: a New Methodology for Modeling and
                 Prediction of {MPI} Programs Performance",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "33",
  number =       "5",
  pages =        "499--527",
  month =        oct,
  year =         "2005",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-005-7303-y",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:05:39 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=33&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=33&issue=5&spage=499",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "analytical modeling; graphical models; message
                 passing; MPI; performance prediction",
}

@Article{Nagle:2005:BRM,
  author =       "Dan Nagle",
  title =        "Book Review: {{\em MPI --- The Complete Reference,
                 Vol. 1, The MPI Core}, 2nd ed., Scientific and
                 Engineering Computation Series, by Marc Snir, Steve
                 Otto, Steven Huss-Lederman, David Walker and Jack
                 Dongarra}",
  journal =      j-SCI-PROG,
  volume =       "13",
  number =       "1",
  pages =        "57--63",
  month =        "????",
  year =         "2005",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Wed Sep 1 14:50:28 MDT 2010",
  bibsource =    "http://www.iospress.nl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Nakajima:2005:PIS,
  author =       "Kengo Nakajima",
  title =        "Parallel iterative solvers for finite-element methods
                 using an {OpenMP\slash MPI} hybrid programming model on
                 the {Earth Simulator}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "31",
  number =       "10--12",
  pages =        "1048--1065",
  month =        oct # "\slash " # dec,
  year =         "2005",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:04 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Nakajima:2005:TLH,
  author =       "Kengo Nakajima",
  title =        "Three-level hybrid vs. flat {MPI} on the {Earth
                 Simulator}: Parallel iterative solvers for
                 finite-element method",
  journal =      j-APPL-NUM-MATH,
  volume =       "54",
  number =       "2",
  pages =        "237--255",
  month =        jul,
  year =         "2005",
  CODEN =        "ANMAEL",
  ISSN =         "0168-9274 (print), 1873-5460 (electronic)",
  ISSN-L =       "0168-9274",
  bibdate =      "Tue Aug 24 11:17:20 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/01689274",
  acknowledgement = ack-nhfb,
  fjournal =     "Applied Numerical Mathematics: Transactions of IMACS",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01689274",
}

@article{Norcen:2005:HPJ,
  author          = {Roland Norcen and Andreas Uhl},
  title           = {High performance {JPEG 2000} and {MPEG-4 VTC}
                     on {SMPs} using {OpenMP}},
  journal         = j-PARALLEL-COMPUTING,
  fjournal        = {Parallel Computing},
  year            = {2005},
  month           = oct # "\slash " # dec,
  volume          = {31},
  number          = {10--12},
  pages           = {1082--1098},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  coden           = {PACOEJ},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
  bibdate         = {Thu Sep 2 17:51:04 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Orlando:2005:PSP,
  author =       "Salvatore Orlando and Domenico Laforenza",
  title =        "Preface: Selected Papers from the {EUROPVM\slash MPI
                 2003 Conference, Venice, Italy, 29 September--2 October
                 2003}",
  journal =      j-IJHPCA,
  volume =       "19",
  number =       "1",
  pages =        "47--47",
  month =        "Spring",
  year =         "2005",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342005051520",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/19/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/19/1/47.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@article{Park:2005:SOA,
  author          = {Inho Park and Seon Wook Kim},
  title           = {Study of {OpenMP} applications on the
                     {InfiniBand}-based software distributed
                     shared-memory system},
  journal         = j-PARALLEL-COMPUTING,
  fjournal        = {Parallel Computing},
  year            = {2005},
  month           = oct # "\slash " # dec,
  volume          = {31},
  number          = {10--12},
  pages           = {1099--1113},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  coden           = {PACOEJ},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
  bibdate         = {Thu Sep 2 17:51:04 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@book{Pharr:2005:GGP,
  editor          = {Matt Pharr and Randima Fernando},
  title           = {{GPU} gems 2: programming techniques for
                     high-performance graphics and general-purpose
                     computation},
  series          = {GPU gems},
  volume          = {2},
  publisher       = pub-AW,
  address         = pub-AW:adr,
  year            = {2005},
  pages           = {xlix + 814},
  isbn            = {0-321-33559-7 (hardcover)},
  isbn-13         = {978-0-321-33559-3 (hardcover)},
  lccn            = {T385 .G688 2005},
  url             = {http://www-docs.tu-cottbus.de/bibliothek/public/katalog/420569.PDF;
                     http://www.loc.gov/catdir/toc/ecip055/2004030181.html},
  keywords        = {CUDA; nVIDIA},
  remark          = {CD-ROM contents: Complementary examples and samples.},
  abstract        = {This sequel to the best-selling, first volume of GPU
                     Gems details the latest programming techniques for
                     today's graphics processing units (GPUs). As GPUs find
                     their way into mobile phones, handheld gaming devices,
                     and consoles, GPU expertise is even more critical in
                     today's competitive environment. Real-time graphics
                     programmers will discover the latest algorithms for
                     creating advanced visual effects, strategies for
                     managing complex scenes, and techniques for advanced
                     image processing. Readers will also learn new methods
                     for using the substantial processing power of the GPU
                     in other computationally intensive applications, such
                     as scientific computing and finance. Twenty of the
                     book's forty-eight chapters are devoted to GPGPU
                     programming, from basic concepts to advanced
                     techniques. Written by experts in cutting-edge GPU
                     programming, this book offers readers practical means
                     to harness the enormous capabilities of GPUs.},
  bibdate         = {Thu Jul 29 13:36:54 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/numana2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/unix.bib;
                     z3950.loc.gov:7090/Voyager},
  acknowledgement = ack-nhfb,
}

@InProceedings{Pjesivac-Grbovic:2005:PAM,
  author =       "J. Pjesivac-Grbovic and T. Angskun and G. Bosilca and
                 G. E. Fagg and E. Gabriel and J. J. Dongarra",
  title =        "Performance Analysis of {MPI} Collective Operations",
  crossref =     "IEEE:2005:IPD",
  pages =        "272a--272a",
  year =         "2005",
  bibdate =      "Fri May 27 10:13:34 2005",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Article{Rantakokko:2005:DMO,
  author =       "Jarmo Rantakokko",
  title =        "A Dynamic {MPI--OpenMP} Model for Structured Adaptive
                 Mesh Refinement",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "15",
  number =       "1--2",
  pages =        "37--47",
  month =        mar # "\slash " # jun,
  year =         "2005",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626405002040",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@article{Roberti:2005:PIL,
  author          = {Debora R. Roberti and Roberto P. Souto and
                     Haroldo F. Campos Velho and Gervasio A. Degrazia and
                     Domenico Anfossi},
  title           = {Parallel Implementation of a {Lagrangian}
                     Stochastic Model for Pollutant Dispersion},
  journal         = j-INT-J-PARALLEL-PROG,
  fjournal        = {International Journal of Parallel Programming},
  year            = {2005},
  month           = oct,
  volume          = {33},
  number          = {5},
  pages           = {485--498},
  doi             = {https://doi.org/10.1007/s10766-005-7302-z},
  url             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=33&issue=5&spage=485},
  issn            = {0885-7458 (print), 1573-7640 (electronic)},
  issn-l          = {0885-7458},
  coden           = {IJPPE5},
  journal-url     = {http://link.springer.com/journal/10766},
  keywords        = {High performance application; MPI; pollutant
                     dispersion},
  bibdate         = {Wed Jul 9 16:05:39 MDT 2008},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=33&issue=5;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rufai:2005:MPO,
  author =       "Raimi Rufai and Muslim Bozyigit and Jaralla Alghamdi
                 and Moataz Ahmed",
  title =        "Multithreaded Parallelism with {OpenMP}",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "15",
  number =       "4",
  pages =        "367--378",
  month =        dec,
  year =         "2005",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626405002283",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Sankaran:2005:LMC,
  author =       "Sriram Sankaran and Jeffrey M. Squyres and Brian
                 Barrett and Vishal Sahay and Andrew Lumsdaine and Jason
                 Duell and Paul Hargrove and Eric Roman",
  title =        "The {LAM\slash MPI} Checkpoint\slash Restart
                 Framework: System-Initiated Checkpointing",
  journal =      j-IJHPCA,
  volume =       "19",
  number =       "4",
  pages =        "479--493",
  month =        "Winter",
  year =         "2005",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342005056139",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/19/4.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/19/4/479.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Santhanaraman:2005:DZC,
  author =       "Gopalakrishnan Santhanaraman and Jiesheng Wu and Wei
                 Huang and Dhabaleswar K. Panda",
  title =        "Designing Zero-Copy {Message Passing Interface}
                 Derived Datatype Communication Over {Infiniband}:
                 Alternative Approaches and Performance Evaluation",
  journal =      j-IJHPCA,
  volume =       "19",
  number =       "2",
  pages =        "129--142",
  month =        "Summer",
  year =         "2005",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342005054259",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/19/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/19/2/129.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@article{Selikhov:2005:CMB,
  author          = {A. Selikhov and C. Germain},
  title           = {A {Channel Memory} based fault tolerance for
                     {MPI} applications},
  journal         = j-FUT-GEN-COMP-SYS,
  fjournal        = {Future Generation Computer Systems},
  year            = {2005},
  month           = may,
  volume          = {21},
  number          = {5},
  pages           = {709--715},
  issn            = {0167-739X (print), 1872-7115 (electronic)},
  issn-l          = {0167-739X},
  coden           = {FGSEVI},
  journal-url     = {http://www.sciencedirect.com/science/journal/0167739X},
  bibdate         = {Fri Jul 15 08:00:46 MDT 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sciencedirect.com/science/journal/0167739X},
  acknowledgement = ack-nhfb,
}

@Book{Sloan:2005:HPL,
  author =       "Joseph D. Sloan",
  title =        "High performance {Linux} clusters with {OSCAR},
                 {Rocks}, {openMosix}, and {MPI}",
  publisher =    pub-ORA,
  address =      pub-ORA:adr,
  pages =        "xv + 350",
  year =         "2005",
  ISBN =         "0-596-00570-9",
  ISBN-13 =      "978-0-596-00570-2",
  LCCN =         "QA76.58; QA76.58 .S56 2005eb; QA76.58 .S56 2005;
                 QA76.58 .S58 2005; QA76.58 .S595 2005",
  bibdate =      "Tue Aug 5 17:41:39 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 melvyl.cdlib.org:210/CDL90",
  URL =          "http://www.oreilly.com/catalog/9780596005702",
  acknowledgement = ack-nhfb,
  subject =      "Linux; Parallel processing (Electronic computers);
                 Electronic data processing; Distributed processing",
}

@Article{Thakur:2005:OCC,
  author =       "Rajeev Thakur and Rolf Rabenseifner and William
                 Gropp",
  title =        "Optimization of Collective Communication Operations in
                 {MPICH}",
  journal =      j-IJHPCA,
  volume =       "19",
  number =       "1",
  pages =        "49--66",
  month =        "Spring",
  year =         "2005",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342005051521",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/19/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/19/1/49.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Thakur:2005:OSO,
  author =       "Rajeev Thakur and William Gropp and Brian Toonen",
  title =        "Optimizing the Synchronization Operations in {Message
                 Passing Interface} One-Sided Communication",
  journal =      j-IJHPCA,
  volume =       "19",
  number =       "2",
  pages =        "119--128",
  month =        "Summer",
  year =         "2005",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342005054258",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/19/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/19/2/119.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@article{Tian:2005:CEN,
  author          = {Xinmin Tian and Jay P. Hoeflinger and Grant Haab
                     and Yen-Kuang Chen and Milind Girkar and
                     Sanjiv Shah},
  title           = {A compiler for exploiting nested parallelism
                     in {OpenMP} programs},
  journal         = j-PARALLEL-COMPUTING,
  fjournal        = {Parallel Computing},
  year            = {2005},
  month           = oct # "\slash " # dec,
  volume          = {31},
  number          = {10--12},
  pages           = {960--983},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  coden           = {PACOEJ},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
  bibdate         = {Thu Sep 2 17:51:04 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Tian:2005:PCT,
  author          = {Xinmin Tian and Milind Girkar and Aart Bik and
                     Hideki Saito},
  title           = {Practical Compiler Techniques on Efficient
                     Multithreaded Code Generation for {OpenMP}
                     Programs},
  journal         = j-COMP-J,
  fjournal        = {The Computer Journal},
  year            = {2005},
  month           = sep,
  volume          = {48},
  number          = {5},
  pages           = {588--601},
  doi             = {https://doi.org/10.1093/comjnl/bxh109},
  url             = {http://comjnl.oxfordjournals.org/cgi/content/abstract/48/5/588;
                     http://comjnl.oxfordjournals.org/cgi/reprint/48/5/588},
  issn            = {0010-4620 (print), 1460-2067 (electronic)},
  issn-l          = {0010-4620},
  coden           = {CMPJA6},
  journal-url     = {http://comjnl.oxfordjournals.org/},
  bibdate         = {Tue Nov 8 05:58:50 MST 2005},
  bibsource       = {http://comjnl.oxfordjournals.org/content/vol48/issue5/index.dtl;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Wiese:2005:IPN,
  author =       "Kay C. Wiese and Andrew Hendriks and Alain Deschenes
                 and Belgacem {Ben Youssef}",
  title =        "The Impact of Pseudorandom Number Quality on
                 {P-RnaPredict}, a Parallel Genetic Algorithm for {RNA}
                 Secondary Structure Prediction",
  crossref =     "Beyer:2005:GEC",
  pages =        "479--480",
  year =         "2005",
  DOI =          "https://doi.org/10.1145/1068009.1068089",
  bibdate =      "Mon Mar 5 22:02:35 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.cs.bham.ac.uk/~wbl/biblio/gecco2005lbp/papers/52-wiese.pdf",
  abstract =     "We present a parallel version of RnaPredict, a genetic
                 algorithm (GA) for RNA secondary structure prediction.
                 The research presented here builds on previous work and
                 examines the impact of three different pseudorandom
                 number generators (PRNGs) on the GA's performance. The
                 three generators tested are the C standard library PRNG
                 RAND, a parallelised multiplicative congruential
                 generator (MCG), and a parallelized Mersenne Twister
                 (MT). A fully parallel version of RnaPredict using the
                 Message Passing Interface (MPI) was implemented. The
                 PRNG comparison tests were performed with known
                 structures that are 118, 122, 543, and 556 nucleotides
                 in length. The effects of the PRNGs are investigated
                 and the predicted structures are compared to known
                 structures.",
  acknowledgement = ack-nhfb,
}

@article{Willcock:2005:UMC,
  author          = {Jeremiah Willcock and Andrew Lumsdaine and
                     Arch Robison},
  title           = {Using {MPI} with {C\#} and the
                     {Common Language Infrastructure}},
  journal         = j-CCPE,
  fjournal        = {Concurrency and Computation: Prac\-tice and
                     Experience},
  year            = {2005},
  month           = jun # "\slash " # jul,
  volume          = {17},
  number          = {7--8},
  pages           = {895--917},
  doi             = {https://doi.org/10.1002/cpe.861},
  issn            = {1532-0626 (print), 1532-0634 (electronic)},
  issn-l          = {1532-0626},
  coden           = {CCPEBO},
  journal-url     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {23 Feb 2005},
  bibdate         = {Sat May 14 11:30:57 MDT 2005},
  bibsource       = {http://www.interscience.wiley.com/jpages/1532-0626;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www3.interscience.wiley.com/journalfinder.html},
  acknowledgement = ack-nhfb,
}

@Article{Yu:2005:HPB,
  author =       "Weikuan Yu and Sayantan Sur and Dhabaleswar K. Panda
                 and Rob T. Aulwes and Rich L. Graham",
  title =        "High Performance Broadcast Support in {LA-MPI} Over
                 Quadrics",
  journal =      j-IJHPCA,
  volume =       "19",
  number =       "4",
  pages =        "453--463",
  month =        "Winter",
  year =         "2005",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342005056145",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/19/4.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/19/4/453.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@article{Zhang:2005:ULC,
  author          = {Youhui Zhang and Dongsheng Wong and Weimin Zheng},
  title           = {User-level checkpoint and recovery for
                     {LAM\slash MPI}},
  journal         = j-OPER-SYS-REV,
  fjournal        = {Operating Systems Review},
  year            = {2005},
  month           = jul,
  volume          = {39},
  number          = {3},
  pages           = {72--81},
  issn            = {0163-5980 (print), 1943-586X (electronic)},
  issn-l          = {0163-5980},
  coden           = {OSRED8},
  bibdate         = {Sat Aug 26 08:55:48 MDT 2006},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zheng:2005:SBP,
  author =       "Gengbin Zheng and Terry Wilmarth and Praveen
                 Jagadishprasad and Laxmikant V. Kal{\'e}",
  title =        "Simulation-Based Performance Prediction for Large
                 Parallel Machines",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "33",
  number =       "2--3",
  pages =        "183--207",
  month =        jun,
  year =         "2005",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-005-3582-6",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:05:27 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=33&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=33&issue=2&spage=183",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "adaptive MPI; CHARM++; computation modeling; large
                 parallel machines; Simulation-based performance
                 prediction",
}

@article{Ayguade:2006:ENO,
  author          = {Eduard Ayguade and Marc Gonzalez and
                     Xavier Martorell and Gabriele Jost},
  title           = {Employing nested {OpenMP} for the parallelization
                     of multi-zone computational fluid dynamics
                     applications},
  journal         = j-J-PAR-DIST-COMP,
  fjournal        = {Journal of Parallel and Distributed Computing},
  year            = {2006},
  month           = may,
  volume          = {66},
  number          = {5},
  pages           = {686--697},
  issn            = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l          = {0743-7315},
  coden           = {JPDCER},
  journal-url     = {http://www.sciencedirect.com/science/journal/07437315},
  bibdate         = {Fri Jul 11 20:32:34 MDT 2008},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sciencedirect.com/science/journal/07437315},
  acknowledgement = ack-nhfb,
}

@Article{Barton:2006:SMP,
  author =       "Christopher Barton and C{\u{a}}lin Cas{\c{c}}aval and
                 George Alm{\'a}si and Yili Zheng and Montse Farreras
                 and Siddhartha Chatterjee and Jos{\'e} Nelson Amaral",
  title =        "Shared memory programming for large scale machines",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "6",
  pages =        "108--117",
  month =        jun,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1133981.1133995",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:42:48 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "This paper describes the design and implementation of
                 a scalable run-time system and an optimizing compiler
                 for Unified Parallel C (UPC). An experimental
                 evaluation on BlueGene/L{\reg}, a distributed-memory
                 machine, demonstrates that the combination of the
                 compiler with the runtime system produces programs with
                 performance comparable to that of efficient MPI
                 programs and good performance scalability up to
                 hundreds of thousands of processors. Our runtime system
                 design solves the problem of maintaining shared object
                 consistency efficiently in a distributed memory
                 machine. Our compiler infrastructure simplifies the
                 code generated for parallel loops in UPC through the
                 elimination of affinity tests, eliminates several
                 levels of indirection for accesses to segments of
                 shared arrays that the compiler can prove to be local,
                 and implements remote update operations through a
                 lower-cost asynchronous message. The performance
                 evaluation uses three well-known benchmarks --- HPC
                 RandomAccess, HPC STREAM and NAS CG --- to obtain
                 scaling and absolute performance numbers for these
                 benchmarks on up to 131072 processors, the full
                 BlueGene/L machine. These results were used to win the
                 HPC Challenge Competition at SC05 in Seattle WA,
                 demonstrating that PGAS languages support both
                 productivity and performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "BlueGene; PGAS programming model; UPC",
}

@article{Battre:2006:MFP,
  author          = {Dominic Battr{\'e} and David Sigfredo Angulo},
  title           = {{MPI} framework for parallel searching in
                     large biological databases},
  journal         = j-J-PAR-DIST-COMP,
  fjournal        = {Journal of Parallel and Distributed Computing},
  year            = {2006},
  month           = dec,
  volume          = {66},
  number          = {12},
  pages           = {1503--1511},
  issn            = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l          = {0743-7315},
  coden           = {JPDCER},
  journal-url     = {http://www.sciencedirect.com/science/journal/07437315},
  bibdate         = {Fri Jul 11 20:32:35 MDT 2008},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sciencedirect.com/science/journal/07437315},
  acknowledgement = ack-nhfb,
}

@article{Becciani:2006:FMP,
  author          = {U. Becciani and M. Comparato and
                     V. Antonuccio-Delogu},
  title           = {{FLY MPI-2}: a parallel tree code for {LSS}},
  journal         = j-COMP-PHYS-COMM,
  fjournal        = {Computer Physics Communications},
  year            = {2006},
  month           = apr,
  day             = {1},
  volume          = {174},
  number          = {7},
  pages           = {605--606},
  doi             = {https://doi.org/10.1016/j.cpc.2006.01.002},
  url             = {http://www.sciencedirect.com/science/article/pii/S0010465506000713},
  issn            = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l          = {0010-4655},
  coden           = {CPHCBZ},
  journal-url     = {http://www.sciencedirect.com/science/journal/00104655},
  bibdate         = {Tue Feb 14 14:46:14 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib;
                     http://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Bouteiller:2006:HPS,
  author =       "Aur{\'e}lien Bouteiller and Hinde-Lilia Bouziane and
                 Thomas Herault and Pierre Lemarinier and Franck
                 Cappello",
  title =        "Hybrid Preemptive Scheduling of {Message Passing
                 Interface} Applications on {Grids}",
  journal =      j-IJHPCA,
  volume =       "20",
  number =       "1",
  pages =        "77--90",
  month =        "Spring",
  year =         "2006",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342006062526",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/20/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/20/1/77.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Bouteiller:2006:MVP,
  author =       "A. Bouteiller and T. Herault and G. Krawezik and P.
                 Lemarinier and F. Cappello",
  title =        "{MPICH-V} Project: a Multiprotocol Automatic
                 Fault-Tolerant {MPI}",
  journal =      j-IJHPCA,
  volume =       "20",
  number =       "3",
  pages =        "319--333",
  month =        "Fall",
  year =         "2006",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342006067469",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/20/3.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/20/3/319.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@article{Costa:2006:ROA,
  author          = {J. J. Costa and T. Cortes and X. Martorell and
                     E. Ayguade and J. Labarta},
  title           = {Running {OpenMP} applications efficiently on
                     an everything-shared {SDSM}},
  journal         = j-J-PAR-DIST-COMP,
  fjournal        = {Journal of Parallel and Distributed Computing},
  year            = {2006},
  month           = may,
  volume          = {66},
  number          = {5},
  pages           = {647--658},
  issn            = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l          = {0743-7315},
  coden           = {JPDCER},
  journal-url     = {http://www.sciencedirect.com/science/journal/07437315},
  bibdate         = {Fri Jul 11 20:32:34 MDT 2008},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sciencedirect.com/science/journal/07437315},
  acknowledgement = ack-nhfb,
}

@inproceedings{delCuvillo:2006:LOC,
  author          = {Juan del Cuvillo and Weirong Zhu and Guang Gao},
  title           = {Landing {OpenMP} on {Cyclops-64}: an efficient
                     mapping of {OpenMP} to a many-core
                     system-on-a-chip},
  crossref        = {ACM:2006:PCC},
  year            = {2006},
  pages           = {41--50},
  doi             = {https://doi.org/10.1145/1128022.1128030},
  abstract        = {This paper presents our experience mapping OpenMP
                     parallel programming model to the IBM Cyclops-64 (C64)
                     architecture. The C64 employs a many-core-on-a-chip
                     design that integrates processing logic (160 thread
                     units), embedded memory (5MB) and communication
                     hardware on the same die. Such a unique architecture
                     presents new opportunities for optimization.
                     Specifically, we consider the following three areas:
                     (1) a memory aware runtime library that places
                     frequently used data structures in scratchpad memory;
                     (2) a unique spin lock algorithm for shared memory
                     synchronization based on in-memory atomic instructions
                     and native support for thread level execution; (3) a
                     fast barrier that directly uses C64 hardware support
                     for collective synchronization. All three optimizations
                     together, result in an 80\% overhead reduction for
                     language constructs in OpenMP. We believe that such a
                     drastic reduction in the cost of managing parallelism
                     makes OpenMP more amenable for writing parallel
                     programs on the C64 platform.},
  bibdate         = {Tue Jun 20 06:42:45 2006},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Deng:2006:PIK,
  author =       "Junjun Deng and Hengyong Yu and Jun Ni and Tao He and
                 Shiying Zhao and Lihe Wang and Ge Wang",
  title =        "A Parallel Implementation of the Katsevich Algorithm
                 for {$3$-D CT} Image Reconstruction",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "38",
  number =       "1",
  pages =        "35--47",
  month =        oct,
  year =         "2006",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-006-6675-0",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 9 17:32:29 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=38&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=38&issue=1&spage=35",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  keywords =     "Computed tomography (CT); high performance computing;
                 image reconstruction; Katsevich algorithm; medical
                 imaging; MPI; parallel computing; spiral cone-beam CT",
}

@Article{Donev:2006:ICF,
  author =       "Aleksander Donev",
  title =        "Interoperability with {C} in {Fortran 2003}",
  journal =      j-FORTRAN-FORUM,
  volume =       "25",
  number =       "1",
  pages =        "8--12",
  month =        apr,
  year =         "2006",
  DOI =          "https://doi.org/10.1145/1124708.1124710",
  ISSN =         "1061-7264 (print), 1931-1311 (electronic)",
  ISSN-L =       "1061-7264",
  bibdate =      "Wed Apr 12 07:18:43 MDT 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "One of the major new features in the Fortran 2003 is
                 features for interoperability with C (C Interop). The
                 intrinsic module ISO\_C\_BINDING provides: * constants,
                 mostly type parameters, C\_NULL\_CHAR, C\_NULL\_PTR,
                 and others, * types, and in particular, TYPE(C\_PTR)
                 and TYPE(C\_FUNPTR), * procedures, such as C\_LOC,
                 C\_FUNLOC, C\_F\_POINTER, C\_F\_PROCPOINTER and
                 C\_ASSOCIATED. A Fortran interface can be specified for
                 a C function with external linkage and used to invoke
                 such a function. The interface has the characteristic
                 BIND(C) label, and must also satisfy some additional
                 restrictions. C Interop can be used to portably use
                 multi-language codes in Fortran. Since most languages
                 interoperate with C, the feature can actually be used
                 to interoperate with other programming languages as
                 well. C Interop can also be used to give access to
                 Fortran programmers to the many standard libraries with
                 widely-used and implemented C interfaces. This includes
                 lower-level tasks such as interfacing with the OS on
                 UNIX-based systems, or using special libraries like
                 OpenGL. For simple API's, developing Fortran interfaces
                 is practically trivial once one gets some experience.
                 For more complicated API's whose full
                 functionality/power is not needed, such as for example
                 TCP/IP sockets or shared-memory segments on UNIX
                 systems, it is often easier to develop a condensed C
                 API/library that does the actual work, and is simpler
                 to interface to from Fortran. However, for libraries
                 like OpenGL, one should provide a full Fortran
                 interface so that the whole API can be accessed. Doing
                 this manually is not easy and is also error-prone due
                 to the size of the OpenGL/GLU/GLUT interfaces. For
                 certain libraries like MPI, a special Fortran interface
                 may be defined for the purposes of efficiency,
                 portability, ease-of-use, or to accommodate for
                 language semantic differences. In this first paper, we
                 will show how to develop a Fortran interface for a
                 simple C API/library. In a second paper, we consider
                 automating the process so that large and more complex
                 API's, and in particular, OpenGL, can be handled. The
                 source codes can be obtained at
                 http://atom.princeton.edu/donev/F2x. Along the way, we
                 identify some problems with the design of C Interop in
                 Fortran 2003.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Fortran Forum",
  xxCODEN =      "????",
}

@Article{Drosinos:2006:EPT,
  author =       "Nikolaos Drosinos and Nectarios Koziris",
  title =        "The Effect of Process Topology and Load Balancing on
                 Parallel Programming Models for {SMP} Clusters and
                 Iterative Algorithms",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "35",
  number =       "1",
  pages =        "65--91",
  month =        jan,
  year =         "2006",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-006-1156-z",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 9 17:32:27 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=35&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=35&issue=1&spage=65",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  keywords =     "high performance computing; hybrid programming;
                 iterative algorithms; MPI; OpenMP; parallel
                 programming; SMP clusters; tiling",
}

@Article{Huang:2006:ECS,
  author =       "Jih-Woei Huang and Chih-Ping Chu",
  title =        "An Efficient Communication Scheduling Method for the
                 Processor Mapping Technique Applied Data
                 Redistribution",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "37",
  number =       "3",
  pages =        "297--318",
  month =        sep,
  year =         "2006",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-006-6615-z",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 9 17:32:29 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=37&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=37&issue=3&spage=297",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  keywords =     "communication scheduling; data redistribution;
                 data-parallel programming; MPI; parallel compiler;
                 processor mapping",
}

@Article{Krawezik:2006:PCM,
  author =       "G{\'e}raud Krawezik and Franck Cappello",
  title =        "Performance comparison of {MPI} and {OpenMP} on shared
                 memory multiprocessors",
  journal =      j-CCPE,
  volume =       "18",
  number =       "1",
  pages =        "29--61",
  month =        jan,
  year =         "2006",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.905",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:00 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "11 Oct 2005",
}

@Article{Lastovetsky:2006:HTM,
  author =       "Alexey Lastovetsky and Ravi Reddy",
  title =        "{HeteroMPI}: Towards a message-passing library for
                 heterogeneous networks of computers",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "66",
  number =       "2",
  pages =        "197--220",
  month =        feb,
  year =         "2006",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:34 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Le:2006:DMC,
  author =       "Thuy T. Le and Jalel Rejeb",
  title =        "A detailed {MPI} communication model for distributed
                 systems",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "22",
  number =       "3",
  pages =        "269--278",
  month =        feb,
  year =         "2006",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Sat Sep 11 13:08:05 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Lee:2006:PT,
  author =       "Edward A. Lee",
  title =        "The Problem with Threads",
  journal =      j-COMPUTER,
  volume =       "39",
  number =       "5",
  pages =        "33--42",
  month =        may,
  year =         "2006",
  CODEN =        "CPTRB4",
  DOI =          "https://doi.org/10.1109/MC.2006.180",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Fri Jul 4 17:16:20 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "For concurrent programming to become mainstream, we
                 must discard threads as a programming model.
                 Nondeterminism should be judiciously and carefully
                 introduced where needed, and it should be explicit in
                 programs. In general-purpose software engineering
                 practice, we have reached a point where one approach to
                 concurrent programming dominates all others---namely,
                 threads, sequential processes that share memory. They
                 represent a key concurrency model supported by modern
                 computers, programming languages, and operating
                 systems. In scientific computing, where performance
                 requirements have long demanded concurrent programming,
                 data-parallel language extensions and message-passing
                 libraries such as PVM, MPI, and OpenMP dominate over
                 threads for concurrent programming. Computer
                 architectures intended for scientific computing often
                 differ significantly from so-called general-purpose
                 architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
}

@Article{Liao:2006:SDI,
  author =       "Wei-keng Liao and Kenin Coloma and Alok Choudhary and
                 Lee Ward and Eric Russell and Neil Pundit",
  title =        "Scalable Design and Implementations for {MPI} Parallel
                 Overlapping {I/O}",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "17",
  number =       "11",
  pages =        "1264--1276",
  month =        nov,
  year =         "2006",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2006.163",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Jul 3 14:26:50 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Lopez:2006:ESM,
  author =       "F. C. Garc{\'\i}a L{\'o}pez and N. L. Fr{\'\i}as
                 Arrocha",
  title =        "An efficient synchronization model for {OpenMP}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "66",
  number =       "11",
  pages =        "1359--1365",
  month =        nov,
  year =         "2006",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:35 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Marowka:2006:BRP,
  author =       "Ami Marowka",
  title =        "Book Review: {{\em Parallel Scientific Computation: A
                 Structured Approach using BSP and MPI}}",
  journal =      j-SCPE,
  volume =       "7",
  number =       "2",
  pages =        "107--108",
  month =        jun,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1895-1767",
  bibdate =      "Thu Sep 2 11:55:11 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/7/2.toc",
  URL =          "http://www.scpe.org/vols/vol07/no2/vol07no2bookreview.html",
  acknowledgement = ack-nhfb,
}

@Article{Mehta:2006:MSG,
  author =       "Paras Mehta and Jos{\'e} Nelson Amaral and Duane
                 Szafron",
  title =        "Is {MPI} suitable for a generative design-pattern
                 system?",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "32",
  number =       "7--8",
  pages =        "616--626",
  month =        sep,
  year =         "2006",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:05 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Norden:2006:OVM,
  author =       "Markus Nord{\'e}n and Sverker Holmgren and Michael
                 Thun{\'e}",
  title =        "{OpenMP} versus {MPI} for {PDE} solvers based on
                 regular sparse numerical operators",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "22",
  number =       "1--2",
  pages =        "194--203",
  month =        jan,
  year =         "2006",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Sat Sep 11 13:08:05 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{ODowd:2006:WGM,
  author =       "Padraig J. O'Dowd and Adarsh Patil and John P.
                 Morrison",
  title =        "{WebCom-G} and {MPICH-G2} Jobs",
  journal =      j-SCPE,
  volume =       "7",
  number =       "3",
  pages =        "75--86",
  month =        sep,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1895-1767",
  bibdate =      "Thu Sep 2 11:55:11 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/7/3.toc",
  URL =          "http://www.scpe.org/vols/vol07/no3/SCPE_7_3_07.pdf;
                 http://www.scpe.org/vols/vol07/no3/SCPE_7_3_07.zip",
  acknowledgement = ack-nhfb,
}

@Article{Ohara:2006:MMP,
  author =       "M. Ohara and H. Inoue and Y. Sohda and H. Komatsu and
                 T. Nakatani",
  title =        "{MPI} microtask for programming the {Cell Broadband
                 Engine{\TM}} processor",
  journal =      j-IBM-SYS-J,
  volume =       "45",
  number =       "1",
  pages =        "85--102",
  month =        "????",
  year =         "2006",
  CODEN =        "IBMSA7",
  ISSN =         "0018-8670",
  bibdate =      "Mon Feb 12 18:19:14 MST 2007",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/sj/451/ohara.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Systems Journal",
  ordernumber =  "????",
}

@InProceedings{Paul:2006:TLF,
  author =       "Jerome L. Paul and Michal Kouril and Kenneth A.
                 Berman",
  title =        "A template library to facilitate teaching message
                 passing parallel computing",
  crossref =     "ACM:2006:PST",
  pages =        "464--468",
  year =         "2006",
  DOI =          "https://doi.org/10.1145/1121341.1121487",
  bibdate =      "Tue Jun 20 06:51:37 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "This paper discusses a template-based approach to aid
                 in introducing the upper-division undergraduate (or
                 first year graduate) to the rapidly emerging message
                 passing parallel computing paradigm. Our template
                 library facilitates an accelerated MPI programming
                 learning environment that can realistically be included
                 as one topic among many in an algorithms course. One
                 template module is based on a backtracking solution to
                 the satisfiability problem (SAT), which the student
                 first solves in the sequential setting. With the aid of
                 a modified template, the student then develops a simple
                 parallel SAT solver. The template includes such things
                 as I/O functions, allowing the student to focus on the
                 algorithm itself. The parallel part is partially
                 provided by the template, with indicators given in
                 places where the student needs to plug in missing MPI
                 function calls. The students are excited about this
                 hands-on-experience in the increasingly important world
                 of message passing parallel computing, which might be
                 missed if their curriculum does not include a course
                 devoted to this topic.",
  acknowledgement = ack-nhfb,
}

@Article{Rozman:2006:CPL,
  author =       "Igor Rozman and Marjan {\v{S}}terk and Roman Trobec",
  title =        "Communication Performance of {LAM\slash MPI} and
                 {MPICH} on a {Linux} Cluster",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "16",
  number =       "3",
  pages =        "323--334",
  month =        sep,
  year =         "2006",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626406002678",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Su:2006:APP,
  author =       "Hai-Jun Su and J. Michael McCarthy and Masha Sosonkina
                 and Layne T. Watson",
  title =        "{Algorithm 857}: {POLSYS\_GLP}---a parallel general
                 linear product homotopy code for solving polynomial
                 systems of equations",
  journal =      j-TOMS,
  volume =       "32",
  number =       "4",
  pages =        "561--579",
  month =        dec,
  year =         "2006",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1186785.1186789",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Sat Apr 14 09:48:57 MDT 2007",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Globally convergent, probability-one homotopy methods
                 have proven to be very effective for finding all the
                 isolated solutions to polynomial systems of equations.
                 After many years of development, homotopy path trackers
                 based on probability-one homotopy methods are reliable
                 and fast. Now, theoretical advances reducing the number
                 of homotopy paths that must be tracked and handling
                 singular solutions have made probability-one homotopy
                 methods even more practical. POLSYS\_GLP consists of
                 Fortran 95 modules for finding all isolated solutions
                 of a complex coefficient polynomial system of
                 equations. The package is intended to be used on a
                 distributed memory multiprocessor in conjunction with
                 HOMPACK90 (Algorithm 777), and makes extensive use of
                 Fortran 95-derived data types and MPI to support a
                 general linear product (GLP) polynomial system
                 structure. GLP structure is intermediate between the
                 partitioned linear product structure used by
                 POLSYS\_PLP (Algorithm 801) and the BKK-based structure
                 used by PHCPACK. The code requires a GLP structure as
                 input, and although finding the optimal GLP structure
                 is a difficult combinatorial problem, generally
                 physical or engineering intuition about a problem
                 yields a very good GLP structure. POLSYS\_GLP employs a
                 sophisticated power series end game for handling
                 singular solutions, and provides support for problem
                 definition both at a high level and via hand-crafted
                 code. Different GLP structures and their corresponding
                 Bezout numbers can be systematically explored before
                 committing to root finding.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Mathematical Software",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Weatherly:2006:DMS,
  author =       "D. Brent Weatherly and David K. Lowenthal and Mario
                 Nakazawa and Franklin Lowenthal",
  title =        "{Dyn-MPI}: Supporting {MPI} on medium-scale,
                 non-dedicated clusters",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "66",
  number =       "6",
  pages =        "822--838",
  month =        jun,
  year =         "2006",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 22:04:47 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Zarrelli:2006:EPE,
  author =       "Roberto Zarrelli and Mario Petrone and Angelo
                 Iannaccio",
  title =        "Enabling {PVM} to exploit the {SCTP} protocol",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "66",
  number =       "11",
  pages =        "1472--1479",
  month =        nov,
  year =         "2006",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:35 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Zheng:2006:PEA,
  author =       "Gengbin Zheng and Chao Huang and Laxmikant V.
                 Kal{\'e}",
  title =        "Performance evaluation of automatic checkpoint-based
                 fault tolerance for {AMPI} and {Charm++}",
  journal =      j-OPER-SYS-REV,
  volume =       "40",
  number =       "2",
  pages =        "90--99",
  month =        apr,
  year =         "2006",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:43 MDT 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Adhianto:2007:PMC,
  author =       "Laksono Adhianto and Barbara Chapman",
  title =        "Performance modeling of communication and computation
                 in hybrid {MPI} and {OpenMP} applications",
  journal =      j-SIM-MODEL-PRACT-THEORY,
  volume =       "15",
  number =       "4",
  pages =        "481--491",
  month =        apr,
  year =         "2007",
  CODEN =        "SMPTCA",
  DOI =          "https://doi.org/10.1016/j.simpat.2006.11.014",
  ISSN =         "1569-190X (print), 1878-1462 (electronic)",
  ISSN-L =       "1569-190X",
  bibdate =      "Mon Oct 07 09:21:03 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.sciencedirect.com/science/article/pii/S1569190X06001109",
  acknowledgement = ack-nhfb,
  fjournal =     "Simulation Modelling Practice and Theory",
}

@Article{anMey:2007:NPO,
  author =       "Dieter an Mey and Samuel Sarholz and Christian
                 Terboven",
  title =        "Nested Parallelization with {OpenMP}",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "35",
  number =       "5",
  pages =        "459--476",
  month =        oct,
  year =         "2007",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-007-0054-1",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:06:48 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=5&spage=459",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "ccNUMA; Nested parallelization; OpenMP; Shared memory
                 parallelization",
}

@Article{Ayguade:2007:SIO,
  author =       "Eduard Ayguad{\'e} and Matthias S. Mueller",
  title =        "Special Issue on {OpenMP} --- {Guest Editors}'
                 Introduction",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "35",
  number =       "4",
  pages =        "331--333",
  month =        aug,
  year =         "2007",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-007-0048-z",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:06:44 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=4;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=4&spage=331",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Becciani:2007:FMH,
  author =       "U. Becciani and V. Antonuccio-Delogu and M.
                 Comparato",
  title =        "{FLY}: {MPI}-2 high resolution code for {LSS}
                 cosmological simulations",
  journal =      j-COMP-PHYS-COMM,
  volume =       "176",
  number =       "3",
  pages =        "211--217",
  day =          "1",
  month =        feb,
  year =         "2007",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2006.10.001",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Mon Feb 13 23:42:13 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465506003687",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Bronevetsky:2007:CFS,
  author =       "Greg Bronevetsky and Bronis R. de Supinski",
  title =        "Complete Formal Specification of the {OpenMP} Memory
                 Model",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "35",
  number =       "4",
  pages =        "335--392",
  month =        aug,
  year =         "2007",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-007-0051-4",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:06:44 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=4;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=4&spage=335",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "formal systems; OpenMP; parallel programming; theorem
                 proving",
}

@Article{Brown:2007:HSP,
  author =       "Russell Brown and Ilya Sharapov",
  title =        "High-Scalability Parallelization of a Molecular
                 Modeling Application: Performance and Productivity
                 Comparison Between {OpenMP} and {MPI} Implementations",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "35",
  number =       "5",
  pages =        "441--458",
  month =        oct,
  year =         "2007",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-007-0057-y",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:06:48 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=5&spage=441",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Molecular modeling; MPI; OpenMP; Parallel
                 programming",
}

@article{Buntinas:2007:IES,
  author          = {Darius Buntinas and Guillaume Mercier and
                     William Gropp},
  title           = {Implementation and evaluation of shared-memory
                     communication and synchronization operations in {MPICH2}
                     using the {Nemesis} communication subsystem},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {33},
  number          = {9},
  pages           = {634--644},
  month           = sep,
  year            = {2007},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Thu Sep 2 17:51:07 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Carbajal:2007:PTD,
  author =       "Santiago Garcia Carbajal",
  title =        "Parallelizing Three Dimensional Cellular Automata with
                 {OpenMP}",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "17",
  number =       "4",
  pages =        "349--361",
  month =        dec,
  year =         "2007",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626407003083",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@article{Chandra:2007:ESP,
  author          = {Sumir Chandra and Xiaolin Li and Taher Saif and
                     Manish Parashar},
  title           = {Enabling scalable parallel implementations of structured
                     adaptive mesh refinement applications},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {39},
  number          = {2},
  pages           = {177--203},
  month           = feb,
  year            = {2007},
  CODEN           = {JOSUED},
  DOI             = {https://doi.org/10.1007/s11227-007-0110-z},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  bibdate         = {Wed Jul 9 17:32:30 MDT 2008},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=39&issue=2;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=39&issue=2&spage=177},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
  keywords        = {3-D Richtmyer-Meshkov application;
                     Bin-packing based load-balancing;
                     Hierarchical partitioning;
                     MPI non-blocking communication optimization;
                     SAMR scalability;
                     Structured adaptive mesh refinement},
}

@article{Chau:2007:MIP,
  author          = {Ming Chau and Didier {El Baz} and Ronan Guivarch and
                     Pierre Spiteri},
  title           = {{MPI} implementation of parallel subdomain methods
                     for linear and nonlinear convection--diffusion
                     problems},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {67},
  number          = {5},
  pages           = {581--591},
  month           = may,
  year            = {2007},
  CODEN           = {JPDCER},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  bibdate         = {Fri Jul 11 20:32:35 MDT 2008},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sciencedirect.com/science/journal/07437315},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Desai:2007:CEM,
  author =       "Narayan Desai and Ewing Lusk and Rick Bradshaw",
  title =        "A Composition Environment for {MPI} Programs",
  journal =      j-IJHPCA,
  volume =       "21",
  number =       "2",
  pages =        "166--173",
  month =        may,
  year =         "2007",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342007077858",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/21/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/21/2/166.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{DiMartino:2007:SIS,
  author =       "Beniamino {Di Martino} and Dieter Kranzlm{\"u}ller and
                 Jack Dongarra",
  title =        "Special issue on selected papers from the
                 {EuroPVM\slash MPI 2005 Conference, Sorrento, Italy,
                 18--21 September 2005} --- Preface",
  journal =      j-IJHPCA,
  volume =       "21",
  number =       "2",
  pages =        "129--131",
  month =        may,
  year =         "2007",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342006077863",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Jun 4 08:20:03 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@article{Duran:2007:PEH,
  author          = {Alejandro Duran and Roger Ferrer and
                     Juan Jos{\'e} Costa and Marc Gonz{\`a}lez and
                     Xavier Martorell and Eduard Ayguad{\'e} and
                     Jes{\'u}s Labarta},
  title           = {A Proposal for Error Handling in {OpenMP}},
  journal         = j-INT-J-PARALLEL-PROG,
  volume          = {35},
  number          = {4},
  pages           = {393--416},
  month           = aug,
  year            = {2007},
  CODEN           = {IJPPE5},
  DOI             = {https://doi.org/10.1007/s10766-007-0049-y},
  ISSN            = {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L          = {0885-7458},
  bibdate         = {Wed Jul 9 16:06:44 MDT 2008},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=4;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=4&spage=393},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of Parallel Programming},
  journal-URL     = {http://link.springer.com/journal/10766},
  keywords        = {error handling; fault tolerance; OpenMP;
                     parallel languages; parallel programming},
}

@Article{Falzone:2007:PMF,
  author =       "Christopher Falzone and Anthony Chan and Ewing Lusk
                 and William Gropp",
  title =        "A Portable Method for Finding User Errors in the Usage
                 of {MPI} Collective Operations",
  journal =      j-IJHPCA,
  volume =       "21",
  number =       "2",
  pages =        "155--165",
  month =        may,
  year =         "2007",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342007077860",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/21/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/21/2/155.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Genaud:2007:PMP,
  author =       "St{\'e}phane Genaud and Choopan Rattanapoka",
  title =        "{P2P--MPI}: a Peer-to-Peer Framework for Robust
                 Execution of Message Passing Parallel Programs on
                 {Grids}",
  journal =      j-J-GRID-COMP,
  volume =       "5",
  number =       "1",
  pages =        "27--42",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1007/s10723-006-9056-2",
  ISSN =         "1570-7873 (print), 1572-9184 (electronic)",
  ISSN-L =       "1570-7873",
  bibdate =      "Wed Jul 9 17:01:30 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1570-7873&volume=5&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=1570-7873&volume=5&issue=1&spage=27",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Grid Computing",
  journal-URL =  "http://link.springer.com/journal/10723",
  keywords =     "Grid; Java; middleware; MPI; peer-to-peer",
}

@article{Giannoutakis:2007:MHP,
  author          = {K. M. Giannoutakis and G. A. Gravvanis and
                     B. Clayton and A. Patil and T. Enright and
                     J. P. Morrison},
  title           = {Matching high performance approximate inverse
                     preconditioning to architectural platforms},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {42},
  number          = {2},
  pages           = {145--163},
  month           = nov,
  year            = {2007},
  CODEN           = {JOSUED},
  DOI             = {https://doi.org/10.1007/s11227-007-0129-1},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  bibdate         = {Wed Jul 9 17:32:32 MDT 2008},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=42&issue=2;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=42&issue=2&spage=145},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
  keywords        = {Approximate inverses; Globus toolkit; MPI; Open MPI;
                     Parallel iterative methods;
                     Parallel/distributed computations;
                     Preconditioned conjugate gradient method},
}

@Article{Graham:2007:OMH,
  author =       "Richard L. Graham and Brian W. Barrett and Galen M.
                 Shipman and Timothy S. Woodall and George Bosilca",
  title =        "{Open MPI}: a High Performance, Flexible
                 Implementation of {MPI} Point-to-Point Communications",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "17",
  number =       "1",
  pages =        "79--88",
  month =        mar,
  year =         "2007",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626407002880",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@article{Gropp:2007:TSM,
  author          = {William Gropp and Rajeev Thakur},
  title           = {Thread-safety in an {MPI} implementation:
                     Requirements and analysis},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {33},
  number          = {9},
  pages           = {595--604},
  month           = sep,
  year            = {2007},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Thu Sep 2 17:51:07 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@InProceedings{Gu:2007:IPC,
  author =       "Feng Long Gu and Hyacinthe Nzigou M. and Guilherme de
                 Melo Baptista Domingues and Takeshi Nanri and Kazuaki
                 Murakami",
  title =        "Investigating the Performance of Collective
                 Communications on {SMP} Clusters: a Case for {{\tt
                 MPI\_Allgather}}",
  crossref =     "Simos:2007:CMS",
  volume =       "2A",
  pages =        "52--56",
  year =         "2007",
  bibdate =      "Thu Feb 21 14:34:40 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://proceedings.aip.org/getpdf/servlet/GetPDFServlet?filetype=pdf&id=APCPCS000963000002000052000001&idtype=cvips",
  acknowledgement = ack-nhfb,
}

@article{Klemm:2007:JIO,
  author          = {Michael Klemm and Matthias Bezold and
                     Ronald Veldema and Michael Philippsen},
  title           = {{JaMP}: an implementation of {OpenMP} for a
                     {Java DSM}},
  journal         = j-CCPE,
  volume          = {19},
  number          = {18},
  pages           = {2333--2352},
  day             = {25},
  month           = dec,
  year            = {2007},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.1178},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Mon Dec 5 10:08:18 MST 2011},
  bibsource       = {http://www.interscience.wiley.com/jpages/1532-0626;
                     http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency and Computation: Prac\-tice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {3 Apr 2007},
}

@Article{Kurzyniec:2007:UCA,
  author =       "Dawid Kurzyniec and Magdalena S{\l}awi{\'n}ska and
                 Jaros{\l}aw S{\l}awi{\'n}ski and Vaidy Sunderam",
  title =        "{Unibus}: a contrarian approach to {Grid} computing",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "42",
  number =       "1",
  pages =        "125--144",
  month =        oct,
  year =         "2007",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-006-0033-0",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 9 17:32:32 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=42&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=42&issue=1&spage=125",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  keywords =     "Aggregation; Grids; MPI; Resource sharing;
                 Virtualization",
}

@Article{Latham:2007:IMI,
  author =       "Robert Latham and Robert Ross and Rajeev Thakur",
  title =        "Implementing {MPI-IO} Atomic Mode and Shared File
                 Pointers Using {MPI} One-Sided Communication",
  journal =      j-IJHPCA,
  volume =       "21",
  number =       "2",
  pages =        "132--143",
  month =        may,
  year =         "2007",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342007077859",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/21/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/21/2/132.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@article{Li:2007:DIV,
  author          = {Kuan-Ching Li and Hsun-Chang Chang},
  title           = {The design and implementation of visual performance
                     monitoring and analysis toolkit for cluster and
                     {Grid} environments},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {40},
  number          = {3},
  pages           = {299--317},
  month           = jun,
  year            = {2007},
  CODEN           = {JOSUED},
  DOI             = {https://doi.org/10.1007/s11227-006-0020-5},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  bibdate         = {Wed Jul 9 17:32:31 MDT 2008},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=40&issue=3;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=40&issue=3&spage=299},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
  keywords        = {Distributed computing; Monitoring;
                     MPI parallel program; Performance visualization},
}

@Article{Liao:2007:CCS,
  author =       "Wei-keng Liao and Kenin Coloma and Alok Choudhary and
                 Lee Ward",
  title =        "Cooperative Client-Side File Caching for {MPI}
                 Applications",
  journal =      j-IJHPCA,
  volume =       "21",
  number =       "2",
  pages =        "144--154",
  month =        may,
  year =         "2007",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342007077857",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/21/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/21/2/144.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@article{Liao:2007:OOP,
  author          = {Chunhua Liao and Oscar Hernandez and
                     Barbara Chapman and Wenguang Chen and Weimin Zheng},
  title           = {{OpenUH}: an optimizing, portable {OpenMP} compiler},
  journal         = j-CCPE,
  volume          = {19},
  number          = {18},
  pages           = {2317--2332},
  day             = {25},
  month           = dec,
  year            = {2007},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.1174},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Mon Dec 5 10:08:18 MST 2011},
  bibsource       = {http://www.interscience.wiley.com/jpages/1532-0626;
                     http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency and Computation: Prac\-tice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {3 Apr 2007},
}

@article{Marathe:2007:SCC,
  author          = {Jaydeep Marathe and Frank Mueller},
  title           = {Source-Code-Correlated Cache Coherence Characterization
                     of {OpenMP} Benchmarks},
  journal         = j-IEEE-TRANS-PAR-DIST-SYS,
  volume          = {18},
  number          = {6},
  pages           = {818--834},
  month           = jun,
  year            = {2007},
  CODEN           = {ITDSEO},
  DOI             = {https://doi.org/10.1109/TPDS.2007.1058},
  ISSN            = {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L          = {1045-9219},
  bibdate         = {Thu Jul 3 14:26:52 MDT 2008},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Transactions on Parallel and Distributed Systems},
  journal-URL     = {http://www.computer.org/tpds/archives.htm},
}

@article{Marowka:2007:PCD,
  author          = {Ami Marowka},
  title           = {Parallel computing on any desktop},
  journal         = j-CACM,
  volume          = {50},
  number          = {9},
  pages           = {74--78},
  month           = sep,
  year            = {2007},
  CODEN           = {CACMA2},
  DOI             = {https://doi.org/10.1145/1284621.1284622},
  ISSN            = {0001-0782 (print), 1557-7317 (electronic)},
  ISSN-L          = {0001-0782},
  bibdate         = {Mon Jun 16 18:32:57 MDT 2008},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract        = {Parallelization lets applications exploit the high
                     throughput of new multicore processors, and the
                     OpenMP parallel programming model helps developers
                     create multithreaded applications.},
  acknowledgement = ack-nhfb,
  fjournal        = {Communications of the ACM},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J79},
}

@article{Mohr:2007:SPE,
  author          = {Bernd Mohr and Jesper Larsson Tr{\"a}ff and
                     Joachim Worringen},
  title           = {Selected papers from {EuroPVM\slash MPI 2006}},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {33},
  number          = {9},
  pages           = {593--594},
  month           = sep,
  year            = {2007},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Thu Sep 2 17:51:07 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Morris:2007:SNO,
  author          = {Alan Morris and Allen D. Malony and
                     Sameer S. Shende},
  title           = {Supporting Nested {OpenMP} Parallelism in the
                     {TAU} Performance System},
  journal         = j-INT-J-PARALLEL-PROG,
  volume          = {35},
  number          = {4},
  pages           = {417--436},
  month           = aug,
  year            = {2007},
  CODEN           = {IJPPE5},
  DOI             = {https://doi.org/10.1007/s10766-007-0050-5},
  ISSN            = {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L          = {0885-7458},
  bibdate         = {Wed Jul 9 16:06:44 MDT 2008},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=4;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=4&spage=417},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of Parallel Programming},
  journal-URL     = {http://link.springer.com/journal/10766},
  keywords        = {Nested parallelism; OpenMP; TAU},
}

@article{Nascimento:2007:DDS,
  author          = {Aline P. Nascimento and Alexandre C. Sena and
                     Cristina Boeres and Vinod E. F. Rebello},
  title           = {Distributed and dynamic self-scheduling of parallel
                     {MPI Grid} applications},
  journal         = j-CCPE,
  volume          = {19},
  number          = {14},
  pages           = {1955--1974},
  day             = {25},
  month           = sep,
  year            = {2007},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.1139},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Mon Dec 5 10:08:16 MST 2011},
  bibsource       = {http://www.interscience.wiley.com/jpages/1532-0626;
                     http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency and Computation: Prac\-tice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {14 Nov 2006},
}

@article{Norden:2007:DDM,
  author          = {Markus Nord{\'e}n and Henrik L{\"o}f and
                     Jarmo Rantakokko and Sverker Holmgren},
  title           = {Dynamic Data Migration for Structured {AMR} Solvers},
  journal         = j-INT-J-PARALLEL-PROG,
  volume          = {35},
  number          = {5},
  pages           = {477--491},
  month           = oct,
  year            = {2007},
  CODEN           = {IJPPE5},
  DOI             = {https://doi.org/10.1007/s10766-007-0056-z},
  ISSN            = {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L          = {0885-7458},
  bibdate         = {Wed Jul 9 16:06:48 MDT 2008},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=5;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=5&spage=477},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of Parallel Programming},
  journal-URL     = {http://link.springer.com/journal/10766},
  keywords        = {Adaptive mesh refinement; cc-NUMA;
                     Geographical locality; Graph partitioning; OpenMP;
                     Page migration; SAMR; Shared memory},
}

@Article{Pandey:2007:SCM,
  author =       "Nirved Pandey and G. K. Sharma",
  title =        "Startup comparison for message passing libraries with
                 {DTM} on {Linux} clusters",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "39",
  number =       "1",
  pages =        "59--72",
  month =        jan,
  year =         "2007",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-006-0004-5",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 9 17:32:30 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=39&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=39&issue=1&spage=59",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  keywords =     "Chameleon MPICH; Distributed Performance Index (DPI);
                 Distributed Task Machine (DTM); High Performance
                 Cluster (HPC); Message Passing Interface (MPI); MPI;
                 Parallel Virtual Machine (PVM); Relative Distributed
                 Performance Index (RDPI)",
}

@article{Pedicini:2007:PPE,
  author          = {Marco Pedicini and Francesco Quaglia},
  title           = {{PELCR}: {Parallel} environment for optimal
                     lambda-calculus reduction},
  journal         = j-TOCL,
  volume          = {8},
  number          = {3},
  pages           = {14:1--14:??},
  month           = jul,
  year            = {2007},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1243996.1243997},
  ISSN            = {1529-3785 (print), 1557-945X (electronic)},
  ISSN-L          = {1529-3785},
  bibdate         = {Mon Jun 16 14:28:15 MDT 2008},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tocl/;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract        = {In this article we present the implementation of an
                     environment supporting L{\'e}vy's optimal reduction
                     for the $ \lambda $-calculus on parallel (or
                     distributed) computing systems. In a similar approach
                     to Lamping's, we base our work on a graph reduction
                     technique, known as directed virtual reduction, which
                     is actually a restriction of Danos-Regnier virtual
                     reduction.\par

                     The environment, which we refer to as PELCR (parallel
                     environment for optimal lambda-calculus reduction),
                     relies on a strategy for directed virtual reduction,
                     namely half combustion. While developing PELCR we
                     adopted both a message aggregation technique,
                     allowing reduction of the communication overhead, and
                     a fair policy for distributing dynamically originated
                     load among processors.\par

                     We also present an experimental study demonstrating
                     the ability of PELCR to definitely exploit the
                     parallelism intrinsic to $ \lambda $-terms while
                     performing the reduction. We show how PELCR allows
                     achieving up to 70--80\% of the ideal speedup on last
                     generation multiprocessor computing systems. As a
                     last note, the software modules have been developed
                     with the C language and using a standard interface
                     for message passing, that is, MPI, thus making PELCR
                     itself a highly portable software package.},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  fjournal        = {ACM Transactions on Computational Logic},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J773},
  keywords        = {functional programming; geometry of interaction;
                     linear logic; optimal reduction;
                     parallel implementation; virtual reduction},
}

@article{Pjesivac-Grbovic:2007:MCA,
  author          = {Jelena Pje{\v{s}}ivac-Grbovi{\'c} and
                     George Bosilca and Graham E. Fagg and
                     Thara Angskun and Jack J. Dongarra},
  title           = {{MPI} collective algorithm selection and
                     quadtree encoding},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {33},
  number          = {9},
  pages           = {613--623},
  month           = sep,
  year            = {2007},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Thu Sep 2 17:51:07 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Pjesivac-Grbovic:2007:PAM,
  author =       "Jelena Pje{\v{s}}ivac-Grbovi{\'c} and Thara Angskun
                 and George Bosilca and Graham E. Fagg and Edgar Gabriel
                 and Jack J. Dongarra",
  title =        "Performance analysis of {MPI} collective operations",
  journal =      "Cluster Computing",
  volume =       "10",
  number =       "2",
  pages =        "127--143",
  month =        "????",
  year =         "2007",
  DOI =          "https://doi.org/10.1007/s10586-007-0012-0",
  ISSN =         "1386-7857",
  bibdate =      "Tue Jun 4 08:20:03 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  classification = "Denver, CO",
  conference-date = "Apr 03--08, 2005",
  conference-name = "4th International Workshop on Performance
                 Modelling, Evaluation, and Optimization of Parallel and
                 Distributed Systems",
  fjournal =     "Cluster Computing: The Journal of Networks, Software
                 Tools and Applications",
}

@article{Ramadan:2007:TDM,
  author          = {Omar Ramadan},
  title           = {Three dimensional {MPI} parallel implementation of
                     the {PML} algorithm for truncating finite-difference
                     time-domain {Grids}},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {33},
  number          = {2},
  pages           = {109--115},
  month           = mar,
  year            = {2007},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Thu Sep 2 17:51:06 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Rycerz:2007:IBS,
  author =       "Katarzyna Rycerz and Alfredo Tirado-Ramos and Alessia
                 Gualandris and Simon F. {Portegies Zwart} and Marian
                 Bubak and Peter M. A. Sloot",
  title =        "Interactive {N}-Body Simulations on the {Grid}: {HLA}
                 Versus {MPI}",
  journal =      j-IJHPCA,
  volume =       "21",
  number =       "2",
  pages =        "210--221",
  month =        may,
  year =         "2007",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342007074874",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/21/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/21/2/210.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Tsujita:2007:RMP,
  author =       {Y. Tsujita},
  title =        {Remote {MPI-I/O} on a Parallel Virtual File System
                 Using a Circular Buffer for High Throughput},
  journal =      j-INT-J-COMPUT-APPL,
  volume =       {29},
  number =       {3},
  pages =        {291--299},
  year =         {2007},
  DOI =          {https://doi.org/10.1080/1206212X.2007.11441859},
  ISSN =         {1206-212X (print), 1925-7074 (electronic)},
  ISSN-L =       {1206-212X},
  bibdate =      {Sat Apr 21 17:24:05 MDT 2018},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ijca.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {https://www.tandfonline.com/doi/full/10.1080/1206212X.2007.11441859},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Computer Applications},
  journal-URL =  {https://www.tandfonline.com/loi/tjca20},
  online-date =  {11 Jul 2015},
}

@Article{Wang:2007:EAP,
  author =       "Perry H. Wang and Jamison D. Collins and Gautham N.
                 Chinya and Hong Jiang and Xinmin Tian and Milind Girkar
                 and Nick Y. Yang and Guei-Yuan Lueh and Hong Wang",
  title =        "{EXOCHI}: architecture and programming environment for
                 a heterogeneous multi-core multithreaded system",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "6",
  pages =        "156--166",
  month =        jun,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1250734.1250753",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:55:30 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Future mainstream microprocessors will likely
                 integrate specialized accelerators, such as GPUs, onto
                 a single die to achieve better performance and power
                 efficiency. However, it remains a keen challenge to
                 program such a heterogeneous multicore platform, since
                 these specialized accelerators feature ISAs and
                 functionality that are significantly different from the
                 general purpose CPU cores. In this paper, we present
                 EXOCHI: (1) Exoskeleton Sequencer (EXO), an
                 architecture to represent heterogeneous accelerators as
                 ISA-based MIMD architecture resources, and a shared
                 virtual memory heterogeneous multithreaded program
                 execution model that tightly couples specialized
                 accelerator cores with general-purpose CPU cores, and
                 (2) C for Heterogeneous Integration (CHI), an
                 integrated C/C++ programming environment that supports
                 accelerator-specific inline assembly and
                 domain-specific languages. The CHI compiler extends the
                 OpenMP pragma for heterogeneous multithreading
                 programming, and produces a single fat binary with code
                 sections corresponding to different instruction sets.
                 The runtime can judiciously spread parallel computation
                 across the heterogeneous cores to optimize performance
                 and power.\par

                 We have prototyped the EXO architecture on a physical
                 heterogeneous platform consisting of an Intel{\reg}
                 Core{\TM} 2 Duo processor and an 8-core 32-thread
                 Intel{\reg} Graphics Media Accelerator X3000. In
                 addition, we have implemented the CHI integrated
                 programming environment with the Intel{\reg} C++
                 Compiler, runtime toolset, and debugger. On the EXO
                 prototype system, we have enhanced a suite of
                 production-quality media kernels for video and image
                 processing to utilize the accelerator through the CHI
                 programming interface, achieving significant speedup
                 (1.41X to 10.97X) over execution on the IA32 CPU
                 alone.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "GPU; heterogeneous multi-cores; OpenMP",
}

@Article{Weng:2007:OIS,
  author =       {Tien-Hsiung Weng and Ruey-Kuen Perng and Barbara
                 Chapman},
  title =        {{OpenMP} Implementation of {SPICE3} Circuit
                 Simulator},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {35},
  number =       {5},
  pages =        {493--505},
  month =        oct,
  year =         {2007},
  CODEN =        {IJPPE5},
  DOI =          {https://doi.org/10.1007/s10766-007-0053-2},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Jul 9 16:06:48 MDT 2008},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=5&spage=493},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
  keywords =     {OpenMP SPICE circuit simulator; Shared-memory
                 programming model},
}

@Article{Wu:2007:IFR,
  author =       {C.-L. Wu and D.-C. Lou and S.-Y. Chen},
  title =        {Integer factorization for {RSA} cryptosystem under a
                 {PVM} environment},
  journal =      j-INT-J-COMPUT-SYST-SCI-ENG,
  volume =       {22},
  number =       {1--2},
  pages =        {??--??},
  month =        jan # {\slash } # mar,
  year =         {2007},
  CODEN =        {CSSEEI},
  ISSN =         {0267-6192},
  bibdate =      {Tue Dec 03 12:31:25 2013},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/computsystscieng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Computer Systems Science and
                 Engineering},
  remark =       {Special issue: Privacy Data Management},
}

@Article{Zhong:2007:PPS,
  author =       {Wei Zhong and Gulsah Altun and Xinmin Tian and Robert
                 Harrison and Phang C. Tai and Yi Pan},
  title =        {Parallel protein secondary structure prediction
                 schemes using {Pthread} and {OpenMP} over
                 hyper-threading technology},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {41},
  number =       {1},
  pages =        {1--16},
  month =        jul,
  year =         {2007},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-007-0100-1},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Wed Jul 9 17:32:31 MDT 2008},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=41&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=41&issue=1&spage=1},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
  keywords =     {BLOSUM62 matrix; DBNN (Denoeux Belief Neural Network);
                 Hydrophobicity matrix; Hyper-threading; MPI (Message
                 Passing Interface); Neural networks; OpenMP; Parallel
                 architecture; Protein secondary structure prediction;
                 PSSM (Position Specific Scoring Matrix); Pthread;
                 Speedup},
}

@Article{Akzhalova:2008:WPL,
  author =       "Assel Zh. Akzhalova and Daniar Y. Aizhulov and
                 Galymzhan Seralin and Gulnar Balakayeva",
  title =        "{Web} portal for large-scale computations based on
                 {Grid} and {MPI}",
  journal =      j-SCPE,
  volume =       "9",
  number =       "2",
  pages =        "135--142",
  month =        jun,
  year =         "2008",
  CODEN =        "????",
  ISSN =         "1895-1767",
  bibdate =      "Thu Sep 2 11:55:11 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/9/2.toc",
  URL =          "http://www.scpe.org/vols/vol09/no2/SCPE_9_2_06.pdf;
                 http://www.scpe.org/vols/vol09/no2/SCPE_9_2_06.zip",
  acknowledgement = ack-nhfb,
  fjournal =     "Scalable Computing: Practice and Experience",
  journal-URL =  "http://www.scpe.org/",
}

@TechReport{Baboulin:2008:SID,
  author =       "Marc Baboulin and Jack J. Dongarra and Stanimire
                 Tomov",
  title =        "Some Issues in Dense Linear Algebra for Multicore and
                 Special Purpose Architectures",
  type =         "LAPACK Working Note",
  number =       "200",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        may,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn200.pdf",
  abstract =     "We address some key issues in designing dense linear
                 algebra (DLA) algorithms that are common for both
                 multi/many-cores and special purpose architectures (in
                 particular GPUs). We present them in the context of an
                 LU factorization algorithm, where randomization
                 techniques are used as an alternative to pivoting. This
                 approach yields an algorithm based entirely on a
                 collection of small Level 3 BLAS type computational
                 tasks, which has emerged as a common goal in designing
                 DLA algorithms for new architectures. Other common
                 trends, also considered here, are block asynchronous
                 task execution and ``Block'' layouts for the data
                 associated with the separate tasks. We present
                 numerical results and other specific experiments with
                 DLA algorithms on NVIDIA GPUs using CUDA. The GPU
                 results are also of interest themselves as we show a
                 performance of up to 160 Gflop/s on a single Quadro FX
                 5600 card.",
  acknowledgement = ack-nhfb,
  keywords =     "dense linear algebra; graphic process units; LU
                 factorization; multicore processors; parallel
                 algorithms",
  utknumber =    "UT-CS-08-615",
}

@Article{Bernabeu:2008:MPA,
  author =       {Miguel O. Bernabeu and Pedro Alonso and Antonio M.
                 Vidal},
  title =        {A multilevel parallel algorithm to solve symmetric
                 {Toeplitz} linear systems},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {44},
  number =       {3},
  pages =        {237--256},
  month =        jun,
  year =         {2008},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-007-0157-x},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Wed Jul 9 17:32:34 MDT 2008},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=44&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=44&issue=3&spage=237},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
  keywords =     {Cauchy-like matrix; MPI; Multilevel parallel
                 programming; OpenMP; Rank displacement; Toeplitz
                 matrix},
}

@InProceedings{Bischof:2008:PRM,
  author =       {Christian Bischof and Niels Guertler and Andreas
                 Kowarz},
  title =        {Parallel Reverse Mode Automatic Differentiation for
                 {OpenMP} Programs with {ADOL-C}},
  crossref =     {Bischof:2008:AAD},
  volume =       {64},
  pages =        {163--173},
  year =         {2008},
  DOI =          {https://doi.org/10.1007/978-3-540-68942-3_15},
  bibdate =      {Sat Dec 22 08:33:39 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer.com/content/pdf/10.1007/978-3-540-68942-3_15},
  acknowledgement = ack-nhfb,
  book-DOI =     {https://doi.org/10.1007/978-3-540-68942-3},
  book-URL =     {http://www.springerlink.com/content/978-3-540-68942-3},
}

@Article{Bondhugula:2008:PAP,
  author =       {Uday Bondhugula and Albert Hartono and J. Ramanujam
                 and P. Sadayappan},
  title =        {A practical automatic polyhedral parallelizer and
                 locality optimizer},
  journal =      j-SIGPLAN,
  volume =       {43},
  number =       {6},
  pages =        {101--113},
  month =        jun,
  year =         {2008},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/1379022.1375595},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Wed Jun 18 11:04:53 MDT 2008},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {We present the design and implementation of an
                 automatic polyhedral source-to-source transformation
                 framework that can optimize regular programs (sequences
                 of possibly imperfectly nested loops) for parallelism
                 and locality simultaneously. Through this work, we show
                 the practicality of analytical model-driven automatic
                 transformation in the polyhedral model -- far beyond
                 what is possible by current production compilers.
                 Unlike previous works, our approach is an end-to-end
                 fully automatic one driven by an integer linear
                 optimization framework that takes an explicit view of
                 finding good ways of tiling for parallelism and
                 locality using affine transformations. The framework
                 has been implemented into a tool to automatically
                 generate OpenMP parallel code from C program sections.
                 Experimental results from the tool show very high
                 speedups for local and parallel execution on
                 multi-cores over state-of-the-art compiler frameworks
                 from the research community as well as the best native
                 production compilers. The system also enables the easy
                 use of powerful empirical/iterative optimization for
                 general arbitrarily nested loop sequences.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIG{\-}PLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  keywords =     {affine transformations; automatic parallelization;
                 locality optimization; loop transformations; polyhedral
                 model; tiling},
}

@Article{Buntinas:2008:BVN,
  author =       {Darius Buntinas and Camille Coti and Thomas Herault
                 and Pierre Lemarinier and Laurence Pilard and Ala
                 Rezmerita and Eric Rodriguez and Franck Cappello},
  title =        {Blocking vs. non-blocking coordinated checkpointing
                 for large-scale fault tolerant {MPI} Protocols},
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       {24},
  number =       {1},
  pages =        {73--84},
  month =        jan,
  year =         {2008},
  CODEN =        {FGSEVI},
  ISSN =         {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L =       {0167-739X},
  bibdate =      {Sat Sep 11 13:08:10 MDT 2010},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X},
  acknowledgement = ack-nhfb,
  fjournal =     {Future Generation Computer Systems},
  journal-URL =  {http://www.sciencedirect.com/science/journal/0167739X},
}

@Book{Chapman:2008:UOP,
  author =       {Barbara Chapman and Gabriele Jost and Ruud van der
                 Pas},
  title =        {Using {OpenMP}: portable shared memory parallel
                 programming},
  publisher =    pub-MIT,
  address =      pub-MIT:adr,
  pages =        {xxii + 353},
  year =         {2008},
  ISBN =         {0-262-03377-1 (hardcover), 0-262-53302-2 (paperback)},
  ISBN-13 =      {978-0-262-03377-0 (hardcover), 978-0-262-53302-7
                 (paperback)},
  LCCN =         {QA76.642 .C49 2008},
  bibdate =      {Sat Oct 5 07:59:33 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager},
  series =       {Scientific and engineering computation},
  URL =          {http://www.loc.gov/catdir/toc/ecip0721/2007026656.html},
  abstract =     {This title presents a comprehensive overview of
                 OpenMP, the standard application programming interface
                 for shared memory parallel computing - a reference for
                 students and professionals. OpenMP, a portable
                 programming interface for shared memory parallel
                 computers, was adopted as an informal standard in 1997
                 by computer scientists who wanted a unified model on
                 which to base programs for shared memory systems.
                 OpenMP is now used by many software developers; it
                 offers significant advantages over both hand-threading
                 and MPI. ``Using OpenMP'' offers a comprehensive
                 introduction to parallel programming concepts and a
                 detailed overview of OpenMP. ``Using OpenMP'' discusses
                 hardware developments, describes where OpenMP is
                 applicable, and compares OpenMP to other programming
                 interfaces for shared and distributed memory parallel
                 architectures. It introduces the individual features of
                 OpenMP, provides many source code examples that
                 demonstrate the use and functionality of the language
                 constructs, and offers tips on writing an efficient
                 OpenMP program. It describes how to use OpenMP in
                 full-scale applications to achieve high performance on
                 large-scale architectures, discussing several case
                 studies in detail, and offers in-depth troubleshooting
                 advice.},
  acknowledgement = ack-nhfb,
  author-dates = {1954--},
  subject =      {Parallel programming (Computer science); Application
                 program interfaces (Computer software)},
  tableofcontents = {1. Introduction \\
                 2. Overview of OpenMP \\
                 3. Writing a First OpenMP Program \\
                 4. OpenMP Language Features \\
                 5. How to Get Good Performance by Using OpenMP \\
                 6. Using OpenMP in the Real World \\
                 7. Troubleshooting \\
                 8. Under the Hood: How OpenMP Really Works \\
                 9. The Future of OpenMP},
}

@Article{Che:2008:PSG,
  author =       {Shuai Che and Michael Boyer and Jiayuan Meng and David
                 Tarjan and Jeremy W. Sheaffer and Kevin Skadron},
  title =        {A performance study of general-purpose applications on
                 graphics processors using {CUDA}},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {68},
  number =       {10},
  pages =        {1370--1380},
  month =        oct,
  year =         {2008},
  CODEN =        {JPDCER},
  ISSN =         {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L =       {0743-7315},
  bibdate =      {Wed Sep 1 16:27:23 MDT 2010},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Dalcin:2008:MPP,
  author =       "Lisandro Dalc{\'\i}n and Rodrigo Paz and Mario Storti
                 and Jorge D'El{\'\i}a",
  title =        "{MPI} for {Python}: Performance improvements and
                 {MPI-2} extensions",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "68",
  number =       "5",
  pages =        "655--662",
  month =        may,
  year =         "2008",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1016/j.jpdc.2007.09.005",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:36 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{DiMartino:2008:SSG,
  author =       {Beniamino {Di Martino} and Dieter Kranzlm{\"u}ller and
                 Jack Dongarra},
  title =        {Special section: {Grid} computing and the {Message
                 Passing Interface}},
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       {24},
  number =       {2},
  pages =        {119--120},
  month =        feb,
  year =         {2008},
  CODEN =        {FGSEVI},
  ISSN =         {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L =       {0167-739X},
  bibdate =      {Sat Sep 11 13:08:11 MDT 2010},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X},
  acknowledgement = ack-nhfb,
  fjournal =     {Future Generation Computer Systems},
  journal-URL =  {http://www.sciencedirect.com/science/journal/0167739X},
}

@Article{Faraj:2008:SPA,
  author =       {Ahmad Faraj and Pitch Patarasuk and Xin Yuan},
  title =        {A Study of Process Arrival Patterns for {MPI}
                 Collective Operations},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {36},
  number =       {6},
  pages =        {543--570},
  month =        dec,
  year =         {2008},
  CODEN =        {IJPPE5},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Sep 1 16:06:46 MDT 2010},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=6;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=6&spage=543},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Freeh:2008:JTD,
  author =       {Vincent W. Freeh and Nandini Kappiah and David K.
                 Lowenthal and Tyler K. Bletsch},
  title =        {Just-in-time dynamic voltage scaling: Exploiting
                 inter-node slack to save energy in {MPI} programs},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {68},
  number =       {9},
  pages =        {1175--1185},
  month =        sep,
  year =         {2008},
  CODEN =        {JPDCER},
  ISSN =         {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L =       {0743-7315},
  bibdate =      {Wed Sep 1 16:27:22 MDT 2010},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Fujimoto:2008:DMV,
  author =       "Noriyuki Fujimoto",
  title =        "Dense Matrix-Vector Multiplication on the {CUDA}
                 Architecture",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "18",
  number =       "4",
  pages =        "511--530",
  month =        dec,
  year =         "2008",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626408003545",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Gao:2008:GEI,
  author =       {Guang R. Gao and Mitsuhisa Sato and Eduard
                 Ayguad{\'e}},
  title =        {{Guest Editors} Introduction: Special Issue on
                 {OpenMP}},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {36},
  number =       {3},
  pages =        {287--288},
  month =        jun,
  year =         {2008},
  CODEN =        {IJPPE5},
  DOI =          {https://doi.org/10.1007/s10766-008-0076-3},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Jul 9 16:07:10 MDT 2008},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=3&spage=287},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Garland:2008:PCE,
  author =       {Michael Garland and Scott {Le Grand} and John Nickolls
                 and Joshua Anderson and Jim Hardwick and Scott Morton
                 and Everett Phillips and Yao Zhang and Vasily Volkov},
  title =        {Parallel Computing Experiences with {CUDA}},
  journal =      j-IEEE-MICRO,
  volume =       {28},
  number =       {4},
  pages =        {13--27},
  month =        jul # {\slash } # aug,
  year =         {2008},
  CODEN =        {IEMIDZ},
  DOI =          {https://doi.org/10.1109/MM.2008.57},
  ISSN =         {0272-1732 (print), 1937-4143 (electronic)},
  ISSN-L =       {0272-1732},
  bibdate =      {Tue Sep 9 15:18:16 MDT 2008},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Micro},
  journal-URL =  {http://www.computer.org/csdl/mags/mi/index.html},
}

@Article{Genaud:2008:EPC,
  author =       {St{\'e}phane Genaud and Pierre Gan{\c{c}}arski and
                 Guillaume Latu and Alexandre Blansch{\'e} and Choopan
                 Rattanapoka and Damien Vouriot},
  title =        {Exploitation of a parallel clustering algorithm on
                 commodity hardware with {P2P-MPI}},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {43},
  number =       {1},
  pages =        {21--41},
  month =        jan,
  year =         {2008},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-007-0136-2},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Wed Jul 9 17:32:33 MDT 2008},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=43&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=43&issue=1&spage=21},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
  keywords =     {Clustering; Evolutionary algorithms; Grid; Java;
                 Parallel algorithms},
}

@Article{Gregoretti:2008:MGE,
  author =       {F. Gregoretti and G. Laccetti and A. Murli and G.
                 Oliva and U. Scafuri},
  title =        {{MGF}: a grid-enabled {MPI} library},
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       {24},
  number =       {2},
  pages =        {158--165},
  month =        feb,
  year =         {2008},
  CODEN =        {FGSEVI},
  ISSN =         {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L =       {0167-739X},
  bibdate =      {Sat Sep 11 13:08:11 MDT 2010},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X},
  acknowledgement = ack-nhfb,
  fjournal =     {Future Generation Computer Systems},
  journal-URL =  {http://www.sciencedirect.com/science/journal/0167739X},
}

@Article{Ha:2008:NBP,
  author =       "Phuong Hoai Ha and Philippas Tsigas and Otto J.
                 Anshus",
  title =        "Non-blocking programming on multi-core graphics
                 processors: (extended abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "19--28",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556448",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper investigates the synchronization power of
                 coalesced memory accesses, a family of memory access
                 mechanisms introduced in recent large multicore
                 architectures like the CUDA graphics processors. We
                 first design three memory access models to capture the
                 fundamental features of the new memory access
                 mechanisms. Subsequently, we prove the exact
                 synchronization power of these models in terms of their
                 consensus numbers. These tight results show that the
                 coalesced memory access mechanisms can facilitate
                 strong synchronization between the threads of multicore
                 processors, without the need of synchronization
                 primitives other than reads and writes.\par

                 Moreover, based on the intrinsic features of recent GPU
                 architectures, we construct strong synchronization
                 objects like wait-free and t-resilient
                 read-modify-write objects for a general model of recent
                 GPU architectures without strong hardware
                 synchronization primitives like test-and-set and
                 compare-and-swap. Accesses to the wait-free objects
                 have time complexity $ O(N) $, where $N$ is the number
                 of processes. Our result demonstrates that it is
                 possible to construct wait-free synchronization
                 mechanisms for GPUs without the need of strong
                 synchronization primitives in hardware and that
                 wait-free programming is possible for GPUs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Hou:2008:BBS,
  author =       "Qiming Hou and Kun Zhou and Baining Guo",
  title =        "{BSGP}: bulk-synchronous {GPU} programming",
  journal =      j-TOG,
  volume =       "27",
  number =       "3",
  pages =        "19:1--19:??",
  month =        aug,
  year =         "2008",
  CODEN =        "ATGRDF",
  DOI =          "https://doi.org/10.1145/1360612.1360618",
  ISSN =         "0730-0301 (print), 1557-7368 (electronic)",
  ISSN-L =       "0730-0301",
  bibdate =      "Tue Aug 12 13:40:36 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tog/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tog.bib",
  abstract =     "We present BSGP, a new programming language for
                 general purpose computation on the GPU. A BSGP program
                 looks much the same as a sequential C program.
                 Programmers only need to supply a bare minimum of extra
                 information to describe parallel processing on GPUs. As
                 a result, BSGP programs are easy to read, write, and
                 maintain. Moreover, the ease of programming does not
                 come at the cost of performance. A well-designed BSGP
                 compiler converts BSGP programs to kernels and combines
                 them using optimally allocated temporary streams. In
                 our benchmark, BSGP programs achieve similar or better
                 performance than well-optimized CUDA programs, while
                 the source code complexity and programming time are
                 significantly reduced. To test BSGP's code efficiency
                 and ease of programming, we implemented a variety of
                 GPU applications, including a highly sophisticated X3D
                 parser that would be extremely difficult to develop
                 with existing GPU programming languages.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Graphics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J778",
  keywords =     "bulk synchronous parallel programming; programmable
                 graphics hardware; stream processing; thread
                 manipulation",
}

@incollection{Howes:2008:U,
  author =       {L. Howes and D. B. Thomas},
  title =        {Efficient Random Number Generation and Application
                 Using {CUDA}},
  crossref =     {Nguyen:2008:GG},
  chapter =      {37},
  pages =        {805--830},
  year =         {2008},
  bibdate =      {Sat Feb 08 18:40:34 2014},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  keywords =     {random-number generation},
}

@article{Huang:2008:FPM,
  author =       {Jih-Woei Huang and Chih-Ping Chu},
  title =        {A flexible processor mapping technique toward data
                 localization for block-cyclic data redistribution},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {45},
  number =       {2},
  pages =        {151--172},
  month =        aug,
  year =         {2008},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-007-0166-9},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Wed Jul 9 17:32:35 MDT 2008},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=45&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=45&issue=2&spage=151},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
  keywords =     {Data distribution; Data-parallel programming;
                 Distributed memory multicomputers; HPF; MPI; Processor
                 mapping},
}

@article{Jeun:2008:OPB,
  author =       {Woo-Chul Jeun and Yang-Suk Kee and Soonhoi Ha and
                 Changdon Kee},
  title =        {Overcoming performance bottlenecks in using {OpenMP}
                 on {SMP} clusters},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {34},
  number =       {10},
  pages =        {570--592},
  month =        oct,
  year =         {2008},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Thu Sep 2 17:51:09 MDT 2010},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Jin:2008:PEM,
  author =       {Haoqiang Jin and Barbara Chapman and Lei Huang and
                 Dieter an Mey and Thomas Reichstein},
  title =        {Performance Evaluation of a Multi-Zone Application in
                 Different {OpenMP} Approaches},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {36},
  number =       {3},
  pages =        {312--325},
  month =        jun,
  year =         {2008},
  CODEN =        {IJPPE5},
  DOI =          {https://doi.org/10.1007/s10766-008-0074-5},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Jul 9 16:07:10 MDT 2008},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=3&spage=312},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
  keywords =     {Multi-level parallelism; OpenMP extensions;
                 Performance evaluation},
}

@article{Kwon:2008:RPP,
  author =       {Seongnam Kwon and Yongjoo Kim and Woo-Chul Jeun and
                 Soonhoi Ha and Yunheung Paek},
  title =        {A retargetable parallel-programming framework for
                 {MPSoC}},
  journal =      j-TODAES,
  volume =       {13},
  number =       {3},
  pages =        {39:1--39:??},
  month =        jul,
  year =         {2008},
  CODEN =        {ATASFO},
  DOI =          {https://doi.org/10.1145/1367045.1367048},
  ISSN =         {1084-4309 (print), 1557-7309 (electronic)},
  ISSN-L =       {1084-4309},
  bibdate =      {Tue Aug 5 18:41:27 MDT 2008},
  bibsource =    {http://www.acm.org/pubs/contents/journals/todaes/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {As more processing elements are integrated in a single
                 chip, embedded software design becomes more
                 challenging: It becomes a parallel programming for
                 nontrivial heterogeneous multiprocessors with diverse
                 communication architectures, and design constraints
                 such as hardware cost, power, and timeliness. In the
                 current practice of parallel programming with MPI or
                 OpenMP, the programmer should manually optimize the
                 parallel code for each target architecture and for the
                 design constraints. Thus, the design-space exploration
                 of MPSoC (multiprocessor systems-on-chip) costs become
                 prohibitively large as software development overhead
                 increases drastically. To solve this problem, we
                 develop a parallel-programming framework based on a
                 novel programming model called common intermediate code
                 (CIC). In a CIC, functional parallelism and data
                 parallelism of application tasks are specified
                 independently of the target architecture and design
                 constraints. Then, the CIC translator translates the
                 CIC into the final parallel code, considering the
                 target architecture and design constraints to make the
                 CIC retargetable. Experiments with preliminary
                 examples, including the H.263 decoder, show that the
                 proposed parallel-programming framework increases the
                 design productivity of MPSoC software significantly.},
  acknowledgement = ack-nhfb,
  articleno =    {39},
  fjournal =     {ACM Transactions on Design Automation of Electronic
                 Systems (TODAES)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J776},
  keywords =     {design-space exploration; embedded software;
                 multiprocessor system on chip; parallel-programming;
                 software generation},
}

@article{Liu:2008:AMD,
  author =       {Weiguo Liu and Bertil Schmidt and Gerrit Voss and
                 Wolfgang M{\"u}ller-Wittig},
  title =        {Accelerating molecular dynamics simulations using
                 Graphics Processing Units with {CUDA}},
  journal =      j-COMP-PHYS-COMM,
  volume =       {179},
  number =       {9},
  pages =        {634--641},
  day =          {1},
  month =        nov,
  year =         {2008},
  CODEN =        {CPHCBZ},
  DOI =          {https://doi.org/10.1016/j.cpc.2008.05.008},
  ISSN =         {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L =       {0010-4655},
  bibdate =      {Mon Feb 13 23:42:37 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0010465508002191},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Luckow:2008:MFT,
  author =       {Andr{\'e} Luckow and Bettina Schnor},
  title =        {{Migol}: a fault-tolerant service framework for {MPI}
                 applications in the {Grid}},
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       {24},
  number =       {2},
  pages =        {142--152},
  month =        feb,
  year =         {2008},
  CODEN =        {FGSEVI},
  ISSN =         {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L =       {0167-739X},
  bibdate =      {Sat Sep 11 13:08:11 MDT 2010},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X},
  acknowledgement = ack-nhfb,
  fjournal =     {Future Generation Computer Systems},
  journal-URL =  {http://www.sciencedirect.com/science/journal/0167739X},
}

@Article{Milovanovic:2008:NEE,
  author =       "Milo{\v{s}} Milovanovi{\'c} and Roger Ferrer and Vladimir
                 Gajinov and Osman S. Unsal and Adrian Cristal and
                 Eduard Ayguad{\'e} and Mateo Valero",
  title =        "{Nebelung}: Execution Environment for Transactional
                 {OpenMP}",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "36",
  number =       "3",
  pages =        "326--346",
  month =        jun,
  year =         "2008",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-008-0073-6",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:07:10 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=3&spage=326",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Compiler; OpenMP; Runtime system; Software
                 Transactional Memory",
}

@article{Nickolls:2008:SPP,
  author =       {John Nickolls and Ian Buck and Michael Garland and
                 Kevin Skadron},
  title =        {Scalable parallel programming with {CUDA}},
  journal =      j-QUEUE,
  volume =       {6},
  number =       {2},
  pages =        {40--53},
  month =        mar,
  year =         {2008},
  CODEN =        {AQCUAE},
  DOI =          {https://doi.org/10.1145/1365490.1365500},
  ISSN =         {1542-7730 (print), 1542-7749 (electronic)},
  ISSN-L =       {1542-7730},
  bibdate =      {Fri Jun 20 11:18:38 MDT 2008},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/queue.bib},
  abstract =     {Is CUDA the parallel programming model that
                 application developers have been waiting for?},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM Queue: Tomorrow's Computing Today},
}

@article{Noble:2008:GMY,
  author =       {Michael S. Noble},
  title =        {Getting more from your multicore: exploiting {OpenMP}
                 from an open-source numerical scripting language},
  journal =      j-CCPE,
  volume =       {20},
  number =       {16},
  pages =        {1877--1891},
  month =        nov,
  year =         {2008},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.1296},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Mon Dec 5 10:08:28 MST 2011},
  bibsource =    {http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency and Computation: Prac\-tice and
                 Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {4 Jun 2008},
}

@Article{OBrien:2008:SOC,
  author =       "Kevin O'Brien and Kathryn O'Brien and Zehra Sura and
                 Tong Chen and Tao Zhang",
  title =        "Supporting {OpenMP} on {Cell}",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "36",
  number =       "3",
  pages =        "289--311",
  month =        jun,
  year =         "2008",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-008-0072-7",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:07:10 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=3&spage=289",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Data transfer; Heterogeneous architecture; OpenMP;
                 Thread synchronization",
}

@article{Patrick:2008:CEO,
  author =       {Christina M. Patrick and SeungWoo Son and Mahmut
                 Kandemir},
  title =        {Comparative evaluation of overlap strategies with
                 study of {I/O} overlap in {MPI-IO}},
  journal =      j-OPER-SYS-REV,
  volume =       {42},
  number =       {6},
  pages =        {43--49},
  month =        oct,
  year =         {2008},
  CODEN =        {OSRED8},
  DOI =          {https://doi.org/10.1145/1453775.1453784},
  ISSN =         {0163-5980 (print), 1943-586X (electronic)},
  ISSN-L =       {0163-5980},
  bibdate =      {Thu Oct 23 14:23:29 MDT 2008},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {Many scientific applications use parallel I/O to meet
                 the low latency and high bandwidth I/O requirement.
                 Among many available parallel I/O operations,
                 collective I/O is one of the most popular methods when
                 the storage layouts and access patterns of data do not
                 match. The implementation of collective I/O typically
                 involves disk I/O operations followed by interprocessor
                 communications. Also, in many I/O-intensive
                 applications, parallel I/O operations are usually
                 followed by parallel computations. This paper presents
                 a comparative study of different overlap strategies in
                 parallel applications. We have experimented with four
                 different overlap strategies (1) Overlapping I/O and
                 communication; (2) Overlapping I/O and computation; (3)
                 Overlapping computation and communication; and (4)
                 Overlapping I/O, communication, and computation. All
                 experiments have been conducted on a Linux Cluster and
                 the performance results obtained are very encouraging.
                 On an average, we have enhanced the performance of a
                 generic collective read call by 38\%, the MxM benchmark
                 by 26\%, and the FFT benchmark by 34\%.},
  acknowledgement = ack-nhfb,
  fjournal =     {Operating Systems Review},
}

@Article{Rodriguez:2008:FTS,
  author =       "Gabriel Rodr{\'\i}guez and Xo{\'a}n C. Pardo and
                 Mar{\'\i}a J. Mart{\'\i}n and Patricia Gonz{\'a}lez and
                 Daniel D{\'\i}az",
  title =        "A Fault Tolerance Solution for Sequential and {MPI}
                 Applications on the {Grid}",
  journal =      j-SCPE,
  volume =       "9",
  number =       "2",
  pages =        "101--109",
  month =        jun,
  year =         "2008",
  CODEN =        "????",
  ISSN =         "1895-1767",
  bibdate =      "Thu Sep 2 11:55:11 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.scpe.org/content/9/2.toc",
  URL =          "http://www.scpe.org/vols/vol09/no2/SCPE_9_2_03.pdf;
                 http://www.scpe.org/vols/vol09/no2/SCPE_9_2_03.zip",
  acknowledgement = ack-nhfb,
  fjournal =     "Scalable Computing: Practice and Experience",
}

@article{Rolfe:2008:PFO,
  author =       {Timothy J. Rolfe},
  title =        {Perverse and foolish oft {I} strayed},
  journal =      j-SIGCSE,
  volume =       {40},
  number =       {2},
  pages =        {52--55},
  month =        jun,
  year =         {2008},
  CODEN =        {SIGSD3},
  DOI =          {https://doi.org/10.1145/1383602.1383634},
  ISSN =         {0097-8418 (print), 2331-3927 (electronic)},
  ISSN-L =       {0097-8418},
  bibdate =      {Sat Nov 17 15:44:13 MST 2012},
  bibsource =    {DBLP;
                 http://dblp.uni-trier.de/db/journals/sigcse/sigcse40.html#Rolfe08;
                 http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigcse2000.bib},
  URL =          {ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Misc/DBLP/2008.bib},
  abstract =     {This uses a massively wrong-headed algorithm for
                 sorting to exemplify the use of the backtracking
                 strategy and the branch-and-bound strategy. In
                 addition, brief notes are included on parallel
                 processing approaches: Java threads on multi-core
                 computers and distributed processing through such
                 message passing systems as PVM and MPI.},
  acknowledgement = ack-nhfb,
  fjournal =     {SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J688},
}

@Article{Rolfe:2008:SMA,
  author =       "Timothy J. Rolfe",
  title =        "A specimen {MPI} application: {$N$}-Queens in
                 parallel",
  journal =      j-SIGCSE,
  volume =       "40",
  number =       "4",
  pages =        "42--45",
  month =        dec,
  year =         "2008",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/1473195.1473217",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Sat Nov 17 15:44:17 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigcse2000.bib",
  abstract =     "The generalized problem of placing $n$ queens on an
                 $n$-by-$n$ board provides an ``embarrassingly parallel''
                 problem for parallel solution. This paper expands on
                 the discussion presented in the May 2005 issue of Dr.
                 Dobb's Journal [1], specifically taking the parallel
                 execution through Java threads and bringing it into an
                 application in C taking advantage of MPI.",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

@Article{Sala:2008:PHP,
  author =       "Marzio Sala and W. F. Spotz and M. A. Heroux",
  title =        "{PyTrilinos}: {High-performance} distributed-memory
                 solvers for {Python}",
  journal =      j-TOMS,
  volume =       "34",
  number =       "2",
  pages =        "7:1--7:33",
  month =        mar,
  year =         "2008",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1326548.1326549",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Thu Jun 12 12:47:31 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "PyTrilinos is a collection of Python modules that are
                 useful for serial and parallel scientific computing.
                 This collection contains modules that cover serial and
                 parallel dense linear algebra, serial and parallel
                 sparse linear algebra, direct and iterative linear
                 solution techniques, domain decomposition and
                 multilevel preconditioners, nonlinear solvers, and
                 continuation algorithms. Also included are a variety of
                 related utility functions and classes, including
                 distributed I/O, coloring algorithms, and matrix
                 generation. PyTrilinos vector objects are integrated
                 with the popular NumPy Python module, gathering
                 together a variety of high-level distributed computing
                 operations with serial vector
                 operations.\par

                 PyTrilinos is a set of interfaces to existing, compiled
                 libraries. This hybrid framework uses Python as
                 front-end, and efficient precompiled libraries for all
                 computationally expensive tasks. Thus, we take
                 advantage of both the flexibility and ease of use of
                 Python, and the efficiency of the underlying C++, C,
                 and FORTRAN numerical kernels. Our numerical results
                 show that, for many important problem classes, the
                 overhead required by the Python interpreter is
                 negligible.\par

                 To run in parallel, PyTrilinos simply requires a
                 standard Python interpreter. The fundamental MPI calls
                 are encapsulated under an abstract layer that manages
                 all interprocessor communications. This makes serial
                 and parallel scripts using PyTrilinos virtually
                 identical.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Mathematical Software",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
  keywords =     "direct solvers; multilevel preconditioners; nonlinear
                 solvers; object-oriented programming; script
                 languages",
}

@article{Schmitz:2008:IIG,
  author =       {Arne Schmitz and Markus Tavenrath and Leif Kobbelt},
  title =        {Illumination: Interactive Global Illumination for
                 Deformable Geometry in {CUDA}},
  journal =      j-CGF,
  volume =       {27},
  number =       {7},
  pages =        {1979--1986},
  month =        oct,
  year =         {2008},
  CODEN =        {CGFODY},
  DOI =          {https://doi.org/10.1111/j.1467-8659.2008.01347.x},
  ISSN =         {0167-7055 (print), 1467-8659 (electronic)},
  ISSN-L =       {0167-7055},
  bibdate =      {Sat May 11 13:27:05 MDT 2013},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/cgf.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Com{\-}pu{\-}ter Graphics Forum},
  journal-URL =  {http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1467-8659/},
  onlinedate =   {23 Jan 2009},
}

@article{Siegel:2008:CSE,
  author =       {Stephen F. Siegel and Anastasia Mironova and George S.
                 Avrunin and Lori A. Clarke},
  title =        {Combining symbolic execution with model checking to
                 verify parallel numerical programs},
  journal =      j-TOSEM,
  volume =       {17},
  number =       {2},
  pages =        {10:1--10:??},
  month =        apr,
  year =         {2008},
  CODEN =        {ATSMER},
  DOI =          {https://doi.org/10.1145/1348250.1348256},
  ISSN =         {1049-331X (print), 1557-7392 (electronic)},
  ISSN-L =       {1049-331X},
  bibdate =      {Mon Jun 16 11:13:13 MDT 2008},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tosem/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {We present a method to verify the correctness of
                 parallel programs that perform complex numerical
                 computations, including computations involving
                 floating-point arithmetic. This method requires that a
                 sequential version of the program be provided, to serve
                 as the specification for the parallel one. The key idea
                 is to use model checking, together with symbolic
                 execution, to establish the equivalence of the two
                 programs. In this approach the path condition from
                 symbolic execution of the sequential program is used to
                 constrain the search through the parallel program. To
                 handle floating-point operations, three different types
                 of equivalence are supported. Several examples are
                 presented, demonstrating the approach and actual errors
                 that were found. Limitations and directions for future
                 research are also described.},
  acknowledgement = ack-nhfb,
  articleno =    {10},
  fjournal =     {ACM Transactions on Software Engineering and
                 Methodology},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J790},
  keywords =     {concurrency; Finite-state verification;
                 floating-point; high performance computing; Message
                 Passing Interface; model checking; MPI; numerical
                 program; parallel programming; Spin; symbolic
                 execution},
}

@Article{Valencia:2008:PPR,
  author =       "David Valencia and Alexey Lastovetsky and Maureen
                 O'Flynn and Antonio Plaza and Javier Plaza",
  title =        "Parallel Processing of Remotely Sensed Hyperspectral
                 Images on Heterogeneous Networks of Workstations Using
                 {HeteroMPI}",
  journal =      j-IJHPCA,
  volume =       "22",
  number =       "4",
  pages =        "386--407",
  month =        nov,
  year =         "2008",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342007088377",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/22/4.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/22/4/386.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{VanZee:2008:SPF,
  author =       "Field G. {Van Zee} and Paolo Bientinesi and Tze Meng
                 Low and Robert A. van de Geijn",
  title =        "Scalable parallelization of {FLAME} code via the
                 workqueuing model",
  journal =      j-TOMS,
  volume =       "34",
  number =       "2",
  pages =        "10:1--10:29",
  month =        mar,
  year =         "2008",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1326548.1326552",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Thu Jun 12 12:47:31 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "We discuss the OpenMP parallelization of linear
                 algebra algorithms that are coded using the Formal
                 Linear Algebra Methods Environment (FLAME) API. This
                 API expresses algorithms at a higher level of
                 abstraction, avoids the use of loop and array indices, and
                 represents these algorithms as they are formally
                 derived and presented. We report on two implementations
                 of the workqueuing model, neither of which requires the
                 use of explicit indices to specify parallelism. The
                 first implementation uses the experimental taskq
                 pragma, which may influence the adoption of a similar
                 construct into OpenMP 3.0. The second workqueuing
                 implementation is domain-specific to FLAME but allows
                 us to illustrate the benefits of sorting tasks
                 according to their computational cost prior to parallel
                 execution. In addition, we discuss how scalable
                 parallelization of dense linear algebra algorithms via
                 OpenMP will require a two-dimensional partitioning of
                 operands much like a 2D data distribution is needed on
                 distributed memory architectures. We illustrate the
                 issues and solutions by discussing the parallelization
                 of the symmetric rank-$k$ update and report impressive
                 performance on an SGI system with 14 Itanium2
                 processors.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Mathematical Software",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
  keywords =     "FLAME; OpenMP; parallel; scalability; SMP;
                 workqueuing",
}

@Article{Wang:2008:PIM,
  author =       "Kun Wang and Yu Zhang and Huayong Wang and Xiaowei
                 Shen",
  title =        "Parallelization of {IBM Mambo} system simulator in
                 functional modes",
  journal =      j-OPER-SYS-REV,
  volume =       "42",
  number =       "1",
  pages =        "71--76",
  month =        jan,
  year =         "2008",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1341312.1341325",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Jun 20 17:19:29 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Mambo is IBM's full-system simulator which models
                 PowerPC systems, and provides a complete set of
                 simulation tools to help IBM and its partners in
                 pre-hardware development and performance evaluation for
                 future systems. Currently Mambo simulates target
                 systems on a single host thread. When the number of
                 cores increases in a target system, Mambo's simulation
                 performance for each core goes down. As the so-called
                 `multi-core era' approaches, both target and host
                 systems will have more and more cores. It is very
                 important for Mambo to efficiently simulate a
                 multi-core target system on a multi-core host system.
                 Parallelization is a natural method to speed up Mambo
                 under this situation.\par

                 Parallel Mambo (P-Mambo) is a multi-threaded
                 implementation of Mambo. Mambo's simulation engine is
                 implemented as a user-level thread-scheduler. We
                 propose a multi-scheduler method to adapt Mambo's
                 simulation engine to multi-threaded execution. Based on
                 this method a core-based module partition is proposed
                 to achieve both high inter-scheduler parallelism and
                 low inter-scheduler dependency. Protection of shared
                 resources is crucial to both correctness and
                 performance of P-Mambo. Since there are two tiers of
                 threads in P-Mambo, protecting shared resources by only
                 OS-level locks possibly introduces deadlocks due to
                 user-level context switch. We propose a new lock
                 mechanism to handle this problem. Since Mambo is an
                 on-going project with many modules currently under
                 development, co-existence with new modules is also
                 important to P-Mambo. We propose a global-lock-based
                 method to guarantee compatibility of P-Mambo with
                 future Mambo modules.\par

                 We have implemented the first version of P-Mambo in
                 functional modes. The performance of P-Mambo has been
                 evaluated on the OpenMP implementation of NAS Parallel
                 Benchmark (NPB) 3.2. Preliminary experimental
                 results show that P-Mambo achieves an average speedup
                 of 3.4 on a 4-core host machine.",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
  keywords =     "architectural simulation; dynamic binary translation;
                 parallel simulation",
}

@Article{Wegiel:2008:MCVa,
  author =       "Michal Wegiel and Chandra Krintz",
  title =        "The {Mapping Collector}: virtual memory support for
                 generational, parallel, and concurrent compaction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "91--102",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353535.1346294",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Parallel and concurrent garbage collectors are
                 increasingly employed by managed runtime environments
                 (MREs) to maintain scalability, as multi-core
                 architectures and multi-threaded applications become
                 pervasive. Moreover, state-of-the-art MREs commonly
                 implement compaction to eliminate heap fragmentation
                 and enable fast linear object allocation.\par

                 Our empirical analysis of object demographics reveals
                 that unreachable objects in the heap tend to form
                 clusters large enough to be effectively managed at the
                 granularity of virtual memory pages. Even though
                 processes can manipulate the mapping of the virtual
                 address space through the standard operating system
                 (OS) interface on most platforms, extant
                 parallel/concurrent compactors do not do so to exploit
                 this clustering behavior and instead achieve compaction
                 by performing, relatively expensive, object moving and
                 pointer adjustment.\par

                 We introduce the Mapping Collector (MC), which
                 leverages virtual memory operations to reclaim and
                 consolidate free space without moving objects and
                 updating pointers. MC is a nearly-single-phase
                 compactor that is simpler and more efficient than
                 previously reported compactors that comprise two to
                 four phases. Through effective MRE-OS coordination, MC
                 maintains the simplicity of a non-moving collector
                 while providing efficient parallel and concurrent
                 compaction.\par

                 We implement both stop-the-world and concurrent MC in a
                 generational garbage collection framework within the
                 open-source HotSpot Java Virtual Machine. Our
                 experimental evaluation using a multiprocessor
                 indicates that MC significantly increases throughput
                 and scalability as well as reduces pause times,
                 relative to state-of-the-art, parallel and concurrent
                 compactors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "compaction; concurrent; parallel; virtual memory",
}

@Article{Wegiel:2008:MCVb,
  author =       {Michal Wegiel and Chandra Krintz},
  title =        {The {Mapping Collector}: virtual memory support for
                 generational, parallel, and concurrent compaction},
  journal =      j-OPER-SYS-REV,
  volume =       {42},
  number =       {2},
  pages =        {91--102},
  month =        mar,
  year =         {2008},
  CODEN =        {OSRED8},
  DOI =          {https://doi.org/10.1145/1353535.1346294},
  ISSN =         {0163-5980 (print), 1943-586X (electronic)},
  ISSN-L =       {0163-5980},
  bibdate =      {Fri Jun 20 17:20:12 MDT 2008},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {Parallel and concurrent garbage collectors are
                 increasingly employed by managed runtime environments
                 (MREs) to maintain scalability, as multi-core
                 architectures and multi-threaded applications become
                 pervasive. Moreover, state-of-the-art MREs commonly
                 implement compaction to eliminate heap fragmentation
                 and enable fast linear object allocation.\par

                 Our empirical analysis of object demographics reveals
                 that unreachable objects in the heap tend to form
                 clusters large enough to be effectively managed at the
                 granularity of virtual memory pages. Even though
                 processes can manipulate the mapping of the virtual
                 address space through the standard operating system
                 (OS) interface on most platforms, extant
                 parallel/concurrent compactors do not do so to exploit
                 this clustering behavior and instead achieve compaction
                 by performing, relatively expensive, object moving and
                 pointer adjustment.\par

                 We introduce the Mapping Collector (MC), which
                 leverages virtual memory operations to reclaim and
                 consolidate free space without moving objects and
                 updating pointers. MC is a nearly-single-phase
                 compactor that is simpler and more efficient than
                 previously reported compactors that comprise two to
                 four phases. Through effective MRE-OS coordination, MC
                 maintains the simplicity of a non-moving collector
                 while providing efficient parallel and concurrent
                 compaction.\par

                 We implement both stop-the-world and concurrent MC in a
                 generational garbage collection framework within the
                 open-source HotSpot Java Virtual Machine. Our
                 experimental evaluation using a multiprocessor
                 indicates that MC significantly increases throughput
                 and scalability as well as reduces pause times,
                 relative to state-of-the-art, parallel and concurrent
                 compactors.},
  acknowledgement = ack-nhfb,
  fjournal =     {Operating Systems Review},
  keywords =     {compaction; concurrent; parallel; virtual memory},
}

@Article{Wegiel:2008:MCVc,
  author =       "Michal Wegiel and Chandra Krintz",
  title =        "The {Mapping Collector}: virtual memory support for
                 generational, parallel, and concurrent compaction",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "3",
  pages =        "91--102",
  month =        mar,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1353535.1346294",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:03:40 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Parallel and concurrent garbage collectors are
                 increasingly employed by managed runtime environments
                 (MREs) to maintain scalability, as multi-core
                 architectures and multi-threaded applications become
                 pervasive. Moreover, state-of-the-art MREs commonly
                 implement compaction to eliminate heap fragmentation
                 and enable fast linear object allocation.\par

                 Our empirical analysis of object demographics reveals
                 that unreachable objects in the heap tend to form
                 clusters large enough to be effectively managed at the
                 granularity of virtual memory pages. Even though
                 processes can manipulate the mapping of the virtual
                 address space through the standard operating system
                 (OS) interface on most platforms, extant
                 parallel/concurrent compactors do not do so to exploit
                 this clustering behavior and instead achieve compaction
                 by performing, relatively expensive, object moving and
                 pointer adjustment.\par

                 We introduce the Mapping Collector (MC), which
                 leverages virtual memory operations to reclaim and
                 consolidate free space without moving objects and
                 updating pointers. MC is a nearly-single-phase
                 compactor that is simpler and more efficient than
                 previously reported compactors that comprise two to
                 four phases. Through effective MRE-OS coordination, MC
                 maintains the simplicity of a non-moving collector
                 while providing efficient parallel and concurrent
                 compaction.\par

                 We implement both stop-the-world and concurrent MC in a
                 generational garbage collection framework within the
                 open-source HotSpot Java Virtual Machine. Our
                 experimental evaluation using a multiprocessor
                 indicates that MC significantly increases throughput
                 and scalability as well as reduces pause times,
                 relative to state-of-the-art, parallel and concurrent
                 compactors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "compaction; concurrent; parallel; virtual memory",
}

@Article{Yang:2008:DPL,
  author =       {Chao-Tung Yang and Wen-Chung Shih and Shian-Shyong
                 Tseng},
  title =        {Dynamic partitioning of loop iterations on
                 heterogeneous {PC} clusters},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {44},
  number =       {1},
  pages =        {1--23},
  month =        apr,
  year =         {2008},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-007-0146-0},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Wed Jul 9 17:32:34 MDT 2008},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=44&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=44&issue=1&spage=1},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
  keywords =     {Cluster computing; Heterogeneous; MPI programming;
                 Parallel loops; PC clusters; Self-scheduling},
}

@Article{Ayguade:2009:DOT,
  author =       "Eduard Ayguad{\'e} and Nawal Copty and Alejandro Duran
                 and Jay Hoeflinger and Yuan Lin and Federico Massaioli
                 and Xavier Teruel and Priya Unnikrishnan and Guansong
                 Zhang",
  title =        "The Design of {OpenMP} Tasks",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "20",
  number =       "3",
  pages =        "404--418",
  month =        mar,
  year =         "2009",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2008.105",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu May 13 12:06:56 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Bikshandi:2009:EPI,
  author =       {Ganesh Bikshandi and Jose G. Castanos and Sreedhar B.
                 Kodali and V. Krishna Nandivada and Igor Peshansky and
                 Vijay A. Saraswat and Sayantan Sur and Pradeep Varma
                 and Tong Wen},
  title =        {Efficient, portable implementation of asynchronous
                 multi-place programs},
  journal =      j-SIGPLAN,
  volume =       {44},
  number =       {4},
  pages =        {271--282},
  month =        apr,
  year =         {2009},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/1594835.1504215},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Fri Oct 9 08:40:49 MDT 2009},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {The X10 programming language is organized around the
                 notion of places (an encapsulation of data and
                 activities operating on the data), partitioned global
                 address space (PGAS), and asynchronous computation and
                 communication.\par

                 This paper introduces an expressive subset of X10, Flat
                 X10, designed to permit efficient execution across
                 multiple single-threaded places with a simple runtime
                 and without compromising on the productivity of X10. We
                 present the design, implementation and evaluation of a
                 compiler and runtime system for Flat X10. The Flat X10
                 compiler translates programs into C++ SPMD programs
                 communicating using an active messaging infrastructure.
                 It uses novel techniques to transform explicitly
                 parallel programs into SPMD programs. The runtime
                 system is based on IBM's LAPI (Low-level API) and is
                 easily portable to other libraries such as GASNet and
                 ARMCI.\par

                 Our implementation realizes performance comparable to
                 hand-written MPI programs for well-known HPC benchmarks
                 such as Random Access, Stream, and FFT, on a
                 Federation-based cluster of Power5 SMPs (with hundreds
                 of processors) and the Blue Gene (with thousands of
                 processors). Submissions based on the work presented in
                 this paper were co-winners of the 2007 and 2008 HPC
                 Challenge Type II Awards.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIG{\-}PLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  keywords =     {APGAS; asynchrony; compiler; FFT; HPC; HPC challenge;
                 PGAS; random access; runtime; SPMD; stream; X10},
}

@Article{Bronevetsky:2009:CAC,
  author =       {Greg Bronevetsky and John Gyllenhaal and Bronis R. de
                 Supinski},
  title =        {{CLOMP}: Accurately Characterizing {OpenMP}
                 Application Overheads},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {37},
  number =       {3},
  pages =        {250--265},
  month =        jun,
  year =         {2009},
  CODEN =        {IJPPE5},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Sep 1 16:06:47 MDT 2010},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=37&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=37&issue=3&spage=250},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Cappello:2009:FSI,
  author =       "Franck Cappello and Thomas Herault and Jack Dongarra",
  title =        "Foreword: Special issue: selected papers from the
                 {14th European PVM\slash MPI Users Group Meeting}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "35",
  number =       "12",
  pages =        "571",
  month =        dec,
  year =         "2009",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2009.11.001",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  MRclass =      "68-06 (68M10 68M12)",
  MRnumber =     "MR2596831",
  bibdate =      "Sat Sep 4 17:11:07 2010",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Held in Paris, September 30--October 3, 2007",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing. Systems \& Applications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Duran:2009:PEO,
  author =       {Alejandro Duran and Roger Ferrer and Eduard
                 Ayguad{\'e} and Rosa M. Badia and Jesus Labarta},
  title =        {A Proposal to Extend the {OpenMP} Tasking Model with
                 Dependent Tasks},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {37},
  number =       {3},
  pages =        {292--305},
  month =        jun,
  year =         {2009},
  CODEN =        {IJPPE5},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Sep 1 16:06:47 MDT 2010},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=37&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=37&issue=3&spage=292},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Dursun:2009:MPM,
  author =       "Hikmet Dursun and Kevin J. Barker and Darren J.
                 Kerbyson and Scott Pakin and Richard Seymour and Rajiv
                 K. Kalia and Aiichiro Nakano and Priya Vashishta",
  title =        "An {MPI} Performance Monitoring Interface for Cell
                 Based Compute Nodes",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "19",
  number =       "4",
  pages =        "535--552",
  month =        dec,
  year =         "2009",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626409000407",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:12 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{ElMaghraoui:2009:MIM,
  author =       "Kaoutar {El Maghraoui} and Travis J. Desell and
                 Boleslaw K. Szymanski and Carlos A. Varela",
  title =        "Malleable iterative {MPI} applications",
  journal =      j-CCPE,
  volume =       "21",
  number =       "3",
  pages =        "393--413",
  day =          "10",
  month =        mar,
  year =         "2009",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1362",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:30 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "1 Sep 2008",
}

@Article{Furlinger:2009:CAE,
  author =       {Karl F{\"u}rlinger and Shirley Moore},
  title =        {Capturing and Analyzing the Execution Control Flow of
                 {OpenMP} Applications},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {37},
  number =       {3},
  pages =        {266--276},
  month =        jun,
  year =         {2009},
  CODEN =        {IJPPE5},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Sep 1 16:06:47 MDT 2010},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=37&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=37&issue=3&spage=266},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Genaud:2009:FMP,
  author =       {St{\'e}phane Genaud and Emmanuel Jeannot and Choopan
                 Rattanapoka},
  title =        {Fault-Management in {P2P-MPI}},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {37},
  number =       {5},
  pages =        {433--461},
  month =        oct,
  year =         {2009},
  CODEN =        {IJPPE5},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Sep 1 16:06:48 MDT 2010},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=37&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=37&issue=5&spage=433},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Giannoutakis:2009:DIP,
  author =       {Konstantinos M. Giannoutakis and George A. Gravvanis},
  title =        {Design and implementation of parallel approximate
                 inverse classes using {OpenMP}},
  journal =      j-CCPE,
  volume =       {21},
  number =       {2},
  pages =        {115--131},
  month =        feb,
  year =         {2009},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.1324},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Mon Dec 5 10:08:30 MST 2011},
  bibsource =    {http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency and Computation: Prac\-tice and
                 Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {6 Jun 2008},
}

@TechReport{Granat:2009:NPQ,
  author =       "Robert Granat and Bo K{\aa}gstr{\"o}m and Daniel
                 Kressner",
  title =        "A novel parallel {QR} algorithm for hybrid distributed
                 memory {HPC} systems",
  type =         "LAPACK Working Note",
  number =       "216",
  institution =  "Department of Computing Science and HPC2N, Ume{\aa}
                 University",
  address =      "S-901 Ume{\aa}, Sweden",
  month =        apr,
  year =         "2009",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn216.pdf",
  abstract =     "A novel variant of the parallel QR algorithm for
                 solving dense nonsymmetric eigenvalue problems on
                 hybrid distributed high performance computing (HPC)
                 systems is presented. For this purpose, we introduce
                 the concept of multi-window bulge chain chasing and
                 parallelize aggressive early deflation. The
                 multi-window approach ensures that most computations
                 when chasing chains of bulges are performed in level 3
                 BLAS operations, while the aim of aggressive early
                 deflation is to speed up the convergence of the QR
                 algorithm. Mixed MPI-OpenMP coding techniques are
                 utilized for porting the codes to distributed memory
                 platforms with multithreaded nodes, such as multicore
                 processors. Numerous numerical experiments confirm the
                 superior performance of our parallel QR algorithm in
                 comparison with the existing ScaLAPACK code, leading to
                 an implementation that is one to two orders of
                 magnitude faster for sufficiently large problems,
                 including a number of examples from applications.",
  acknowledgement = ack-nhfb,
  keywords =     "aggressive early deflation; bulge chasing; Eigenvalue
                 problem; hybrid distributed memory systems; level 3
                 performance; multishift; nonsymmetric QR algorithm;
                 parallel algorithms; parallel computations",
  utknumber =    "UMINF-09.06",
}

@Article{Gravvanis:2009:OBP,
  author =       {George A. Gravvanis},
  title =        {{OpenMP} based parallel normalized direct methods for
                 sparse finite element linear systems},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {47},
  number =       {1},
  pages =        {44--52},
  month =        jan,
  year =         {2009},
  CODEN =        {JOSUED},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Wed Aug 25 08:38:28 MDT 2010},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=47&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=47&issue=1&spage=44},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
}

@Article{Hadjidoukas:2009:HPF,
  author =       {P. E. Hadjidoukas and V. V. Dimakopoulos and M.
                 Delakis and C. Garcia},
  title =        {A high-performance face detection system using
                 {OpenMP}},
  journal =      j-CCPE,
  volume =       {21},
  number =       {15},
  pages =        {1819--1837},
  month =        oct,
  year =         {2009},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.1389},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Mon Dec 5 10:08:38 MST 2011},
  bibsource =    {http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency and Computation: Prac\-tice and
                 Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {26 Mar 2009},
}

@Article{He:2009:AVS,
  author =       "Jian He and Layne T. Watson and Masha Sosonkina",
  title =        "{Algorithm 897}: {VTDIRECT95}: {Serial} and parallel
                 codes for the global optimization algorithm {DIRECT}",
  journal =      j-TOMS,
  volume =       "36",
  number =       "3",
  pages =        "17:1--17:24",
  month =        jul,
  year =         "2009",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1527286.1527291",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Tue Jul 21 14:09:07 MDT 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  note =         "See remark \cite{Sosonkina:2015:RAV}.",
  abstract =     "VTDIRECT95 is a Fortran 95 implementation of D. R.
                 Jones' deterministic global optimization algorithm
                 called {\em DIRECT}, which is widely used in
                 multidisciplinary engineering design, biological
                 science, and physical science applications. The package
                 includes both a serial code and a data-distributed
                 massively parallel code for different problem scales
                 and optimization (exploration vs. exploitation) goals.
                 Dynamic data structures are used to organize local
                 data, handle unpredictable memory requirements, reduce
                 the memory usage, and share the data across multiple
                 processors. The parallel code employs a multilevel
                 functional and data parallelism to boost concurrency
                 and mitigate the data dependency, thus improving the
                 load balancing and scalability. In addition,
                 checkpointing features are integrated into both
                 versions to provide fault tolerance and hot restarts.
                 Important algorithm modifications and design
                 considerations are discussed regarding data structures,
                 parallel schemes, error handling, and portability.
                 Using several benchmark functions and real-world
                 applications, the software is evaluated on different
                 systems in terms of optimization effectiveness, data
                 structure efficiency, parallel performance, and
                 checkpointing overhead. The package organization and
                 usage are also described in detail.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
  keywords =     "checkpointing; data structures; DIRECT; global
                 optimization; parallel schemes",
}

@article{Hilbrich:2009:MCC,
  author          = {Tobias Hilbrich and Matthias S. M{\"u}ller and Bettina
                     Krammer},
  title           = {{MPI} Correctness Checking for {OpenMP\slash MPI}
                     Applications},
  journal         = j-INT-J-PARALLEL-PROG,
  fjournal        = {International Journal of Parallel Programming},
  volume          = {37},
  number          = {3},
  pages           = {277--291},
  month           = jun,
  year            = {2009},
  url             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=37&issue=3&spage=277},
  journal-url     = {http://link.springer.com/journal/10766},
  issn            = {0885-7458 (print), 1573-7640 (electronic)},
  issn-l          = {0885-7458},
  coden           = {IJPPE5},
  bibdate         = {Wed Sep 1 16:06:47 MDT 2010},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=37&issue=3;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Hong:2009:AMG,
  author          = {Sunpyo Hong and Hyesoon Kim},
  title           = {An analytical model for a {GPU} architecture with
                     memory-level and thread-level parallelism awareness},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {37},
  number          = {3},
  pages           = {152--163},
  month           = jun,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1555815.1555775},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J89},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  keywords        = {analytical model; CUDA; GPU architecture; memory level
                     parallelism; performance estimation; warp level
                     parallelism},
  abstract        = {GPU architectures are increasingly important in the
                     multi-core era due to their high number of parallel
                     processors. Programming thousands of massively parallel
                     threads is a big challenge for software engineers, but
                     understanding the performance bottlenecks of those
                     parallel programs on GPU architectures to improve
                     application performance is even more difficult. Current
                     approaches rely on programmers to tune their
                     applications by exploiting the design space
                     exhaustively without fully understanding the
                     performance characteristics of their
                     applications.\par

                     To provide insights into the performance bottlenecks of
                     parallel applications on GPU architectures, we propose
                     a simple analytical model that estimates the execution
                     time of massively parallel programs. The key component
                     of our model is estimating the number of parallel
                     memory requests (we call this the memory warp
                     parallelism) by considering the number of running
                     threads and memory bandwidth. Based on the degree of
                     memory warp parallelism, the model estimates the cost
                     of memory requests, thereby estimating the overall
                     execution time of a program. Comparisons between the
                     outcome of the model and the actual execution time in
                     several GPUs show that the geometric mean of absolute
                     error of our model on micro-benchmarks is 5.4\% and on
                     GPU computing applications is 13.3\%. All the
                     applications are written in the CUDA programming
                     language.},
  bibdate         = {Tue Aug 11 18:12:55 MDT 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Huang:2009:EGO,
  author          = {Lei Huang and Deepak Eachempati and Marcus W. Hervey
                     and Barbara Chapman},
  title           = {Exploiting global optimizations for {OpenMP} programs
                     in the {OpenUH} compiler},
  journal         = j-SIGPLAN,
  fjournal        = {ACM SIG{\-}PLAN Notices},
  volume          = {44},
  number          = {4},
  pages           = {289--290},
  month           = apr,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1504176.1504219},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  coden           = {SINODQ},
  keywords        = {compiler analysis; OpenMP; Parallel SSA},
  abstract        = {The advent of new parallel architectures has increased
                     the need for parallel optimizing compilers to assist
                     developers in creating efficient code. OpenUH is a
                     state-of-the-art optimizing compiler, but it only
                     performs a limited set of optimizations for OpenMP
                     programs due to its conservative assumptions of shared
                     memory programming. These limitations may prevent some
                     OpenMP applications from being fully optimized to the
                     extent of its sequential counterpart. This paper
                     describes our design and implementation of a parallel
                     data flow framework, consisting of a Parallel Control
                     Flow Graph (PCFG) and a Parallel SSA (PSSA)
                     representation in OpenUH, to model data flow for OpenMP
                     programs. This framework enables the OpenUH compiler to
                     perform all classical scalar optimizations for OpenMP
                     programs, in addition to conducting OpenMP specific
                     optimizations.},
  bibdate         = {Fri Oct 9 08:40:49 MDT 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Kainz:2009:RCM,
  author          = {Bernhard Kainz and Markus Grabner and Alexander Bornik
                     and Stefan Hauswiesner and Judith Muehl and Dieter
                     Schmalstieg},
  title           = {Ray casting of multiple volumetric datasets with
                     polyhedral boundaries on manycore {GPUs}},
  journal         = j-TOG,
  fjournal        = {ACM Transactions on Graphics},
  volume          = {28},
  number          = {5},
  pages           = {152:1--152:9},
  articleno       = {152},
  month           = dec,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1618452.1618498},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J778},
  issn            = {0730-0301 (print), 1557-7368 (electronic)},
  issn-l          = {0730-0301},
  coden           = {ATGRDF},
  abstract        = {We present a new GPU-based rendering system for ray
                     casting of multiple volumes. Our approach supports a
                     large number of volumes, complex translucent and
                     concave polyhedral objects as well as CSG intersections
                     of volumes and geometry in any combination. The system
                     (including the rasterization stage) is implemented
                     entirely in CUDA, which allows full control of the
                     memory hierarchy, in particular access to high
                     bandwidth and low latency shared memory. High depth
                     complexity, which is problematic for conventional
                     approaches based on depth peeling, can be handled
                     successfully. As far as we know, our approach is the
                     first framework for multivolume rendering which
                     provides interactive frame rates when concurrently
                     rendering more than 50 arbitrarily overlapping volumes
                     on current graphics hardware.},
  bibdate         = {Mon Mar 15 09:01:55 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tog/;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/tog.bib},
  acknowledgement = ack-nhfb,
}

@article{Klemm:2009:RTM,
  author          = {Michael Klemm and Matthias Bezold and Stefan Gabriel
                     and Ronald Veldema and Michael Philippsen},
  title           = {Reparallelization techniques for migrating {OpenMP}
                     codes in computational grids},
  journal         = j-CCPE,
  fjournal        = {Concurrency and Computation: Prac\-tice and
                     Experience},
  volume          = {21},
  number          = {3},
  pages           = {281--299},
  day             = {10},
  month           = mar,
  year            = {2009},
  onlinedate      = {6 Aug 2008},
  doi             = {https://doi.org/10.1002/cpe.1356},
  journal-url     = {http://www.interscience.wiley.com/jpages/1532-0626},
  issn            = {1532-0626 (print), 1532-0634 (electronic)},
  issn-l          = {1532-0626},
  coden           = {CCPEBO},
  bibdate         = {Mon Dec 5 10:08:30 MST 2011},
  bibsource       = {http://www.interscience.wiley.com/jpages/1532-0626;
                     http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Klimach:2009:PCH,
  author          = {Harald Klimach and Sabine P. Roller},
  title           = {Parallel Coupling of Heterogeneous Domains with
                     {KOP3D} using {PACX-MPI}},
  crossref        = {Tuncer:2009:PCF},
  volume          = {67},
  pages           = {339--345},
  year            = {2009},
  doi             = {https://doi.org/10.1007/978-3-540-92744-0_42},
  url             = {http://link.springer.com/content/pdf/10.1007/978-3-540-92744-0_42},
  book-doi        = {https://doi.org/10.1007/978-3-540-92744-0},
  book-url        = {http://www.springerlink.com/content/978-3-540-92744-0},
  bibdate         = {Sat Dec 22 08:34:16 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Komatitsch:2009:PHO,
  author          = {Dimitri Komatitsch and David Mich{\'e}a and Gordon
                     Erlebacher},
  title           = {Porting a high-order finite-element earthquake
                     modeling application to {NVIDIA} graphics cards using
                     {CUDA}},
  journal         = j-J-PAR-DIST-COMP,
  fjournal        = {Journal of Parallel and Distributed Computing},
  volume          = {69},
  number          = {5},
  pages           = {451--460},
  month           = may,
  year            = {2009},
  journal-url     = {http://www.sciencedirect.com/science/journal/07437315},
  issn            = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l          = {0743-7315},
  coden           = {JPDCER},
  bibdate         = {Wed Sep 1 17:08:39 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sciencedirect.com/science/journal/07437315},
  acknowledgement = ack-nhfb,
}

@inproceedings{Langdon:2009:FHQ,
  author          = {W. B. Langdon},
  title           = {A fast high quality pseudo random number generator for
                     {nVidia CUDA}},
  editor          = {Franz Rothlauf},
  booktitle       = {{GECCO '09 Proceedings of the 11th Annual Conference
                     Companion on Genetic and Evolutionary Computation
                     Conference: Late Breaking Papers}},
  meetingname     = {Proceedings of the 11th annual Conference Companion on
                     Genetic and Evolutionary Computation: July 8--12, 2009,
                     Montreal, Quebec, Canada},
  publisher       = pub-ACM,
  address         = pub-ACM:adr,
  pages           = {2511--2513},
  year            = {2009},
  doi             = {https://doi.org/10.1145/1570256.1570353},
  url             = {http://www.cs.ucl.ac.uk/staff/W.Langdon/ftp/gp-code/random-numbers/cuda_park-miller.tar.gz},
  isbn            = {1-60558-505-X},
  isbn-13         = {978-1-60558-505-5},
  lccn            = {????},
  keywords        = {GGL generator (LCG(16 807, 0, $2^{31} - 1$))},
  bibdate         = {Fri Jan 06 09:34:05 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/prng.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Lee:2009:OGC,
  author          = {Seyong Lee and Seung-Jai Min and Rudolf Eigenmann},
  title           = {{OpenMP} to {GPGPU}: a compiler framework for
                     automatic translation and optimization},
  journal         = j-SIGPLAN,
  fjournal        = {ACM SIG{\-}PLAN Notices},
  volume          = {44},
  number          = {4},
  pages           = {101--110},
  month           = apr,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1504176.1504194},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  coden           = {SINODQ},
  keywords        = {automatic translation; compiler optimization; CUDA;
                     GPU; OpenMP},
  abstract        = {GPGPUs have recently emerged as powerful vehicles for
                     general-purpose high-performance computing. Although a
                     new Compute Unified Device Architecture (CUDA)
                     programming model from NVIDIA offers improved
                     programmability for general computing, programming
                     GPGPUs is still complex and error-prone. This paper
                     presents a compiler framework for automatic
                     source-to-source translation of standard OpenMP
                     applications into CUDA-based GPGPU applications. The
                     goal of this translation is to further improve
                     programmability and make existing OpenMP applications
                     amenable to execution on GPGPUs. In this paper, we have
                     identified several key transformation techniques, which
                     enable efficient GPU global memory access, to achieve
                     high performance. Experimental results from two
                     important kernels (JACOBI and SPMUL) and two NAS OpenMP
                     Parallel Benchmarks (EP and CG) show that the described
                     translator and compile-time optimizations work well on
                     both regular and irregular applications, leading to
                     performance improvements of up to 50X over the
                     unoptimized translation (up to 328X over serial).},
  bibdate         = {Fri Oct 9 08:40:49 MDT 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ma:2009:CRS,
  author =       "Wenjing Ma and Gagan Agrawal",
  title =        "A compiler and runtime system for enabling data mining
                 applications on {GPUs}",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "4",
  pages =        "287--288",
  month =        apr,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1594835.1504218",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:49 MDT 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2000.bib",
  abstract =     "With increasing need for accelerating data mining and
                 scientific data analysis on large data sets, and less
                 chance to improve processor performance by simply
                 increasing clock frequencies, multi-core architectures
                 and accelerators like FPGAs and GPUs have become
                 popular. A recent development in using GPU for general
                 computing has been the release of CUDA (Compute Unified
                 Device Architecture) by NVIDIA. CUDA allows GPU
                 programming with C-language-like features, thus easing
                 the development of non-graphics applications on a GPU.
                 However, several challenges still remain in programming
                 the GPUs with CUDA, because CUDA involves explicit
                 parallel programming and management of its complex
                 memory hierarchy, as well as allocating device memory,
                 moving data between CPU and device memory, and
                 specification of thread grid configurations.\par

                 In this paper, we offer a solution for the programmers
                 to generate CUDA code by specifying the sequential
                 reduction loop(s) with some information about the
                 parameters. With program analysis and code generation,
                 the applications are mapped to a GPU. Several
                 additional optimizations are also performed by the
                 middleware.\par

                 We have evaluated our system using three popular data
                 mining applications, k-means clustering, EM clustering,
                 and Principal Component Analysis (PCA). The speedup
                 that each of these applications achieve over a
                 sequential CPU version ranges between 20 and 50.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "CUDA; data mining; GPGPU",
}

@Article{Marowka:2009:BCT,
  author =       "Ami Marowka",
  title =        "{BSP2OMP}: a Compiler For Translating {BSP} Programs
                 To {OpenMP}",
  journal =      j-INT-J-PAR-EMER-DIST-SYS,
  volume =       "24",
  number =       "4",
  pages =        "293--310",
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1744-5760 (print), 1744-5779 (electronic)",
  ISSN-L =       "1744-5760",
  bibdate =      "Thu Sep 2 08:12:37 MDT 2010",
  bibsource =    "http://www.informaworld.com/smpp/title~content=t713729127~link=cover;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel, Emergent and
                 Distributed Systems",
  journal-URL =  "http://www.tandfonline.com/loi/gpaa20",
  keywords =     "BSP; BSP2OMP; EPCC; multicore; OpenMP",
}

@article{Miguel-Alonso:2009:INS,
  author          = {J. Miguel-Alonso and J. Navaridas and F. J. Ridruejo},
  title           = {Interconnection Network Simulation Using Traces of
                     {MPI} Applications},
  journal         = j-INT-J-PARALLEL-PROG,
  fjournal        = {International Journal of Parallel Programming},
  volume          = {37},
  number          = {2},
  pages           = {153--174},
  month           = apr,
  year            = {2009},
  url             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=37&issue=2&spage=153},
  journal-url     = {http://link.springer.com/journal/10766},
  issn            = {0885-7458 (print), 1573-7640 (electronic)},
  issn-l          = {0885-7458},
  coden           = {IJPPE5},
  bibdate         = {Wed Sep 1 16:06:47 MDT 2010},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=37&issue=2;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Ozgun:2009:PCB,
  author          = {Ozlem Ozgun and Raj Mittra and Mustafa
                     Kuzuo{\u{g}}lu},
  title           = {Parallelized Characteristic Basis Finite Element
                     Method ({CBFEM--MPI}) --- a non-iterative domain
                     decomposition algorithm for electromagnetic scattering
                     problems},
  journal         = j-J-COMPUT-PHYS,
  fjournal        = {Journal of Computational Physics},
  volume          = {228},
  number          = {6},
  pages           = {2225--2238},
  day             = {1},
  month           = apr,
  year            = {2009},
  doi             = {https://doi.org/10.1016/j.jcp.2008.12.002},
  url             = {http://www.sciencedirect.com/science/article/pii/S0021999108006293},
  journal-url     = {http://www.sciencedirect.com/science/journal/00219991},
  issn            = {0021-9991 (print), 1090-2716 (electronic)},
  issn-l          = {0021-9991},
  coden           = {JCTPAH},
  bibdate         = {Mon Jan 2 22:14:07 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jcomputphys2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Rashti:2009:SAM,
  author          = {Mohammad J. Rashti and Ahmad Afsahi},
  title           = {A Speculative and Adaptive {MPI} Rendezvous Protocol
                     Over {RDMA}-enabled Interconnects},
  journal         = j-INT-J-PARALLEL-PROG,
  fjournal        = {International Journal of Parallel Programming},
  volume          = {37},
  number          = {2},
  pages           = {223--246},
  month           = apr,
  year            = {2009},
  url             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=37&issue=2&spage=223},
  journal-url     = {http://link.springer.com/journal/10766},
  issn            = {0885-7458 (print), 1573-7640 (electronic)},
  issn-l          = {0885-7458},
  coden           = {IJPPE5},
  bibdate         = {Wed Sep 1 16:06:47 MDT 2010},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=37&issue=2;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Schneider:2009:CPM,
  author          = {Scott Schneider and Jae-Seung Yeom and Benjamin Rose
                     and John C. Linford and Adrian Sandu and Dimitrios S.
                     Nikolopoulos},
  title           = {A comparison of programming models for multiprocessors
                     with explicitly managed memory hierarchies},
  journal         = j-SIGPLAN,
  fjournal        = {ACM SIG{\-}PLAN Notices},
  volume          = {44},
  number          = {4},
  pages           = {131--140},
  month           = apr,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1594835.1504197},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  coden           = {SINODQ},
  keywords        = {cell be; explicitly managed memory hierarchies;
                     programming models},
  abstract        = {On multiprocessors with explicitly managed memory
                     hierarchies (EMM), software has the responsibility of
                     moving data in and out of fast local memories. This
                     task can be complex and error-prone even for expert
                     programmers. Before we can allow compilers to handle
                     this complexity for us, we must identify the
                     abstractions that are general enough to allow us to
                     write applications with reasonable effort, yet specific
                     enough to exploit the vast on-chip memory bandwidth of
                     EMM multi-processors. To this end, we compare two
                     programming models against hand-tuned codes on the STI
                     Cell, paying attention to programmability and
                     performance. The first programming model, Sequoia,
                     abstracts the memory hierarchy as private address
                     spaces, each corresponding to a parallel task. The
                     second, Cellgen, is a new framework which provides
                     OpenMP-like semantics and the abstraction of a shared
                     address space divided into private and shared data. We
                     compare three applications programmed using these
                     models against their hand-optimized counterparts in
                     terms of abstractions, programming complexity, and
                     performance.},
  bibdate         = {Fri Oct 9 08:40:49 MDT 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Schwarz:2009:GFG,
  author          = {Michael Schwarz and Marc Stamminger},
  title           = {{GPU}: Fast {GPU}-based Adaptive Tessellation with
                     {CUDA}},
  journal         = j-CGF,
  fjournal        = {Com{\-}pu{\-}ter Graphics Forum},
  volume          = {28},
  number          = {2},
  pages           = {365--374},
  month           = apr,
  year            = {2009},
  onlinedate      = {27 Mar 2009},
  doi             = {https://doi.org/10.1111/j.1467-8659.2009.01376.x},
  journal-url     = {http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1467-8659/},
  issn            = {0167-7055 (print), 1467-8659 (electronic)},
  issn-l          = {0167-7055},
  coden           = {CGFODY},
  bibdate         = {Sat May 11 13:27:16 MDT 2013},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/cgf.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Tabakin:2009:QPE,
  author          = {Frank Tabakin and Bruno Juli{\'a}-D{\'\i}az},
  title           = {{QCMPI}: a parallel environment for quantum
                     computing},
  journal         = j-COMP-PHYS-COMM,
  fjournal        = {Computer Physics Communications},
  volume          = {180},
  number          = {6},
  pages           = {948--964},
  month           = jun,
  year            = {2009},
  doi             = {https://doi.org/10.1016/j.cpc.2008.11.021},
  url             = {http://www.sciencedirect.com/science/article/pii/S0010465508004141},
  journal-url     = {http://www.sciencedirect.com/science/journal/00104655},
  issn            = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l          = {0010-4655},
  coden           = {CPHCBZ},
  bibdate         = {Mon Feb 13 23:42:42 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Tallent:2009:EPM,
  author          = {Nathan R. Tallent and John M. Mellor-Crummey},
  title           = {Effective performance measurement and analysis of
                     multithreaded applications},
  journal         = j-SIGPLAN,
  fjournal        = {ACM SIG{\-}PLAN Notices},
  volume          = {44},
  number          = {4},
  pages           = {229--240},
  month           = apr,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1504176.1504210},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  coden           = {SINODQ},
  keywords        = {call path profiling; hpctoolkit; multithreaded
                     programming models; performance analysis},
  abstract        = {Understanding why the performance of a multithreaded
                     program does not improve linearly with the number of
                     cores in a shared-memory node populated with one or
                     more multicore processors is a problem of growing
                     practical importance. This paper makes three
                     contributions to performance analysis of multithreaded
                     programs. First, we describe how to measure and
                     attribute {\em parallel idleness}, namely, where
                     threads are stalled and unable to work. This technique
                     applies broadly to programming models ranging from
                     explicit threading ({\em e.g.}, Pthreads) to
                     higher-level models such as Cilk and OpenMP. Second, we
                     describe how to measure and attribute {\em parallel
                     overhead\/} -- when a thread is performing
                     miscellaneous work other than executing the user's
                     computation. By employing a combination of compiler
                     support and post-mortem analysis, we incur no
                     measurement cost beyond normal profiling to glean this
                     information. Using {\em idleness\/} and {\em
                     overhead\/} metrics enables one to pinpoint areas of an
                     application where concurrency should be increased (to
                     reduce idleness), decreased (to reduce overhead), or
                     where the present parallelization is hopeless (where
                     idleness and overhead are both high). Third, we
                     describe how to measure and attribute arbitrary
                     performance metrics for high-level multithreaded
                     programming models, such as Cilk. This requires
                     bridging the gap between the expression of logical
                     concurrency in programs and its realization at run-time
                     as it is adaptively partitioned and scheduled onto a
                     pool of threads. We have prototyped these ideas in the
                     context of Rice University's HPCToolkit performance
                     tools. We describe our approach, implementation, and
                     experiences applying this approach to measure and
                     attribute work, idleness, and overhead in executions of
                     Cilk programs.},
  bibdate         = {Fri Oct 9 08:40:49 MDT 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Thakur:2009:TSE,
  author          = {Rajeev Thakur and William Gropp},
  title           = {Test suite for evaluating performance of multithreaded
                     {MPI} communication},
  journal         = j-PARALLEL-COMPUTING,
  fjournal        = {Parallel Computing},
  volume          = {35},
  number          = {12},
  pages           = {608--617},
  month           = dec,
  year            = {2009},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  coden           = {PACOEJ},
  bibdate         = {Thu Sep 2 17:51:11 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Tournavitis:2009:THA,
  author          = {Georgios Tournavitis and Zheng Wang and Bj{\"o}rn
                     Franke and Michael F. P. O'Boyle},
  title           = {Towards a holistic approach to auto-parallelization:
                     integrating profile-driven parallelism detection and
                     machine-learning based mapping},
  journal         = j-SIGPLAN,
  fjournal        = {ACM SIG{\-}PLAN Notices},
  volume          = {44},
  number          = {6},
  pages           = {177--187},
  month           = jun,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1542476.1542496},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  coden           = {SINODQ},
  keywords        = {auto-parallelization; machine-learning based
                     parallelism mapping; OpenMP; profile-driven parallelism
                     detection},
  abstract        = {Compiler-based auto-parallelization is a much studied
                     area, yet has still not found wide-spread application.
                     This is largely due to the poor exploitation of
                     application parallelism, subsequently resulting in
                     performance levels far below those which a skilled
                     expert programmer could achieve. We have identified two
                     weaknesses in traditional parallelizing compilers and
                     propose a novel, integrated approach, resulting in
                     significant performance improvements of the generated
                     parallel code. Using profile-driven parallelism
                     detection we overcome the limitations of static
                     analysis, enabling us to identify more application
                     parallelism and only rely on the user for final
                     approval. In addition, we replace the traditional
                     target-specific and inflexible mapping heuristics with
                     a machine-learning based prediction mechanism,
                     resulting in better mapping decisions while providing
                     more scope for adaptation to different target
                     architectures. We have evaluated our parallelization
                     strategy against the NAS and SPEC OMP benchmarks and
                     two different multi-core platforms (dual quad-core
                     Intel Xeon SMP and dual-socket QS20 Cell blade). We
                     demonstrate that our approach not only yields
                     significant improvements when compared with
                     state-of-the-art parallelizing compilers, but comes
                     close to and sometimes exceeds the performance of
                     manually parallelized codes. On average, our
                     methodology achieves 96\% of the performance of the
                     hand-tuned OpenMP NAS and SPEC parallel benchmarks on
                     the Intel Xeon platform and gains a significant speedup
                     for the IBM Cell platform, demonstrating the potential
                     of profile-guided and machine-learning based
                     parallelization for complex multi-core platforms.},
  bibdate         = {Tue Jun 16 14:41:16 MDT 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Udupa:2009:SES,
  author =       {Abhishek Udupa and R. Govindarajan and Matthew J.
                 Thazhuthaveetil},
  title =        {Synergistic execution of stream programs on multicores
                 with accelerators},
  journal =      j-SIGPLAN,
  volume =       {44},
  number =       {7},
  pages =        {99--108},
  month =        jul,
  year =         {2009},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/1542452.1542466},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Fri Jun 26 12:07:39 MDT 2009},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2000.bib},
  abstract =     {The StreamIt programming model has been proposed to
                 exploit parallelism in streaming applications on
                 general purpose multicore architectures. The StreamIt
                 graphs describe task, data and pipeline parallelism
                 which can be exploited on accelerators such as Graphics
                 Processing Units (GPUs) or CellBE which support
                 abundant parallelism in hardware.\par

                 In this paper, we describe a novel method to
                 orchestrate the execution of a StreamIt program on a
                 multicore platform equipped with an accelerator. The
                 proposed approach identifies, using profiling, the
                 relative benefits of executing a task on the
                 superscalar CPU cores and the accelerator. We formulate
                 the problem of partitioning the work between the CPU
                 cores and the GPU, taking into account the latencies
                 for data transfers and the required buffer layout
                 transformations associated with the partitioning, as an
                 integrated Integer Linear Program (ILP) which can then
                 be solved by an ILP solver. We also propose an
                 efficient heuristic algorithm for the work partitioning
                 between the CPU and the GPU, which provides solutions
                 which are within 9.05\% of the optimal solution on an
                 average across the benchmark suite. The partitioned
                 tasks are then software pipelined to execute on the
                 multiple CPU cores and the Streaming Multiprocessors
                 (SMs) of the GPU. The software pipelining algorithm
                 orchestrates the execution between CPU cores and the
                 GPU by emitting the code for the CPU and the GPU, and
                 the code for the required data transfers. Our
                 experiments on a platform with 8 CPU cores and a
                 GeForce 8800 GTS 512 GPU show a geometric mean speedup
                 of 6.84X with a maximum of 51.96X over a single
                 threaded CPU execution across the StreamIt benchmarks.
                 This is a 18.9\% improvement over a partitioning
                 strategy that maps only the filters that cannot be
                 executed on the GPU -- the filters with state that is
                 persistent across firings -- onto the CPU.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  keywords =     {CUDA; GPU programming; partitioning; software
                 pipelining; stream programming},
}

@Article{Vo:2009:FVP,
  author =       "Anh Vo and Sarvani Vakkalanka and Michael DeLisi and
                 Ganesh Gopalakrishnan and Robert M. Kirby and Rajeev
                 Thakur",
  title =        "Formal verification of practical {MPI} programs",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "4",
  pages =        "261--270",
  month =        apr,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1594835.1504214",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:49 MDT 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "This paper considers the problem of formal
                 verification of MPI programs operating under a fixed
                 test harness for safety properties without building
                 verification models. In our approach, we directly
                 model-check the MPI/C source code, executing its
                 interleavings with the help of a verification
                 scheduler. Unfortunately, the total feasible number of
                 interleavings is exponential, and impractical to
                 examine even for our modest goals. Our earlier
                 publications formalized and implemented a partial order
                 reduction approach that avoided exploring equivalent
                 interleavings, and presented a verification tool called
                 ISP. This paper presents algorithmic and engineering
                 innovations to ISP, including the use of OpenMP
                 parallelization, that now enables it to handle
                 practical MPI programs, including: (i) ParMETIS --- a
                 widely used hypergraph partitioner, and (ii) MADRE ---
                 a Memory Aware Data Re-distribution Engine, both
                 developed outside our group. Over these benchmarks, ISP
                 has automatically verified up to 14K lines of MPI/C
                 code, producing error traces of deadlocks and assertion
                 violations within seconds.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "distributed programming; dynamic partial order
                 reduction; message passing interface; model checking;
                 MPI",
}

@Article{Walters:2009:RBF,
  author =       {John Paul Walters and Vipin Chaudhary},
  title =        {Replication-Based Fault Tolerance for {MPI}
                 Applications},
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       {20},
  number =       {7},
  pages =        {997--1010},
  month =        jul,
  year =         {2009},
  CODEN =        {ITDSEO},
  DOI =          {https://doi.org/10.1109/TPDS.2008.172},
  ISSN =         {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L =       {1045-9219},
  bibdate =      {Thu May 13 12:06:56 2010},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Transactions on Parallel and Distributed
                 Systems},
  journal-URL =  {http://www.computer.org/tpds/archives.htm},
}

@Article{Wang:2009:MPM,
  author =       "Zheng Wang and Michael F. P. O'Boyle",
  title =        "Mapping parallelism to multi-cores: a machine learning
                 based approach",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "4",
  pages =        "75--84",
  month =        apr,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1504176.1504189",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:49 MDT 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "The efficient mapping of program parallelism to
                 multi-core processors is highly dependent on the
                 underlying architecture. This paper proposes a portable
                 and automatic compiler-based approach to mapping such
                 parallelism using machine learning. It develops two
                 predictors: a data sensitive and a data insensitive
                 predictor to select the best mapping for parallel
                 programs. They predict the number of threads and the
                 scheduling policy for any given program using a model
                 learnt off-line. By using low-cost profiling runs, they
                 predict the mapping for a new unseen program across
                 multiple input data sets. We evaluate our approach by
                 selecting parallelism mapping configurations for OpenMP
                 programs on two representative but different multi-core
                 platforms (the Intel Xeon and the Cell processors).
                 Performance of our technique is stable across programs
                 and architectures. On average, it delivers above 96\%
                 performance of the maximum available on both platforms.
                 It achieves, on average, a 37\% (up to 17.5 {\em
                 times\/}) performance improvement over the OpenMP
                 runtime default scheme on the Cell platform. Compared
                 to two recent prediction models, our predictors achieve
                 better performance with a significantly lower profiling
                 cost.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "artificial neural networks; compiler optimization;
                 machine learning; performance modeling; support vector
                 machine",
}

@Article{Xue:2009:MSR,
  author =       {Ruini Xue and Xuezheng Liu and Ming Wu and Zhenyu Guo
                 and Wenguang Chen and Weimin Zheng and Zheng Zhang and
                 Geoffrey Voelker},
  title =        {{MPIWiz}: subgroup reproducible replay of {MPI}
                 applications},
  journal =      j-SIGPLAN,
  volume =       {44},
  number =       {4},
  pages =        {251--260},
  month =        apr,
  year =         {2009},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/1504176.1504213},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Fri Oct 9 08:40:49 MDT 2009},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {Message Passing Interface (MPI) is a widely used
                 standard for managing coarse-grained concurrency on
                 distributed computers. Debugging parallel MPI
                 applications, however, has always been a particularly
                 challenging task due to their high degree of concurrent
                 execution and non-deterministic behavior. Deterministic
                 replay is a potentially powerful technique for
                 addressing these challenges, with existing MPI replay
                 tools adopting either data-replay or order-replay
                 approaches. Unfortunately, each approach has its
                 tradeoffs. Data-replay generates substantial log sizes
                 by recording every communication message. Order-replay
                 generates small logs, but requires all processes to be
                 replayed together. We believe that these drawbacks are
                 the primary reasons that inhibit the wide adoption of
                 deterministic replay as the critical enabler of cyclic
                 debugging of MPI applications.\par

                 This paper describes {\em subgroup reproducible
                 replay\/} (SRR), a hybrid deterministic replay method
                 that provides the benefits of both data-replay and
                 order-replay while balancing their trade-offs. SRR
                 divides all processes into disjoint groups. It records
                 the contents of messages crossing group boundaries as
                 in data-replay, but records just message orderings for
                 communication within a group as in order-replay. In
                 this way, SRR can exploit the communication locality of
                 traffic patterns in MPI applications. During replay,
                 developers can then replay each group individually. SRR
                 reduces recording overhead by not recording intra-group
                 communication, and reduces replay overhead by limiting
                 the size of each replay group. Exposing these tradeoffs
                 gives the user the necessary control for making
                 deterministic replay practical for MPI
                 applications.\par

                 We have implemented a prototype, MPIWiz, to demonstrate
                 and evaluate SRR. MPIWiz employs a replay framework
                 that allows transparent binary instrumentation of both
                 library and system calls. As a result, MPIWiz replays
                 MPI applications with no source code modification and
                 relinking, and handles non-determinism in both MPI and
                 OS system calls. Our preliminary results show that
                 MPIWiz can reduce recording overhead by over a factor
                 of four relative to data-replay, yet without requiring
                 the entire application to be replayed as in
                 order-replay. Recording increases execution time by
                 27\% while the application can be replayed in just 53\%
                 of its base execution time.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIG{\-}PLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  keywords =     {distributed debugging; message passing interface;
                 non-determinism; record and replay},
}

@Article{Yang:2009:DBM,
  author =       {Chao-Tung Yang and Kuan-Chou Lai},
  title =        {A directive-based {MPI} code generator for {Linux PC}
                 clusters},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {50},
  number =       {2},
  pages =        {177--207},
  month =        nov,
  year =         {2009},
  CODEN =        {JOSUED},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Wed Aug 25 08:38:43 MDT 2010},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=50&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=50&issue=2&spage=177},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
}

@InProceedings{Yilmaz:2009:HPC,
  author =       {E. Yilmaz and R. U. Payli and H. U. Akay and A. Ecer},
  title =        {Hybrid Parallelism for {CFD} Simulations: Combining
                 {MPI} with {OpenMP}},
  crossref =     {Tuncer:2009:PCF},
  volume =       {67},
  pages =        {401--408},
  year =         {2009},
  DOI =          {https://doi.org/10.1007/978-3-540-92744-0_50},
  bibdate =      {Sat Dec 22 08:34:16 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer.com/content/pdf/10.1007/978-3-540-92744-0_50},
  acknowledgement = ack-nhfb,
  book-DOI =     {https://doi.org/10.1007/978-3-540-92744-0},
  book-URL =     {http://www.springerlink.com/content/978-3-540-92744-0},
}

@Article{Ayguade:2010:EOS,
  author =       "Eduard Ayguad{\'e} and Rosa M. Badia and Pieter
                 Bellens and Daniel Cabrera and Alejandro Duran and
                 Roger Ferrer and Marc Gonz{\'a}lez and Francisco Igual
                 and Daniel Jim{\'e}nez-Gonz{\'a}lez and Jes{\'u}s
                 Labarta and Luis Martinell and Xavier Martorell and
                 Rafael Mayo and Josep M. P{\'e}rez and Judit Planas and
                 Enrique S. Quintana-Ort{\'\i}",
  title =        "Extending {OpenMP} to Survive the Heterogeneous
                 Multi-Core Era",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "38",
  number =       "5--6",
  pages =        "440--459",
  month =        oct,
  year =         "2010",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Sep 1 16:06:49 MDT 2010",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=38&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=38&issue=5&spage=440",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Baghsorkhi:2010:APM,
  author =       "Sara S. Baghsorkhi and Matthieu Delahaye and Sanjay J.
                 Patel and William D. Gropp and Wen-mei W. Hwu",
  title =        "An adaptive performance modeling tool for {GPU}
                 architectures",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "105--114",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693470",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "This paper presents an analytical model to predict the
                 performance of general-purpose applications on a GPU
                 architecture. The model is designed to provide
                 performance information to an auto-tuning compiler and
                 assist it in narrowing down the search to the more
                 promising implementations. It can also be incorporated
                 into a tool to help programmers better assess the
                 performance bottlenecks in their code. We analyze each
                 GPU kernel and identify how the kernel exercises major
                 GPU microarchitecture features. To identify the
                 performance bottlenecks accurately, we introduce an
                 abstract interpretation of a GPU kernel, {\em work flow
                 graph}, based on which we estimate the execution time
                 of a GPU kernel. We validated our performance model on
                 the NVIDIA GPUs using CUDA (Compute Unified Device
                 Architecture). For this purpose, we used data parallel
                 benchmarks that stress different GPU microarchitecture
                 events such as uncoalesced memory accesses, scratch-pad
                 memory bank conflicts, and control flow divergence,
                 which must be accurately modeled but represent
                 challenges to the analytical performance models. The
                 proposed model captures full system complexity and
                 shows high accuracy in predicting the performance
                 trends of different optimized kernel implementations.
                 We also describe our approach to extracting the
                 performance model automatically from a kernel code.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "analytical model; GPU; parallel programming;
                 performance estimation",
}

@Article{Balaji:2010:FGM,
  author =       "Pavan Balaji and Darius Buntinas and David Goodell and
                 William Gropp and Rajeev Thakur",
  title =        "Fine-Grained Multithreading Support for Hybrid
                 Threaded {MPI} Programming",
  journal =      j-IJHPCA,
  volume =       "24",
  number =       "1",
  pages =        "49--57",
  month =        feb,
  year =         "2010",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342009360206",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/24/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/24/1/49.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Balaji:2010:IND,
  author =       "Pavan Balaji and Anthony Chan and William Gropp and
                 Rajeev Thakur and Ewing Lusk",
  title =        "The Importance of Non-Data-Communication Overheads in
                 {MPI}",
  journal =      j-IJHPCA,
  volume =       "24",
  number =       "1",
  pages =        "5--15",
  month =        feb,
  year =         "2010",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342009359258",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/24/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/24/1/5.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Blas:2010:IEF,
  author =       "Javier Garcia Blas and Florin Isaila and Jesus
                 Carretero and David Singh and Felix
                 Garcia-Carballeira",
  title =        "Implementation and Evaluation of File Write-Back and
                 Prefetching for {MPI-IO} Over {GPFS}",
  journal =      j-IJHPCA,
  volume =       "24",
  number =       "1",
  pages =        "78--92",
  month =        feb,
  year =         "2010",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342009359015",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/24/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/24/1/78.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Brightwell:2010:EDA,
  author =       "Ron Brightwell",
  title =        "Exploiting Direct Access Shared Memory for {MPI} on
                 Multi-Core Processors",
  journal =      j-IJHPCA,
  volume =       "24",
  number =       "1",
  pages =        "69--77",
  month =        feb,
  year =         "2010",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342009359014",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/24/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/24/1/69.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Broquedis:2010:FEO,
  author =       {Fran{\c{c}}ois Broquedis and Nathalie Furmento and
                 Brice Goglin and Pierre-Andr{\'e} Wacrenier and Raymond
                 Namyst},
  title =        {{ForestGOMP}: An Efficient {OpenMP} Environment for
                 {NUMA} Architectures},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {38},
  number =       {5--6},
  pages =        {418--439},
  month =        oct,
  year =         {2010},
  CODEN =        {IJPPE5},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Sep 1 16:06:49 MDT 2010},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=38&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=38&issue=5&spage=418},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Bull:2010:PEM,
  author =       {J. Mark Bull and James Enright and Xu Guo and Chris
                 Maynard and Fiona Reid},
  title =        {Performance Evaluation of Mixed-Mode {OpenMP\slash
                 MPI} Implementations},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {38},
  number =       {5--6},
  pages =        {396--417},
  month =        oct,
  year =         {2010},
  CODEN =        {IJPPE5},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Sep 1 16:06:49 MDT 2010},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=38&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=38&issue=5&spage=396},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Campanoni:2010:HFP,
  author =       {Simone Campanoni and Giovanni Agosta and Stefano
                 Crespi Reghizzi and Andrea Di Biagio},
  title =        {A highly flexible, parallel virtual machine: design
                 and experience of {ILDJIT}},
  journal =      j-SPE,
  volume =       {40},
  number =       {2},
  pages =        {177--207},
  day =          {??},
  month =        feb,
  year =         {2010},
  CODEN =        {SPEXBL},
  DOI =          {https://doi.org/10.1002/spe.950},
  ISSN =         {0038-0644 (print), 1097-024X (electronic)},
  ISSN-L =       {0038-0644},
  bibdate =      {Wed Mar 17 10:16:22 MDT 2010},
  bibsource =    {http://www.interscience.wiley.com/jpages/0038-0644;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html},
  acknowledgement = ack-nhfb,
  fjournal =     {Soft{\-}ware\emdash Prac{\-}tice and Experience},
  journal-URL =  {http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X},
  onlinedate =   {Jan 14 2010 4:49AM},
}

@Article{Cardoso:2010:MSO,
  author =       {M. C. Cardoso and F. M. Costa},
  title =        {{MPI} support on opportunistic grids based on the
                 {InteGrade} middleware},
  journal =      j-CCPE,
  volume =       {22},
  number =       {3},
  pages =        {343--357},
  day =          {10},
  month =        mar,
  year =         {2010},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.1479},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Mon Dec 5 10:08:41 MST 2011},
  bibsource =    {http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency and Computation: Prac\-tice and
                 Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {21 Sep 2009},
}

@Article{Carter:2010:PLN,
  author =       "John D. Carter and William B. Gardner and Gary
                 Grewal",
  title =        "The {Pilot} library for novice {MPI} programmers",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "351--352",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1837853.1693509",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "The Pilot library is a new method for programming
                 MPI-enabled clusters in C, targeted at novice parallel
                 programmers. Formal elements from Communicating
                 Sequential Processes (CSP) are used to realize a
                 process/channel model of parallel computation that
                 reduces opportunities for deadlock and other
                 communication errors. This simple model, plus an
                 application programming interface (API) styled after
                 C's formatted I/O, are designed to make the library
                 easy to learn. The Pilot library exists as a thin layer
                 on top of any standard Message Passing Interface (MPI)
                 implementation, preserving MPI's portability and
                 efficiency, with little performance overhead arising as
                 result of Pilot's additional features.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "C; cluster programming; collective operations;
                 deadlock detection; high-performance computing; MPI",
}

@Article{Casas:2010:APD,
  author =       "Marc Casas and Rosa M. Badia and Jes{\'u}s Labarta",
  title =        "Automatic Phase Detection and Structure Extraction of
                 {MPI} Applications",
  journal =      j-IJHPCA,
  volume =       "24",
  number =       "3",
  pages =        "335--360",
  month =        aug,
  year =         "2010",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342009360039",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:46 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/24/3.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/24/3/335.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Cheng:2010:BRBb,
  author =       "Jie Cheng",
  title =        "Book Review: {{\booktitle{CUDA by Example: An
                 Introduction to General-Purpose GPU Programming}}, by
                 Jason Sanders and Edward Kandrot, ISBN-13
                 978-0-13-138768-3}",
  journal =      j-SCPE,
  volume =       "11",
  number =       "4",
  pages =        "401--401",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "1895-1767",
  ISSN-L =       "1895-1767",
  bibdate =      "Sat Nov 10 09:03:30 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/scpe.bib;
                 http://www.scpe.org/index.php/scpe/issue/view/91",
  note =         "See \cite{Sanders:2010:CEI}.",
  URL =          "http://www.scpe.org/index.php/scpe/article/view/663",
  acknowledgement = ack-nhfb,
  fjournal =     "Scalable Computing: Practice and Experience",
  remark =       "Special Issue: Network Management in Distributed
                 Systems.",
}

@Article{Cho:2010:OPP,
  author =       {S. M. Cho and D. W. Im and O. Y. Jang and H. J. Song
                 and B. D. Paulovicks and V. Sheinin and H. Yeo},
  title =        {{OpenCL} and parallel primitives for digital {TV}
                 applications},
  journal =      j-IBM-JRD,
  volume =       {54},
  number =       {5},
  pages =        {7:1--7:14},
  month =        {????},
  year =         {2010},
  CODEN =        {IBMJAE},
  DOI =          {https://doi.org/10.1147/JRD.2010.2062050},
  ISSN =         {0018-8646 (print), 2151-8556 (electronic)},
  ISSN-L =       {0018-8646},
  bibdate =      {Sun Feb 20 14:29:19 MST 2011},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ibmjrd.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.research.ibm.com/journal/},
  acknowledgement = ack-nhfb,
  fjournal =     {IBM Journal of Research and Development},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520},
}

@Article{Chou:2010:CMI,
  author =       "Yu-Cheng Chou and Stephen S. Nestinger and Harry H.
                 Cheng",
  title =        "{Ch MPI}: Interpretive Parallel Computing in {C}",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "12",
  number =       "2",
  pages =        "54--67",
  month =        mar # "\slash " # apr,
  year =         "2010",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2010.36",
  ISSN =         "1521-9615 (print), 1558-366X (electronic)",
  ISSN-L =       "1521-9615",
  bibdate =      "Thu May 13 11:08:14 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

@article{Dickens:2010:HPI,
  author          = {Phillip M. Dickens and Jeremy Logan},
  title           = {A high performance implementation of {MPI-IO} for a
                     {Lustre} file system environment},
  journal         = j-CCPE,
  volume          = {22},
  number          = {11},
  pages           = {1433--1449},
  day             = {10},
  month           = aug,
  year            = {2010},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.1491},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Mon Dec 5 10:08:46 MST 2011},
  bibsource       = {http://www.interscience.wiley.com/jpages/1532-0626;
                     http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency and Computation: Prac\-tice and
                     Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {8 Sep 2009},
}

@techreport{Du:2010:COT,
  author          = {Peng Du and Rick Weber and Piotr Luszczek and
                     Stanimire Tomov and Gregory Peterson and Jack
                     Dongarra},
  title           = {From {CUDA} to {OpenCL}: Towards a
                     Performance-portable Solution for Multi-platform {GPU}
                     Programming},
  type            = {LAPACK Working Note},
  number          = {228},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {6},
  month           = sep,
  year            = {2010},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note            = {UT-CS-10-656.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn228.pdf},
  acknowledgement = ack-nhfb,
}

@Article{FerreiradaSilva:2010:PBC,
  author =       "Adelino {Ferreira da Silva}",
  title =        "\pkg{cudaBayesreg}: {Bayesian} Computation in {CUDA}",
  journal =      j-R-JOURNAL,
  volume =       "2",
  number =       "2",
  pages =        "48--55",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2073-4859",
  ISSN-L =       "2073-4859",
  bibdate =      "Thu Aug 13 15:54:57 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/rjournal.bib",
  URL =          "http://journal.r-project.org/archive/2010-2/RJournal_2010-2_Ferreira~da-Silva.pdf",
  acknowledgement = ack-r-project,
  fjournal =     "The R Journal",
  journal-URL =  "http://journal.r-project.org/",
}

@article{Gelado:2010:ADS,
  author          = {Isaac Gelado and Javier Cabezas and Nacho Navarro and
                     John E. Stone and Sanjay Patel and Wen-mei W. Hwu},
  title           = {An asymmetric distributed shared memory model for
                     heterogeneous parallel systems},
  journal         = j-SIGPLAN,
  volume          = {45},
  number          = {3},
  pages           = {347--358},
  month           = mar,
  year            = {2010},
  CODEN           = {SINODQ},
  DOI             = {https://doi.org/10.1145/1735970.1736059},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  bibdate         = {Wed Mar 17 13:46:56 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/linux.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/unix.bib},
  abstract        = {Heterogeneous computing combines general purpose CPUs
                     with accelerators to efficiently execute both
                     sequential control-intensive and data-parallel phases
                     of applications. Existing programming models for
                     heterogeneous computing rely on programmers to
                     explicitly manage data transfers between the CPU system
                     memory and accelerator memory.\par

                     This paper presents a new programming model for
                     heterogeneous computing, called Asymmetric Distributed
                     Shared Memory (ADSM), that maintains a shared logical
                     memory space for CPUs to access objects in the
                     accelerator physical memory but not vice versa. The
                     asymmetry allows light-weight implementations that
                     avoid common pitfalls of symmetrical distributed shared
                     memory systems. ADSM allows programmers to assign data
                     objects to performance critical methods. When a method
                     is selected for accelerator execution, its associated
                     data objects are allocated within the shared logical
                     memory space, which is hosted in the accelerator
                     physical memory and transparently accessible by the
                     methods executed on CPUs.\par

                     We argue that ADSM reduces programming efforts for
                     heterogeneous computing systems and enhances
                     application portability. We present a software
                     implementation of ADSM, called GMAC, on top of CUDA in
                     a GNU/Linux environment. We show that applications
                     written in ADSM and running on top of GMAC achieve
                     performance comparable to their counterparts using
                     programmer-managed data transfers. This paper presents
                     the GMAC system and evaluates different design choices.
                     We further suggest additional architectural support
                     that will likely allow GMAC to achieve higher
                     application performance than the current CUDA model.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  keywords        = {asymmetric distributed shared memory; data-centric
                     programming models; heterogeneous systems},
}

@Article{Granat:2010:PSS,
  author =       "Robert Granat and Bo K{\aa}gstr{\"o}m",
  title =        "Parallel Solvers for {Sylvester}-Type Matrix Equations
                 with Applications in Condition Estimation, {Part I}:
                 Theory and Algorithms",
  journal =      j-TOMS,
  volume =       "37",
  number =       "3",
  pages =        "32:1--32:32",
  month =        sep,
  year =         "2010",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1824801.1824810",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon Sep 27 10:15:50 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Parallel ScaLAPACK-style algorithms for solving eight
                 common standard and generalized Sylvester-type matrix
                 equations and various sign and transposed variants are
                 presented. All algorithms are blocked variants based on
                 the Bartels--Stewart method and involve four major
                 steps: reduction to triangular form, updating the
                 right-hand side with respect to the reduction,
                 computing the solution to the reduced triangular
                 problem, and transforming the solution back to the
                 original coordinate system. Novel parallel algorithms
                 for solving reduced triangular matrix equations based
                 on wavefront-like traversal of the right-hand side
                 matrices are presented together with a generic
                 scalability analysis. These algorithms are used in
                 condition estimation and new robust parallel
                 sep$^{-1}$-estimators are developed. Experimental results
                 from three parallel platforms, including results from a
                 mixed OpenMP/MPI platform, are presented and analyzed
                 using several performance and accuracy metrics. The
                 analysis includes results regarding general and
                 triangular parallel solvers as well as parallel
                 condition estimators.",
  acknowledgement = ack-nhfb,
  articleno =    "32",
  fjournal =     "ACM Transactions on Mathematical Software",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
  keywords =     "condition estimation; Eigenvalue problems; library
                 software; Sylvester matrix equations",
}

@article{Gutierrez:2010:QCS,
  author          = {Eladio Guti{\'e}rrez and Sergio Romero and Mar{\'\i}a
                     A. Trenas and Emilio L. Zapata},
  title           = {Quantum computer simulation using the {CUDA}
                     programming model},
  journal         = j-COMP-PHYS-COMM,
  volume          = {181},
  number          = {2},
  pages           = {283--300},
  month           = feb,
  year            = {2010},
  CODEN           = {CPHCBZ},
  DOI             = {https://doi.org/10.1016/j.cpc.2009.09.021},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  bibdate         = {Sat Feb 11 09:54:27 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0010465509003117},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Hadjidoukas:2010:NOP,
  author =       "Panagiotis E. Hadjidoukas and Laurent Amsaleg",
  title =        "Nested {OpenMP} Parallelization of a Hierarchical Data
                 Clustering Algorithm",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "20",
  number =       "2",
  pages =        "187--208",
  month =        jun,
  year =         "2010",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626410000144",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:12 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@article{Hamid:2010:CMB,
  author          = {Nor Asilah Wati Abdul Hamid and Paul Coddington},
  title           = {Comparison of {MPI} Benchmark Programs on Shared
                     Memory and Distributed Memory Machines (Point-to-Point
                     Communication)},
  journal         = j-IJHPCA,
  volume          = {24},
  number          = {4},
  pages           = {469--483},
  month           = nov,
  year            = {2010},
  CODEN           = {IHPCFL},
  DOI             = {https://doi.org/10.1177/1094342010371106},
  ISSN            = {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L          = {1094-3420},
  bibdate         = {Tue Sep 6 15:14:35 MDT 2011},
  bibsource       = {http://hpc.sagepub.com/content/24/4.toc;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://hpc.sagepub.com/content/24/4/469.full.pdf+html},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of High Performance Computing
                     Applications},
  journal-URL     = {http://hpc.sagepub.com/content/by/year},
  onlinedate      = {June 7, 2010},
}

@article{Hawick:2010:PGC,
  author          = {K. A. Hawick and A. Leist and D. P. Playne},
  title           = {Parallel graph component labelling with {GPUs} and
                     {CUDA}},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {36},
  number          = {12},
  pages           = {655--678},
  month           = dec,
  year            = {2010},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Mon Nov 1 10:18:30 MDT 2010},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Hong:2010:IGP,
  author =       "Sunpyo Hong and Hyesoon Kim",
  title =        "An integrated {GPU} power and performance model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "280--289",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815998",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "GPU architectures are increasingly important in the
                 multi-core era due to their high number of parallel
                 processors. Performance optimization for multi-core
                 processors has been a challenge for programmers.
                 Furthermore, optimizing for power consumption is even
                 more difficult. Unfortunately, as a result of the high
                 number of processors, the power consumption of
                 many-core processors such as GPUs has increased
                 significantly.\par

                 Hence, in this paper, we propose an integrated power
                 and performance (IPP) prediction model for a GPU
                 architecture to predict the optimal number of active
                 processors for a given application. The basic intuition
                 is that when an application reaches the peak memory
                 bandwidth, using more cores does not result in
                 performance improvement.\par

                 We develop an empirical power model for the GPU. Unlike
                 most previous models, which require measured execution
                 times, hardware performance counters, or architectural
                 simulations, IPP predicts execution times to calculate
                 dynamic power events. We then use the outcome of IPP to
                 control the number of running cores. We also model the
                 increases in power consumption that resulted from the
                 increases in temperature.\par

                 With the predicted optimal number of active cores, we
                 show that we can save up to 22.09\% of runtime GPU
                 energy consumption and on average 10.99\% of that for
                 the five memory bandwidth-limited benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "analytical model; CUDA; energy; GPU architecture;
                 performance; power estimation",
}

@article{Huang:2010:ELA,
  author          = {Lei Huang and Haoqiang Jin and Liqi Yi and Barbara
                     Chapman},
  title           = {Enabling locality-aware computations in {OpenMP}},
  journal         = j-SCI-PROG,
  volume          = {18},
  number          = {3--4},
  pages           = {169--181},
  month           = {????},
  year            = {2010},
  CODEN           = {SCIPEV},
  DOI             = {https://doi.org/10.3233/SPR-2010-0307},
  ISSN            = {1058-9244 (print), 1875-919X (electronic)},
  ISSN-L          = {1058-9244},
  bibdate         = {Tue Dec 13 19:01:33 MST 2011},
  bibsource       = {http://www.iospress.nl/journal/scientific-programming/;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Scientific Programming},
  journal-URL     = {http://iospress.metapress.com/content/1058-9244},
}

@Article{Isaila:2010:SMP,
  author =       "Florin Isaila and Francisco Javier Garcia Blas and
                 Jes{\'u}s Carretero and Wei-keng Liao and Alok
                 Choudhary",
  title =        "A Scalable {Message Passing Interface} Implementation
                 of an Ad-Hoc Parallel {I/O} system",
  journal =      j-IJHPCA,
  volume =       "24",
  number =       "2",
  pages =        "164--184",
  month =        may,
  year =         "2010",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342009347890",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:46 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/24/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/24/2/164.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@article{Januszewski:2010:ANS,
  author          = {M. Januszewski and M. Kostur},
  title           = {Accelerating numerical solution of stochastic
                     differential equations with {CUDA}},
  journal         = j-COMP-PHYS-COMM,
  volume          = {181},
  number          = {1},
  pages           = {183--188},
  month           = jan,
  year            = {2010},
  CODEN           = {CPHCBZ},
  DOI             = {https://doi.org/10.1016/j.cpc.2009.09.009},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  bibdate         = {Sat Feb 11 09:54:27 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0010465509002999},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Jost:2010:EUH,
  author          = {Gabriele Jost and Bob Robins},
  title           = {Experiences using hybrid {MPI\slash OpenMP} in the
                     real world: Parallelization of a {$3$D} {CFD} solver
                     for multi-core node clusters},
  journal         = j-SCI-PROG,
  volume          = {18},
  number          = {3--4},
  pages           = {127--138},
  month           = {????},
  year            = {2010},
  CODEN           = {SCIPEV},
  DOI             = {https://doi.org/10.3233/SPR-2010-0308},
  ISSN            = {1058-9244 (print), 1875-919X (electronic)},
  ISSN-L          = {1058-9244},
  bibdate         = {Tue Dec 13 19:01:33 MST 2011},
  bibsource       = {http://www.iospress.nl/journal/scientific-programming/;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Scientific Programming},
  journal-URL     = {http://iospress.metapress.com/content/1058-9244},
}

@inproceedings{Kamal:2010:EIN,
  author          = {A. A. Kamal and A. M. Youssef},
  title           = {Enhanced implementation of the {NTRUEncrypt} algorithm
                     using graphics cards},
  crossref        = {Chaudhuri:2010:PIC},
  pages           = {168--174},
  year            = {2010},
  DOI             = {https://doi.org/10.1109/PDGC.2010.5679887},
  bibdate         = {Thu Apr 21 10:40:48 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract        = {The NTRU encryption algorithm, also known as
                     NTRUEncrypt, is a parameterized family of lattice-based
                     public key cryptosystems that has been accepted to the
                     IEEE P1363 standards under the specifications for
                     lattice-based public-key cryptography (IEEE P1363.1).
                     The operations of the NTRU encryption algorithm show
                     good characteristics for data parallel processing which
                     makes the NTRU a good candidate to benefit from the
                     high degree of parallelism available in modern graphics
                     processing units (GPUs). In this paper, we investigate
                     different GPU implementation options for the NTRU
                     encryption algorithm. Our implementation, on the NVIDIA
                     GTX275 GPU, using the CUDA framework, achieves about 77
                     MB/s for NTRU with the parameter set $ (N, q, p) =
                     (1171, 2048, 3) $.},
  acknowledgement = ack-nhfb,
  keywords        = {ANSI X9.98-2010; NTRUEncrypt},
}

@article{Kapinos:2010:PPP,
  author          = {Paul Kapinos and Dieter an Mey},
  title           = {Productivity and Performance Portability of the
                     {OpenMP 3.0} Tasking Concept When Applied to an
                     Engineering Code Written in {Fortran 95}},
  journal         = j-INT-J-PARALLEL-PROG,
  volume          = {38},
  number          = {5--6},
  pages           = {379--395},
  month           = oct,
  year            = {2010},
  CODEN           = {IJPPE5},
  ISSN            = {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L          = {0885-7458},
  bibdate         = {Wed Sep 1 16:06:49 MDT 2010},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=38&issue=5;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=38&issue=5&spage=379},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of Parallel Programming},
  journal-URL     = {http://link.springer.com/journal/10766},
}

@article{Khanna:2010:NMG,
  author          = {Gaurav Khanna and Justin McKennon},
  title           = {Numerical modeling of gravitational wave sources
                     accelerated by {OpenCL}},
  journal         = j-COMP-PHYS-COMM,
  volume          = {181},
  number          = {9},
  pages           = {1605--1611},
  month           = sep,
  year            = {2010},
  CODEN           = {CPHCBZ},
  DOI             = {https://doi.org/10.1016/j.cpc.2010.05.014},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  bibdate         = {Sat Feb 11 09:54:30 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0010465510001682},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
}

@Book{Kirk:2010:PMP,
  author =       "David B. Kirk and Wen-mei W. Hwu",
  title =        "Programming Massively Parallel Processors: a Hands-on
                 Approach",
  publisher =    pub-MORGAN-KAUFMANN,
  address =      pub-MORGAN-KAUFMANN:adr,
  pages =        "xviii + 258",
  year =         "2010",
  ISBN =         "0-12-381472-3",
  ISBN-13 =      "978-0-12-381472-2",
  LCCN =         "QA76.642 .K57 2010",
  bibdate =      "Thu Jul 29 13:33:50 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/master.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/scpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/unix.bib;
                 z3950.bibsys.no:2100/BIBSYS;
                 z3950.loc.gov:7090/Voyager",
  note =         "Chapter 7 (pages 125--140) discusses GPU
                 floating-point considerations.",
  acknowledgement = ack-nhfb,
  keywords =     "CUDA; nVIDIA",
  libnote =      "Not yet in my library.",
  subject =      "parallel programming (computer science); parallel
                 processing (electronic computers); multiprocessors;
                 computer architecture",
  tableofcontents = "1: Introduction \\
                 1.1 GPUs as Parallel Computers \\
                 1.2 Architecture of a Modern GPU \\
                 1.3 Why More Speed or Parallelism? \\
                 1.4 Parallel Programming Languages and Models \\
                 1.5 Overarching Goals \\
                 1.6 Organization of the Book \\
                 2: History of GPU Computing \\
                 2.1. Evolution of Graphics Pipelines The Era of Fixed
                 Function Graphics Pipeline Evolution of Programmable
                 Real-Time Graphics Unified Graphics and Computing
                 Processors \\
                 2.2. GPGPU: an Intermediate Step Scalable GPUs Recent
                 Developments Future Trends \\
                 3: Introduction to CUDA \\
                 3.1. Data Parallelism \\
                 3.2. CUDA Program Structure \\
                 3.3. A Matrix--Matrix Multiplication Example \\
                 3.4. Device Memories and Data Transfer \\
                 3.5. Kernel Functions and Threading \\
                 3.6. Summary Function Declarations Kernel Launch
                 Predefined Variables Runtime API \\
                 4: CUDA Threads \\
                 4.1. CUDA Thread Organization \\
                 4.2. More on BlockIdx and ThreadIdx \\
                 4.3. Synchronization and Transparent Scalability \\
                 4.4. Thread Assignment \\
                 4.5. Thread Scheduling and Latency Tolerance \\
                 4.6. Summary \\
                 5: CUDA Memories \\
                 5.1. Importance of Memory Access Efficiency \\
                 5.2. CUDA Device Memory Types \\
                 5.3. A Strategy for Reducing Global Memory Traffic \\
                 5.4. Memory as a Limiting Factor to Parallelism \\
                 5.5. Summary \\
                 6: Performance Considerations \\
                 6.1. More on Thread Execution \\
                 6.2. Global Memory Bandwidth \\
                 6.3. Dynamic Partitioning of SM Resources \\
                 6.4. Data Prefetching \\
                 6.5. Instruction Mix \\
                 6.6. Thread Granularity \\
                 6.7. Measured Performance and Summary \\
                 7: Floating-Point Considerations \\
                 7.1. Floating-Point Format Normalized representation of
                 M Excess encoding of E \\
                 7.2. Representable Numbers \\
                 7.3. Special Bit Patterns and Precision \\
                 7.4. Arithmetic Accuracy and Rounding \\
                 7.5. Algorithm Considerations \\
                 7.6. Summary \\
                 8: Application Case Study I \\
                 Advanced MRI Reconstruction \\
                 8.1. Application Background \\
                 8.2. Iterative Reconstruction \\
                 8.3. Computing FHd \\
                 Step 1: Determine the Kernel Parallelism Structure \\
                 Step 2: Getting Around the Memory Bandwidth Limitation
                 \\
                 Step 3: Use Hardware Trigonometry Functions \\
                 Step 4: Experimental Performance Testing \\
                 8.4. Final Evaluation \\
                 9: Application Case Study II \\
                 Molecular Visualization and Analysis \\
                 9.1. Application Background \\
                 9.2. A Simple Kernel Implementation \\
                 9.3. Instruction Execution Efficiency \\
                 9.4. Memory Coalescing \\
                 9.5. Additional Performance Comparisons \\
                 9.6. Using Multiple GPUs \\
                 10: Parallel Programming and Computational Thinking \\
                 10.1. Goals of Parallel Programming \\
                 10.2. Problem Decomposition \\
                 10.3. Algorithm Selection \\
                 10.4. Computational Thinking \\
                 11: A Brief Introduction to OpenCL \\
                 11.1. Background \\
                 11.2. Data Parallelism Model \\
                 11.3. Device Architecture \\
                 11.4. Kernel Functions \\
                 11.5. Device Management and Kernel Launch \\
                 11.6. Electrostatic Potential Map in OpenCL \\
                 11.7. Summary \\
                 12: Conclusion and Future Outlook \\
                 12.1. Goals Revisited \\
                 12.2. Memory Architecture Evolution \\
                 12.3. Kernel Execution Control Evolution \\
                 12.4. Core Performance \\
                 12.5. Programming Environment \\
                 12.6. A Bright Outlook \\
                 Appendix A: Matrix Multiplication Example Code \\
                 Appendix B: Speed and feed of current generation CUDA
                 devices",
}

@article{Komatitsch:2010:HOF,
  author          = {Dimitri Komatitsch and Gordon Erlebacher and Dominik
                     G{\"o}ddeke and David Mich{\'e}a},
  title           = {High-order finite-element seismic wave propagation
                     modeling with {MPI} on a large {GPU} cluster},
  journal         = j-J-COMPUT-PHYS,
  volume          = {229},
  number          = {20},
  pages           = {7692--7714},
  day             = {1},
  month           = oct,
  year            = {2010},
  CODEN           = {JCTPAH},
  DOI             = {https://doi.org/10.1016/j.jcp.2010.06.024},
  ISSN            = {0021-9991 (print), 1090-2716 (electronic)},
  ISSN-L          = {0021-9991},
  bibdate         = {Sat Dec 31 11:58:42 MST 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jcomputphys2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0021999110003396},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Computational Physics},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00219991},
}

@article{Koval:2010:USB,
  author          = {Peter Koval and J. D. Talman},
  title           = {Update of spherical {Bessel} transform: {FFTW} and
                     {OpenMP}},
  journal         = j-COMP-PHYS-COMM,
  volume          = {181},
  number          = {12},
  pages           = {2212--2213},
  month           = dec,
  year            = {2010},
  CODEN           = {CPHCBZ},
  DOI             = {https://doi.org/10.1016/j.cpc.2010.08.024},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  bibdate         = {Sat Feb 11 09:54:31 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0010465510003188},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Kwon:2010:SPC,
  author =       "Seongnam Kwon and Soonhoi Ha",
  title =        "Serialized parallel code generation framework for
                 {MPSoC}",
  journal =      j-TODAES,
  volume =       "15",
  number =       "2",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2010",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/1698759.1698761",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Mon Mar 15 11:19:08 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/todaes/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "The models of computations that express concurrency
                 naturally are preferred for initial specification of
                 MPSoC system, since popular programming languages such
                 as C and C++ are designed for sequential execution. In
                 our previous work, we proposed a design framework where
                 two models are used for the initial specification of
                 the system behavior; task model at the top level and
                 dataflow model inside each task. After the partition
                 and mapping process is performed with each architecture
                 candidate, the target code is automatically generated
                 for both Design-Space Exploration (DSE) and final
                 implementation. In this article, we focus on parallel
                 code generation for MPSoC, proposing two main
                 techniques. The first is to express functional and data
                 parallelism differently following the partition and
                 mapping decision. In the proposed technique, the
                 generated code consists of multiple tasks running
                 concurrently, which achieves functional parallelism. On
                 the other hand, we use OpenMP directives to express
                 data parallelism inside a task. Second is to adopt the
                 code serialization technique to execute a multitasking
                 application without OS scheduler, aiming to generate
                 the highly portable code on various platforms for an
                 efficient DSE process. We extend the previous code
                 serialization techniques to multiprocessor systems and
                 utilize the formal properties of the dataflow model for
                 efficient code generation. The experiments including
                 H.263 codec example show the viability of the proposed
                 technique and the efficiency of the generated code.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems (TODAES)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
  keywords =     "design-space exploration; Embedded software;
                 multiprocessor system on chip; parallel programming;
                 software generation",
}

@Article{Lastovetsky:2010:RAP,
  author =       "Alexey Lastovetsky and Tahar Kechadi",
  title =        "Recent Advances in {Parallel Virtual Machine} and
                 {Message Passing Interface}",
  journal =      j-IJHPCA,
  volume =       "24",
  number =       "1",
  pages =        "3--4",
  month =        feb,
  year =         "2010",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342009359523",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/24/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/24/1/3.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Li:2010:SVC,
  author =       "Guodong Li and Ganesh Gopalakrishnan and Robert M.
                 Kirby and Dan Quinlan",
  title =        "A symbolic verifier for {CUDA} programs",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "357--358",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1837853.1693512",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We present a preliminary automated verifier based on
                 mechanical decision procedures which is able to prove
                 functional correctness of CUDA programs and guarantee
                 to detect bugs such as race conditions. We also employ
                 a symbolic partial order reduction (POR) technique to
                 mitigate the interleaving explosion problem.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "cuda; formal verification; SPMD; symbolic analysis",
}

@Article{Lin:2010:TLS,
  author =       "Paul T. Lin and John N. Shadid",
  title =        "Towards large-scale multi-socket, multicore parallel
                 simulations: Performance of an {MPI}-only semiconductor
                 device simulator",
  journal =      j-J-COMPUT-PHYS,
  volume =       "229",
  number =       "19",
  pages =        "6804--6818",
  day =          "20",
  month =        sep,
  year =         "2010",
  CODEN =        "JCTPAH",
  DOI =          "https://doi.org/10.1016/j.jcp.2010.05.023",
  ISSN =         "0021-9991 (print), 1090-2716 (electronic)",
  ISSN-L =       "0021-9991",
  bibdate =      "Sat Dec 31 11:58:37 MST 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcomputphys2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0021999110002846",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Physics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00219991",
}

@Article{Liu:2010:RTC,
  author =       "Fuchang Liu and Takahiro Harada and Youngeun Lee and
                 Young J. Kim",
  title =        "Real-time collision culling of a million bodies on
                 graphics processing units",
  journal =      j-TOG,
  volume =       "29",
  number =       "6",
  pages =        "154:1--154:??",
  month =        dec,
  year =         "2010",
  CODEN =        "ATGRDF",
  DOI =          "https://doi.org/10.1145/1882261.1866180",
  ISSN =         "0730-0301 (print), 1557-7368 (electronic)",
  ISSN-L =       "0730-0301",
  bibdate =      "Thu Dec 9 11:41:01 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tog/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tog.bib",
  abstract =     "We cull collisions between very large numbers of
                 moving bodies using graphics processing units (GPUs).
                 To perform massively parallel sweep-and-prune (SaP), we
                 mitigate the great density of intervals along the axis
                 of sweep by using principal component analysis to
                 choose the best sweep direction, together with spatial
                 subdivisions to further reduce the number of false
                 positive overlaps. Our algorithm implemented entirely
                 on GPUs using the CUDA framework can handle a million
                 moving objects at interactive rates. As application of
                 our algorithm, we demonstrate the real-time simulation
                 of very large numbers of particles and rigid-body
                 dynamics.",
  acknowledgement = ack-nhfb,
  articleno =    "154",
  fjournal =     "ACM Transactions on Graphics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J778",
}

@Article{Marjanovic:2010:ECC,
  author =       "Vladimir Marjanovic and Jes{\'u}s Labarta and Eduard
                 Ayguad{\'e} and Mateo Valero",
  title =        "Effective communication and computation overlap with
                 hybrid {MPI\slash SMPSs}",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "337--338",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1837853.1693502",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Communication overhead is one of the dominant factors
                 affecting performance in high-performance computing
                 systems. To reduce the negative impact of
                 communication, programmers overlap communication and
                 computation by using asynchronous communication
                 primitives. This increases code complexity, requiring
                 more development effort and making less readable
                 programs. This paper presents the hybrid use of MPI and
                 SMPSs (SMP superscalar, a task-based shared-memory
                 programming model) that allows the programmer to easily
                 introduce the asynchrony necessary to overlap
                 communication and computation. We demonstrate the
                 hybrid use of MPI/SMPSs with the high-performance
                 LINPACK benchmark (HPL), and compare it to the pure MPI
                 implementation, which uses the look-ahead technique to
                 overlap communication and computation. The hybrid
                 MPI/SMPSs version significantly improves the
                 performance of the pure MPI version, getting close to
                 the asymptotic performance at medium problem sizes and
                 still getting significant benefits at small/large
                 problem sizes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "hybrid MPI/SMPSs; LINPACK; MPI; parallel programming
                 model",
}

@Article{Molnar:2010:APM,
  author =       "F. {Moln{\'a}r, Jr.} and T. Szak{\'a}ly and R.
                 M{\'e}sz{\'a}ros and I. Lagzi",
  title =        "Air pollution modelling using a {Graphics Processing
                 Unit} with {CUDA}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "181",
  number =       "1",
  pages =        "105--112",
  month =        jan,
  year =         "2010",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2009.09.008",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Feb 11 09:54:27 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465509002872",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Muller:2010:SMA,
  author =       "Matthias S. M{\"u}ller and Matthijs van Waveren and
                 Ron Lieberman and Brian Whitney and Hideki Saito and
                 Kalyan Kumaran and John Baron and William C. Brantley
                 and Chris Parrott and Tom Elken and Huiyu Feng and Carl
                 Ponder",
  title =        "{SPEC MPI2007} --- an application benchmark suite for
                 parallel systems using {MPI}",
  journal =      j-CCPE,
  volume =       "22",
  number =       "2",
  pages =        "191--205",
  month =        feb,
  year =         "2010",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1535",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:41 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "2 Dec 2009",
}

@Article{Nesterov:2010:SPT,
  author =       "Oleksandr Nesterov",
  title =        "A simple parallelization technique with {MPI} for
                 ocean circulation models",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "70",
  number =       "1",
  pages =        "35--44",
  month =        jan,
  year =         "2010",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Wed Sep 1 16:27:27 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Nunez:2010:NTS,
  author =       "Alberto N{\'u}{\~n}ez and Javier Fern{\'a}ndez and
                 Jose D. Garcia and F{\'e}lix Garcia and Jes{\'u}s
                 Carretero",
  title =        "New techniques for simulating high performance {MPI}
                 applications on large storage networks",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "51",
  number =       "1",
  pages =        "40--57",
  month =        jan,
  year =         "2010",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Aug 25 08:38:45 MDT 2010",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=51&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=51&issue=1&spage=40",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Okitsu:2010:HPC,
  author =       "Yusuke Okitsu and Fumihiko Ino and Kenichi Hagihara",
  title =        "High-performance cone beam reconstruction using {CUDA}
                 compatible {GPUs}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "36",
  number =       "2--3",
  pages =        "129--141",
  month =        feb # "\slash " # mar,
  year =         "2010",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:12 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Olivier:2010:COO,
  author =       "Stephen L. Olivier and Jan F. Prins",
  title =        "Comparison of {OpenMP 3.0} and Other Task Parallel
                 Frameworks on Unbalanced Task Graphs",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "38",
  number =       "5--6",
  pages =        "341--360",
  month =        oct,
  year =         "2010",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Sep 1 16:06:49 MDT 2010",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=38&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=38&issue=5&spage=341",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Pan:2010:CPS,
  author =       "Heidi Pan and Benjamin Hindman and Krste
                 Asanovi{\'c}",
  title =        "Composing parallel software efficiently with {Lithe}",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "6",
  pages =        "376--387",
  month =        jun,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1809028.1806639",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 8 17:53:18 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Applications composed of multiple parallel libraries
                 perform poorly when those libraries interfere with one
                 another by obliviously using the same physical cores,
                 leading to destructive resource oversubscription. This
                 paper presents the design and implementation of {\em
                 Lithe}, a low-level substrate that provides the basic
                 primitives and a standard interface for composing
                 parallel codes efficiently. Lithe can be inserted
                 underneath the runtimes of legacy parallel libraries to
                 provide {\em bolt-on\/} composability without needing
                 to change existing application code. Lithe can also
                 serve as the foundation for building new parallel
                 abstractions and libraries that automatically
                 interoperate with one another.\par

                 In this paper, we show versions of Threading Building
                 Blocks (TBB) and OpenMP perform competitively with
                 their original implementations when ported to Lithe.
                 Furthermore, for two applications composed of multiple
                 parallel libraries, we show that leveraging our
                 substrate outperforms their original, even expertly
                 tuned, implementations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "composability; cooperative scheduling; hierarchical
                 scheduling; oversubscription; parallelism; resource
                 management; user-level scheduling",
}

@Article{Pervez:2010:FMA,
  author =       "Salman Pervez and Ganesh Gopalakrishnan and Robert M.
                 Kirby and Rajeev Thakur and William Gropp",
  title =        "Formal methods applied to high-performance computing
                 software design: a case study of {MPI} one-sided
                 communication-based locking",
  journal =      j-SPE,
  volume =       "40",
  number =       "1",
  pages =        "23--43",
  day =          "??",
  month =        jan,
  year =         "2010",
  CODEN =        "SPEXBL",
  DOI =          "https://doi.org/10.1002/spe.946",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Wed Mar 17 10:16:21 MDT 2010",
  bibsource =    "http://www.interscience.wiley.com/jpages/0038-0644;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Soft{\-}ware\emdash Prac{\-}tice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
  onlinedate =   "Dec 21 2009 3:42AM",
}

@Article{Preissl:2010:OCC,
  author =       "Robert Preissl and Alice Koniges and Stephan Ethier
                 and Weixing Wang and Nathan Wichmann",
  title =        "Overlapping communication with computation using
                 {OpenMP} tasks on the {GTS} magnetic fusion code",
  journal =      j-SCI-PROG,
  volume =       "18",
  number =       "3--4",
  pages =        "139--151",
  month =        "????",
  year =         "2010",
  CODEN =        "SCIPEV",
  DOI =          "https://doi.org/10.3233/SPR-2010-0311",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Tue Dec 13 19:01:33 MST 2011",
  bibsource =    "http://www.iospress.nl/journal/scientific-programming/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Preissl:2010:TMS,
  author =       "Robert Preissl and Martin Schulz and Dieter
                 Kranzlm{\"u}ller and Bronis R. de Supinski and Daniel
                 J. Quinlan",
  title =        "Transforming {MPI} source code based on communication
                 patterns",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "26",
  number =       "1",
  pages =        "147--154",
  month =        jan,
  year =         "2010",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Sat Sep 11 13:08:16 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Sainio:2010:CGA,
  author =       "J. Sainio",
  title =        "{CUDAEASY} --- a {GPU} accelerated cosmological
                 lattice program",
  journal =      j-COMP-PHYS-COMM,
  volume =       "181",
  number =       "5",
  pages =        "906--912",
  month =        may,
  year =         "2010",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2010.01.002",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Feb 11 09:54:29 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465510000159",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Saldana:2010:MPM,
  author =       "Manuel Salda{\~n}a and Arun Patel and Christopher
                 Madill and Daniel Nunes and Danyao Wang and Paul Chow
                 and Ralph Wittig and Henry Styles and Andrew Putnam",
  title =        "{MPI} as a Programming Model for High-Performance
                 Reconfigurable Computers",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "22:1--22:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862652",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Book{Sanders:2010:CEI,
  author =       "Jason Sanders and Edward Kandrot",
  title =        "{CUDA} by Example: an Introduction to General-purpose
                 {GPU} Programming",
  publisher =    pub-AW,
  address =      pub-AW:adr,
  pages =        "xix + 290",
  year =         "2010",
  ISBN =         "0-13-138768-5",
  ISBN-13 =      "978-0-13-138768-3",
  LCCN =         "QA76.76.A65",
  bibdate =      "Wed Jul 28 23:24:12 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/master.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/scpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/unix.bib;
                 z3950.gbv.de:20011/gvk",
  abstract =     "CUDA is a computing architecture designed to
                 facilitate the development of parallel programs. This
                 book shows programmers how to employ this new
                 technology. Each area of CUDA development is introduced
                 through working examples. After a concise introduction
                 to the CUDA platform and architecture, as well as a
                 quick-start guide to CUDA C, the book details the
                 techniques and trade-offs associated with each key CUDA
                 feature.",
  acknowledgement = ack-nhfb,
  keywords =     "CUDA; GPU",
  subject =      "application software; development; computer
                 architecture; parallel programming (computer science)",
  tableofcontents = "Why CUDA? why now? \\
                 Getting started \\
                 Introduction to CUDA C \\
                 Parallel programming in CUDA C \\
                 Thread cooperation \\
                 Constant memory and events \\
                 Texture memory \\
                 Graphics interoperability \\
                 Atomics \\
                 Streams \\
                 CUDA C on multiple GPUs \\
                 The final countdown \\
                 Appendix A: Advanced atomics",
}

@Article{Sandes:2010:CUG,
  author =       "Edans Flavius O. Sandes and Alba Cristina M. A. de
                 Melo",
  title =        "{CUDAlign}: using {GPU} to accelerate the comparison
                 of megabase genomic sequences",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "137--146",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693473",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Biological sequence comparison is a very important
                 operation in Bioinformatics. Even though there do exist
                 exact methods to compare biological sequences, these
                 methods are often neglected due to their quadratic time
                 and space complexity. In order to accelerate these
                 methods, many GPU algorithms were proposed in the
                 literature. Nevertheless, all of them restrict the size
                 of the smallest sequence in such a way that Megabase
                 genome comparison is prevented. In this paper, we
                 propose and evaluate CUDAlign, a GPU algorithm that is
                 able to compare Megabase biological sequences with an
                 exact Smith--Waterman affine gap variant. CUDAlign was
                 implemented in CUDA and tested in two GPU boards,
                 separately. For real sequences whose size range from
                 1MBP (Megabase Pairs) to 47MBP, a close to uniform
                 GCUPS (Giga Cells Updates per Second) was obtained,
                 showing the potential scalability of our approach.
                 Also, CUDAlign was able to compare the human chromosome
                 21 and the chimpanzee chromosome 22. This operation
                 took 21 hours on GeForce GTX 280, resulting in a peak
                 performance of 20.375 GCUPS. As far as we know, this is
                 the first time such huge chromosomes are compared with
                 an exact method.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "biological sequence comparison; GPU; Smith--Waterman",
}

@Article{Segovia:2010:PPN,
  author =       "Alejandro Segovia",
  title =        "Parallel programming with {NVIDIA CUDA}",
  journal =      j-LINUX-J,
  volume =       "2010",
  number =       "200",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2010",
  CODEN =        "LIJOFX",
  ISSN =         "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L =       "1075-3583",
  bibdate =      "Mon Jan 10 10:01:27 MST 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/linux-journal.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Linux Journal",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J508",
}

@Article{Shi:2010:PAE,
  author =       "Haixiang Shi and Bertil Schmidt and Weiguo Liu and
                 Wolfgang M{\"u}ller-Wittig",
  title =        "A Parallel Algorithm for Error Correction in
                 High-Throughput Short-Read Data on {CUDA}-Enabled
                 Graphics Hardware",
  journal =      j-J-COMPUT-BIOL,
  volume =       "17",
  number =       "4",
  pages =        "603--615",
  month =        apr,
  year =         "2010",
  CODEN =        "JCOBEM",
  DOI =          "https://doi.org/10.1089/cmb.2009.0062",
  ISSN =         "1066-5277 (print), 1557-8666 (electronic)",
  ISSN-L =       "1066-5277",
  bibdate =      "Sat Jun 1 09:49:51 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcomputbiol.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.liebertpub.com/doi/abs/10.1089/cmb.2009.0062;
                 https://www.liebertpub.com/doi/pdf/10.1089/cmb.2009.0062",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Biology",
  journal-URL =  "https://www.liebertpub.com/loi/cmb/",
  onlinedate =   "28 April 2010",
}

@Article{Stone:2010:OPP,
  author =       "John E. Stone and David Gohara and Guochun Shi",
  title =        "{OpenCL}: a Parallel Programming Standard for
                 Heterogeneous Computing Systems",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "12",
  number =       "3",
  pages =        "66--73",
  month =        may # "\slash " # jun,
  year =         "2010",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2010.69",
  ISSN =         "1521-9615 (print), 1558-366X (electronic)",
  ISSN-L =       "1521-9615",
  bibdate =      "Thu May 13 11:08:14 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

@InProceedings{Suciu:2010:PIN,
  author =       "A. Suciu and I. Nagy and K. Marton and I. Pinca",
  editor =       "Ioan Alfred Letia",
  booktitle =    "{Proceedings, 2010 IEEE 6th International Conference
                 on Intelligent Computer Communication and Processing:
                 Cluj-Napoca, Romania, August 26--28, 2010}",
  title =        "Parallel implementation of the {NIST Statistical Test
                 Suite}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  bookpages =    "xiii + 487",
  pages =        "363--368",
  year =         "2010",
  DOI =          "https://doi.org/10.1109/ICCP.2010.5606412",
  ISBN =         "1-4244-8228-3 (print), 1-4244-8230-5 (electronic)",
  ISBN-13 =      "978-1-4244-8228-3 (print), 978-1-4244-8230-6
                 (electronic)",
  LCCN =         "QA76.76.E95",
  bibdate =      "Tue Jan 31 14:22:16 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number CFP1009D-ART.",
  URL =          "http://ieeexplore.ieee.org/servlet/opac?punumber=5598248",
  acknowledgement = ack-nhfb,
  keywords =     "OpenMP API",
  onlinedate =   "21 October 2010",
  remark =       "From the abstract: ``Experimental results show a very
                 significant speedup of up to 103 times compared to the
                 original version.''",
}

@Article{Traff:2010:SCM,
  author =       "Jesper Larsson Tr{\"a}ff and William D. Gropp and
                 Rajeev Thakur",
  title =        "Self-Consistent {MPI} Performance Guidelines",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "21",
  number =       "5",
  pages =        "698--709",
  month =        may,
  year =         "2010",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2009.120",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu May 13 12:06:56 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Tzannes:2010:LBS,
  author =       "Alexandros Tzannes and George C. Caragea and Rajeev
                 Barua and Uzi Vishkin",
  title =        "Lazy binary-splitting: a run-time adaptive
                 work-stealing scheduler",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "179--190",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693479",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "We present Lazy Binary Splitting (LBS), a user-level
                 scheduler of nested parallelism for shared-memory
                 multiprocessors that builds on existing Eager Binary
                 Splitting work-stealing (EBS) implemented in Intel's
                 Threading Building Blocks (TBB), but improves
                 performance and ease-of-programming. In its simplest
                 form (SP), EBS requires manual tuning by repeatedly
                 running the application under carefully controlled
                 conditions to determine a {\em stop-splitting-threshold
                 (sst)\/} for every do-all loop in the code. This
                 threshold limits the parallelism and prevents excessive
                 overheads for fine-grain parallelism. Besides being
                 tedious, this tuning also over-fits the code to some
                 particular dataset, platform and calling context of the
                 do-all loop, resulting in poor performance portability
                 for the code. LBS overcomes both the performance
                 portability and ease-of-programming pitfalls of a
                 manually fixed threshold by adapting dynamically to
                 run-time conditions without requiring tuning.\par

                 We compare LBS to Auto-Partitioner (AP), the latest
                 default scheduler of TBB, which does not require manual
                 tuning either but lacks context portability, and
                 outperform it by 38.9\% using TBB's default AP
                 configuration, and by 16.2\% after we tuned AP to our
                 experimental platform. We also compare LBS to SP by
                 manually finding SP's sst using a training dataset and
                 then running both on a different execution dataset. LBS
                 outperforms SP by 19.5\% on average, while allowing for
                 improved performance portability without requiring
                 tedious manual tuning. LBS also outperforms SP with
                 {\em sst=1}, its default value when undefined, by
                 56.7\%, and serializing work-stealing (SWS), another
                 work-stealer by 54.7\%. Finally, compared to
                 serializing inner parallelism (SI) which has been used
                 by OpenMP, LBS is 54.2\% faster.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIG{\-}PLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "dynamic scheduling; load balancing; nested
                 parallelism; thread scheduling; work stealing",
}

@Article{Wendykier:2010:PCH,
  author =       "Piotr Wendykier and James G. Nagy",
  title =        "{Parallel Colt}: a High-Performance {Java} Library for
                 Scientific Computing and Image Processing",
  journal =      j-TOMS,
  volume =       "37",
  number =       "3",
  pages =        "31:1--31:22",
  month =        sep,
  year =         "2010",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1824801.1824809",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon Sep 27 10:15:50 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/super.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "Major breakthroughs in chip and software design have
                 been observed for the last nine years. In October 2001,
                 IBM released the world's first multicore processor:
                 POWER4. Six years later, in February 2007, NVIDIA made
                 a public release of CUDA SDK, a set of development
                 tools to write algorithms for execution on Graphic
                 Processing Units (GPUs). Although software vendors have
                 started working on parallelizing their products, the
                 vast majority of existing code is still sequential and
                 does not effectively utilize modern multicore CPUs and
                 manycore GPUs.\par

                 This article describes Parallel Colt, a multithreaded
                 Java library for scientific computing and image
                 processing. In addition to describing the design and
                 functionality of Parallel Colt, a comparison to MATLAB
                 is presented. Two ImageJ plugins for iterative image
                 deblurring and motion correction of PET brain images
                 are described as typical applications of this library.
                 Performance comparisons with MATLAB, including GPU
                 computations via AccelerEyes' Jacket toolbox are also
                 given.",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
  keywords =     "Deconvolution; FFT; inverse problems; iterative
                 methods; motion correction; multithreading; PET;
                 regularization",
}

@InProceedings{Zhao:2010:GMP,
  author =       "Kaiyong Zhao and Xiaowen Chu",
  editor =       "{IEEE}",
  booktitle =    "{IEEE 10th International Conference on Computer and
                 Information Technology (CIT), 2010: June 29, 2010--July
                 1, 2010, Bradford, West Yorkshire, UK}",
  title =        "{GPUMP}: a Multiple-Precision Integer Library for
                 {GPUs}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  bookpages =    "xcix + 2987 (est.)",
  pages =        "1164--1168",
  year =         "2010",
  DOI =          "https://doi.org/10.1109/CIT.2010.211",
  ISBN =         "0-7695-4108-9 (print), 1-4244-7547-3",
  ISBN-13 =      "978-0-7695-4108-2 (print), 978-1-4244-7547-6",
  LCCN =         "????",
  bibdate =      "Thu Jan 16 10:33:01 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE Computer Society Order Number E4108. BMS Part
                 Number: CFP10355-CDR",
  acknowledgement = ack-nhfb,
  book-URL =     "http://ieeexplore.ieee.org/servlet/opac?punumber=5575291",
  keywords =     "CUDA; GPU; multiple-precision algorithm;
                 multiple-precision comparison; multiple-precision
                 division; multiple-precision exponentiation;
                 multiple-precision modular addition; multiple-precision
                 modular multiplication; multiple-precision Montgomery
                 exponentiation; multiple-precision Montgomery
                 multiplication; multiple-precision Montgomery
                 reduction; multiple-precision multiplication; nVidia
                 GT200 GPU",
}

@Article{Agrawal:2011:PPS,
  author =       "Ankit Agrawal and Sanchit Misra and Daniel Honbo and
                 Alok Choudhary",
  title =        "Parallel pairwise statistical significance estimation
                 of local sequence alignment using {Message Passing
                 Interface} library",
  journal =      j-CCPE,
  volume =       "23",
  number =       "17",
  pages =        "2269--2279",
  day =          "10",
  month =        dec,
  year =         "2011",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1798",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:09:00 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "7 Jul 2011",
}

@Article{Agullo:2011:QOM,
  author =       "Emmanuel Agullo and Camille Coti and Thomas Herault
                 and Julien Langou and Sylvain Peyronnet and Ala
                 Rezmerita and Franck Cappello and Jack Dongarra",
  title =        "{QCG-OMPI}: {MPI} applications on grids",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "27",
  number =       "4",
  pages =        "357--369",
  month =        apr,
  year =         "2011",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Tue Aug 30 11:43:29 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Alonso:2011:NEM,
  author =       "P. Alonso and R. Cortina and F. J.
                 Mart{\'\i}nez-Zald{\'\i}var and J. Ranilla",
  title =        "{Neville} elimination on multi- and many-core systems:
                 {OpenMP}, {MPI} and {CUDA}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "58",
  number =       "2",
  pages =        "215--225",
  month =        nov,
  year =         "2011",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Tue Dec 13 15:25:06 MST 2011",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=58&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=58&issue=2&spage=215",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Balaji:2011:MMC,
  author =       "Pavan Balaji and Darius Buntinas and David Goodell and
                 William Gropp and Torsten Hoefler and Sameer Kumar and
                 Ewing Lusk and Rajeev Thakur and Jesper Larsson
                 Tr{\"a}ff",
  title =        "{MPI} on Millions of Cores",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "21",
  number =       "1",
  pages =        "45--60",
  month =        mar,
  year =         "2011",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626411000060",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Tue Feb 28 11:32:06 MST 2012",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Balevic:2011:KAD,
  author =       "Ana Balevic and Bart Kienhuis",
  title =        "{KPN2GPU}: an approach for discovery and exploitation
                 of fine-grain data parallelism in process networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "66--71",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082173",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With advances in manycore and accelerator
                 architectures, the high performance and embedded spaces
                 are rapidly converging. Emerging architectures feature
                 different forms of parallelism. The Polyhedral
                 Processes Networks (PPNs) are a proven model of choice
                 for automated generation of pipeline and task parallel
                 programs from sequential source code, however data
                 parallelism is not addressed. In this paper, we present
                 a systematic approach for identification and extraction
                 of fine grain data parallelism from the PPN
                 specification. The approach is implemented in a tool,
                 called kpn2gpu, which produces fine-grain data parallel
                 CUDA kernels for graphics processing units (GPUs).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Bhattacharjee:2011:PLC,
  author =       "Abhishek Bhattacharjee and Gilberto Contreras and
                 Margaret Martonosi",
  title =        "Parallelization libraries: Characterizing and reducing
                 overheads",
  journal =      j-TACO,
  volume =       "8",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1952998.1953003",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Apr 27 07:54:03 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Creating efficient, scalable dynamic parallel runtime
                 systems for chip multiprocessors (CMPs) requires
                 understanding the overheads that manifest at high core
                 counts and small task sizes. In this article, we assess
                 these overheads on Intel's Threading Building Blocks
                 (TBB) and OpenMP. First, we use real hardware and
                 simulations to detail various scheduler and
                 synchronization overheads. We find that these can
                 amount to 47\% of TBB benchmark runtime and 80\% of
                 OpenMP benchmark runtime. Second, we propose load
                 balancing techniques such as occupancy-based and
                 criticality-guided task stealing, to boost performance.
                 Overall, our study provides valuable insights for
                 creating robust, scalable runtime libraries.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Camp:2011:SIU,
  author =       "David Camp and Christoph Garth and Hank Childs and
                 Dave Pugmire and Kenneth I. Joy",
  title =        "Streamline Integration Using {MPI}-Hybrid Parallelism
                 on a Large Multicore Architecture",
  journal =      j-IEEE-TRANS-VIS-COMPUT-GRAPH,
  volume =       "17",
  number =       "11",
  pages =        "1702--1713",
  month =        nov,
  year =         "2011",
  CODEN =        "ITVGEA",
  DOI =          "https://doi.org/10.1109/TVCG.2010.259",
  ISSN =         "1077-2626 (print), 1941-0506 (electronic), 2160-9306",
  ISSN-L =       "1077-2626",
  bibdate =      "Thu Sep 29 11:52:46 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Visualization and Computer
                 Graphics",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945",
}

@Article{Cao:2011:OMM,
  author =       "Chao Cao and Yun-wen Chen and Yuning Wu and Erik
                 Deumens and Hai-Ping Cheng",
  title =        "{OPAL}: a multiscale multicenter simulation package
                 based on {MPI-2} protocol",
  journal =      j-IJQC,
  volume =       "111",
  number =       "15",
  pages =        "4020--4029",
  month =        dec,
  year =         "2011",
  CODEN =        "IJQCB2",
  DOI =          "https://doi.org/10.1002/qua.22916",
  ISSN =         "0020-7608 (print), 1097-461X (electronic)",
  ISSN-L =       "0020-7608",
  bibdate =      "Sat Oct 1 15:40:12 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ijqc2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Int. J. Quantum Chem.",
  fjournal =     "International Journal of Quantum Chemistry",
  journal-URL =  "http://www.interscience.wiley.com/jpages/0020-7608/",
  onlinedate =   "23 Nov 2010",
}

@Article{Catanzaro:2011:CCE,
  author =       "Bryan Catanzaro and Michael Garland and Kurt Keutzer",
  title =        "{Copperhead}: compiling an embedded data parallel
                 language",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "8",
  pages =        "47--56",
  month =        aug,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2038037.1941562",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 14:04:45 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/python.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '11 Conference proceedings.",
  abstract =     "Modern parallel microprocessors deliver high
                 performance on applications that expose substantial
                 fine-grained data parallelism. Although data
                 parallelism is widely available in many computations,
                 implementing data parallel algorithms in low-level
                 languages is often an unnecessarily difficult task. The
                 characteristics of parallel microprocessors and the
                 limitations of current programming methodologies
                 motivate our design of Copperhead, a high-level data
                 parallel language embedded in Python. The Copperhead
                 programmer describes parallel computations via
                 composition of familiar data parallel primitives
                 supporting both flat and nested data parallel
                 computation on arrays of data. Copperhead programs are
                 expressed in a subset of the widely used Python
                 programming language and interoperate with standard
                 Python modules, including libraries for numeric
                 computation, data visualization, and analysis. In this
                 paper, we discuss the language, compiler, and runtime
                 features that enable Copperhead to efficiently execute
                 data parallel code. We define the restricted subset of
                 Python which Copperhead supports and introduce the
                 program analysis techniques necessary for compiling
                 Copperhead code into efficient low-level
                 implementations. We also outline the runtime support by
                 which Copperhead programs interoperate with standard
                 Python modules. We demonstrate the effectiveness of our
                 techniques with several examples targeting the CUDA
                 platform for parallel programming on GPUs. Copperhead
                 code is concise, on average requiring 3.6 times fewer
                 lines of code than CUDA, and the compiler generates
                 efficient code, yielding 45--100\% of the performance of
                 hand-crafted, well optimized CUDA code.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Chalkidis:2011:HPH,
  author =       "Georgios Chalkidis and Masao Nagasaki and Satoru
                 Miyano",
  title =        "High Performance Hybrid Functional {Petri} Net
                 Simulations of Biological Pathway Models on {CUDA}",
  journal =      j-TCBB,
  volume =       "8",
  number =       "6",
  pages =        "1545--1556",
  month =        nov,
  year =         "2011",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2010.118",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Sun Nov 6 06:45:50 MST 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J954",
}

@Article{Czapinski:2011:TST,
  author =       "Micha{\l} Czapi{\'n}ski and Stuart Barnes",
  title =        "{Tabu Search} with two approaches to parallel flowshop
                 evaluation on {CUDA} platform",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "71",
  number =       "6",
  pages =        "802--811",
  month =        jun,
  year =         "2011",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1016/j.jpdc.2011.02.006",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Feb 25 09:11:32 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731511000384",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{delaAsuncion:2011:SOL,
  author =       "Marc de la Asunci{\'o}n and Jos{\'e} M. Mantas and
                 Manuel J. Castro",
  title =        "Simulation of one-layer shallow water systems on
                 multicore and {CUDA} architectures",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "58",
  number =       "2",
  pages =        "206--214",
  month =        nov,
  year =         "2011",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Tue Dec 13 15:25:06 MST 2011",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=58&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=58&issue=2&spage=206",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Dohi:2011:GIO,
  author =       "Keisuke Dohi and Yuichiro Shibata and Kiyoshi Oguri
                 and Takafumi Fujimoto",
  title =        "{GPU} implementation and optimization of
                 electromagnetic simulation using the {FDTD} method for
                 antenna designing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "26--31",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082163",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper describes electromagnetical field
                 simulation using the 3D-FDTD method for antenna
                 designing on a CUDA-compatible GPU. We use the Split
                 Perfectly Matched Layer as an absorbing boundary
                 condition. As is well known, the 3D-FDTD method is a
                 kind of stencil computation and is considered better at
                 GPU implementation. In order to find the best blocking
                 size for the target GPU architecture, we empirically
                 explore a design space of blocking size. We also
                 propose a kernel fusing method as one of the efficient
                 optimization methods, which improves the total
                 performance about 10\% at the cost of a small increase
                 in memory usage.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Dotsenko:2011:ATF,
  author =       "Yuri Dotsenko and Sara S. Baghsorkhi and Brandon Lloyd
                 and Naga K. Govindaraju",
  title =        "Auto-tuning of {Fast Fourier Transform} on graphics
                 processors",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "8",
  pages =        "257--266",
  month =        aug,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2038037.1941589",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 14:04:45 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '11 Conference proceedings.",
  abstract =     "We present an auto-tuning framework for FFTs on
                 graphics processors (GPUs). Due to complex design of
                 the memory and compute subsystems on GPUs, the
                 performance of FFT kernels over the range of possible
                 input parameters can vary widely. We generate several
                 variants for each component of the FFT kernel that, for
                 different cases, are likely to perform well. Our
                 auto-tuner composes variants to generate kernels and
                 selects the best ones. We present heuristics to prune
                 the search space and profile only a small fraction of
                 all possible kernels. We compose optimized kernels to
                 improve the performance of larger FFT computations. We
                 implement the system using the NVIDIA CUDA API and
                 compare its performance to the state-of-the-art FFT
                 libraries. On a range of NVIDIA GPUs and input sizes,
                 our auto-tuned FFTs outperform the NVIDIA CUFFT 3.0
                 library by up to 38x and deliver up to 3x higher
                 performance compared to a manually-tuned FFT.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Ewedafe:2011:PID,
  author =       "Simon Uzezi Ewedafe and Rio Hirowati Shariffudin",
  title =        "Parallel Implementation of {$2$-D} Telegraphic
                 Equation on {MPI\slash PVM} Cluster",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "39",
  number =       "2",
  pages =        "202--231",
  month =        apr,
  year =         "2011",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Tue Sep 6 21:08:27 MDT 2011",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=39&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=39&issue=2&spage=202",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Filgueira:2011:ACE,
  author =       "Rosa Filgueira and David E. Singh and Jes{\'u}s
                 Carretero and Alejandro Calder{\'o}n and F{\'e}lix
                 Garc{\'\i}a",
  title =        "{Adaptive-CoMPI}: Enhancing {MPI}-Based Applications'
                 Performance and Scalability by using Adaptive
                 Compression",
  journal =      j-IJHPCA,
  volume =       "25",
  number =       "1",
  pages =        "93--114",
  month =        feb,
  year =         "2011",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342010373486",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Sep 6 15:14:36 MDT 2011",
  bibsource =    "http://hpc.sagepub.com/content/25/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/25/1/93.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
  onlinedate =   "26 July 2010",
}

%%% Two-page article in ACM SIGARCH Computer Architecture News 39(4) on
%%% source-to-source fusion of CUDA kernels.  Author names carry their
%%% Czech/Slovak diacritics as BibTeX special characters.
@Article{Fousek:2011:AFC,
  author =       "Jan Fousek and Ji{\v{r}}{\'\i} Filipovi{\v{c}} and
                 Mat{\'u}{\v{s}} Madzin",
  title =        "Automatic fusions of {CUDA--GPU} kernels for parallel
                 map",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "98--99",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082183",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "When implementing a function mapping on the
                 contemporary GPU, several contradictory performance
                 factors affecting distribution of computation into GPU
                 kernels have to be balanced. A decomposition-fusion
                 scheme suggests to decompose the computational problem
                 to be solved by several simple functions implemented as
                 standalone kernels and to fuse some of these functions
                 later into more complex kernels to improve memory
                 locality. In this paper, a prototype of
                 source-to-source compiler automating the fusion phase
                 is presented and the impact of fusions generated by the
                 compiler as well as compiler efficiency is
                 experimentally evaluated.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% Article in ACM SIGPLAN Notices 46(6) on the Kremlin parallelism
%%% planner.  The abstract separates its three paragraphs with an
%%% explicit \par followed by a blank line.
@Article{Garcia:2011:KRR,
  author =       "Saturnino Garcia and Donghwan Jeon and Christopher M.
                 Louie and Michael Bedford Taylor",
  title =        "{Kremlin}: rethinking and rebooting {{\tt gprof}} for
                 the multicore age",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "6",
  pages =        "458--469",
  month =        jun,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1993316.1993553",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Jun 9 10:23:33 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Many recent parallelization tools lower the barrier
                 for parallelizing a program, but overlook one of the
                 first questions that a programmer needs to answer:
                 which parts of the program should I spend time
                 parallelizing?\par

                 This paper examines Kremlin, an automatic tool that,
                 given a serial version of a program, will make
                 recommendations to the user as to what regions (e.g.
                 loops or functions) of the program to attack first.
                 Kremlin introduces a novel hierarchical critical path
                 analysis and develops a new metric for estimating the
                 potential of parallelizing a region: self-parallelism.
                 We further introduce the concept of a parallelism
                 planner, which provides a ranked order of specific
                 regions to the programmer that are likely to have the
                 largest performance impact when parallelized. Kremlin
                 supports multiple planner personalities, which allow
                 the planner to more effectively target a particular
                 programming environment or class of machine.\par

                 We demonstrate the effectiveness of one such
                 personality, an OpenMP planner, by comparing versions
                 of programs that are parallelized according to
                 Kremlin's plan against third-party manually
                 parallelized versions. The results show that Kremlin's
                 OpenMP planner is highly effective, producing plans
                 whose performance is typically comparable to, and
                 sometimes much better than, manual parallelization. At
                 the same time, these plans would require that the user
                 parallelize significantly fewer regions of the
                 program.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

%%% Article in Communications of the ACM 54(12) on formal analysis of
%%% MPI programs.  The particle in "de Supinski" is lowercase so that
%%% BibTeX parses it as the von part of the surname, not a given name.
@Article{Gopalakrishnan:2011:FAM,
  author =       "Ganesh Gopalakrishnan and Robert M. Kirby and Stephen
                 Siegel and Rajeev Thakur and William Gropp and Ewing
                 Lusk and Bronis R. de Supinski and Martin Schulz and
                 Greg Bronevetsky",
  title =        "Formal analysis of {MPI}-based parallel programs",
  journal =      j-CACM,
  volume =       "54",
  number =       "12",
  pages =        "82--91",
  month =        dec,
  year =         "2011",
  CODEN =        "CACMA2",
  DOI =          "https://doi.org/10.1145/2043174.2043194",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Tue Nov 29 11:53:53 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/cacm/;
                 http://www.math.utah.edu/pub/tex/bib/cacm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Communications of the ACM",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J79",
}

%%% Book entry.  In tableofcontents every item ends with \\ and each
%%% chapter group is separated from the next by an empty item (a line
%%% holding only \\).  When an item wraps, its own terminating \\ lands
%%% alone on the following line and must not be mistaken for the
%%% group separator.
@Book{Hager:2011:IHP,
  author =       "Georg Hager and Gerhard Wellein",
  title =        "Introduction to high performance computing for
                 scientists and engineers",
  volume =       "7",
  publisher =    pub-CRC,
  address =      pub-CRC:adr,
  pages =        "xxv + 330 + 4",
  year =         "2011",
  ISBN =         "1-4398-1192-X",
  ISBN-13 =      "978-1-4398-1192-4",
  LCCN =         "QA76.88 .H34 2011",
  bibdate =      "Wed Sep 15 13:26:35 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  series =       "Chapman and Hall/CRC computational science series",
  acknowledgement = ack-nhfb,
  subject =      "high performance computing",
  tableofcontents = "Modern Processors \\
                 Stored-program computer architecture \\
                 General-purpose cache-based microprocessor architecture
                 \\
                 Memory hierarchies \\
                 Multicore processors \\
                 Multithreaded processors \\
                 Vector processors \\
                 \\
                 Basic Optimization Techniques for Serial Code \\
                 Scalar profiling \\
                 Common sense optimizations \\
                 Simple measures, large impact \\
                 The role of compilers \\
                 C++ optimizations \\
                 \\
                 Data Access Optimization \\
                 Balance analysis and lightspeed estimates \\
                 Storage order \\
                 Case study: The Jacobi algorithm \\
                 Case study: Dense matrix transpose \\
                 Algorithm classification and access optimizations \\
                 Case study: Sparse matrix-vector multiply \\
                 \\
                 Parallel Computers \\
                 Taxonomy of parallel computing paradigms \\
                 Shared-memory computers \\
                 Distributed-memory computers \\
                 Hierarchical (hybrid) systems \\
                 Networks \\
                 \\
                 Basics of Parallelization \\
                 Why parallelize? \\
                 Parallelism \\
                 Parallel scalability \\
                 \\
                 Shared-Memory Parallel Programming with OpenMP \\
                 Short introduction to OpenMP \\
                 Case study: OpenMP-parallel Jacobi algorithm \\
                 Advanced OpenMP: Wavefront parallelization \\
                 \\
                 Efficient OpenMP Programming \\
                 Profiling OpenMP programs \\
                 Performance pitfalls \\
                 Case study: Parallel sparse matrix-vector multiply \\
                 \\
                 Locality Optimizations on ccNUMA Architectures \\
                 Locality of access on ccNUMA \\
                 Case study: ccNUMA optimization of sparse MVM \\
                 Placement pitfalls \\
                 ccNUMA issues with C++ \\
                 \\
                 Distributed-Memory Parallel Programming with MPI \\
                 Message passing \\
                 A short introduction to MPI \\
                 Example: MPI parallelization of a Jacobi solver \\
                 \\
                 Efficient MPI Programming \\
                 MPI performance tools \\
                 Communication parameters \\
                 Synchronization, serialization, contention \\
                 Reducing communication overhead \\
                 Understanding intranode point-to-point communication
                 \\
                 \\
                 Hybrid Parallelization with MPI and OpenMP \\
                 Basic MPI/OpenMP programming models \\
                 MPI taxonomy of thread interoperability \\
                 Hybrid decomposition and mapping \\
                 Potential benefits and drawbacks of hybrid programming
                 \\
                 \\
                 Appendix A: Topology and Affinity in Multicore
                 Environments \\
                 Appendix B: Solutions to the Problems \\
                 \\
                 Bibliography \\
                 \\
                 Index",
}

%%% Article in IEEE Transactions on Parallel and Distributed Systems
%%% 22(1) on hiCUDA.  NOTE(review): bibdate carries no time-zone token,
%%% unlike most neighbouring entries.
@Article{Han:2011:HHL,
  author =       "Tianyi David Han and Tarek S. Abdelrahman",
  title =        "{hiCUDA}: High-Level {GPGPU} Programming",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "22",
  number =       "1",
  pages =        "78--90",
  month =        jan,
  year =         "2011",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2010.62",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Feb 25 14:08:57 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

%%% Article in Computer Physics Communications 182(4) on Swan, a
%%% CUDA-to-OpenCL porting tool.  The second author's two-word surname
%%% is braced so BibTeX keeps "De Fabritiis" together as the last name.
@Article{Harvey:2011:STP,
  author =       "M. J. Harvey and G. {De Fabritiis}",
  title =        "{Swan}: a tool for porting {CUDA} programs to
                 {OpenCL}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "182",
  number =       "4",
  pages =        "1093--1099",
  month =        apr,
  year =         "2011",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2010.12.052",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Feb 11 10:10:57 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465511000117",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

%%% Article in Concurrency and Computation: Practice and Experience
%%% 23(10) on hypercubic storage layouts with GPUs and CUDA.
%%% NOTE(review): onlinedate is identical to that of Hoefler:2011:SPT;
%%% confirm against the publisher record.
@Article{Hawick:2011:HSL,
  author =       "K. A. Hawick and D. P. Playne",
  title =        "Hypercubic storage layout and transforms in arbitrary
                 dimensions using {GPUs} and {CUDA}",
  journal =      j-CCPE,
  volume =       "23",
  number =       "10",
  pages =        "1027--1050",
  month =        jul,
  year =         "2011",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1628",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:56 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "28 Aug 2010",
}

%%% Article in International Journal of Parallel Programming 39(2) on
%%% spin-model simulations with CUDA.  NOTE(review): the entry has no
%%% DOI field; its URL is an OpenURL resolver query, not a direct link.
@Article{Hawick:2011:RLS,
  author =       "K. A. Hawick and A. Leist and D. P. Playne",
  title =        "Regular Lattice and Small-World Spin Model Simulations
                 Using {CUDA} and {GPUs}",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "39",
  number =       "2",
  pages =        "183--201",
  month =        apr,
  year =         "2011",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Tue Sep 6 21:08:27 MDT 2011",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=39&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=39&issue=2&spage=183",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

%%% Article in Computer Physics Communications 182(11) on QSATS, an
%%% MPI-driven quantum simulation code.
@Article{Hinde:2011:QMD,
  author =       "Robert J. Hinde",
  title =        "{QSATS}: {MPI}-driven quantum simulations of atomic
                 solids at zero temperature",
  journal =      j-COMP-PHYS-COMM,
  volume =       "182",
  number =       "11",
  pages =        "2339--2349",
  month =        nov,
  year =         "2011",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2011.04.024",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Feb 11 10:11:00 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465511001615",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

%%% Article in Concurrency and Computation: Practice and Experience
%%% 23(4) on the MPI 2.2 process topology interface.  The lowercase
%%% particle in "de Supinski" makes BibTeX treat it as the von part.
@Article{Hoefler:2011:SPT,
  author =       "Torsten Hoefler and Rolf Rabenseifner and Hubert
                 Ritzdorf and Bronis R. de Supinski and Rajeev Thakur
                 and Jesper Larsson Tr{\"a}ff",
  title =        "The scalable process topology interface of {MPI 2.2}",
  journal =      j-CCPE,
  volume =       "23",
  number =       "4",
  pages =        "293--310",
  day =          "25",
  month =        mar,
  year =         "2011",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1643",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:53 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "28 Aug 2010",
}

%%% Article in ACM SIGPLAN Notices 46(8) on warp-centric CUDA graph
%%% algorithms; the note field records that the issue is the PPoPP '11
%%% conference proceedings.
@Article{Hong:2011:ACG,
  author =       "Sungpack Hong and Sang Kyun Kim and Tayo Oguntebi and
                 Kunle Olukotun",
  title =        "Accelerating {CUDA} graph algorithms at maximum warp",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "8",
  pages =        "267--276",
  month =        aug,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2038037.1941590",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 14:04:45 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '11 Conference proceedings.",
  abstract =     "Graphs are powerful data representations favored in
                 many computational domains. Modern GPUs have recently
                 shown promising results in accelerating computationally
                 challenging graph problems but their performance
                 suffered heavily when the graph structure is highly
                 irregular, as most real-world graphs tend to be. In
                 this study, we first observe that the poor performance
                 is caused by work imbalance and is an artifact of a
                 discrepancy between the GPU programming model and the
                 underlying GPU architecture. We then propose a novel
                 virtual warp-centric programming method that exposes
                 the traits of underlying GPU architectures to users.
                 Our method significantly improves the performance of
                 applications with heavily imbalanced workloads, and
                 enables trade-offs between workload imbalance and ALU
                 underutilization for fine-tuning the performance. Our
                 evaluation reveals that our method exhibits up to 9x
                 speedup over previous GPU algorithms and 12x over
                 single thread CPU execution on irregular graphs. When
                 properly configured, it also yields up to 30\%
                 improvement over previous GPU algorithms on regular
                 graphs. In addition to performance gains on graph
                 algorithms, our programming method achieves 1.3x to
                 15.1x speedup on a set of GPU benchmark applications.
                 Our study also confirms that the performance gap
                 between GPUs and other multi-threaded CPU graph
                 implementations is primarily due to the large
                 difference in memory bandwidth.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

%%% Conference paper on hybrid MPI--OpenMP performance in CFD.  It has
%%% no booktitle, publisher, or editor of its own and points to the
%%% parent entry Tromeur-Dervout:2011:PCF through crossref; book-DOI
%%% and book-URL identify the containing volume.
@InProceedings{Houzeaux:2011:HMO,
  author =       "G. Houzeaux and M. V{\'a}zquez and X. S{\'a}ez and J.
                 M. Cela",
  title =        "Hybrid {MPI--OpenMP} performance in massively parallel
                 computational fluid dynamics",
  crossref =     "Tromeur-Dervout:2011:PCF",
  volume =       "74",
  pages =        "293--297",
  year =         "2011",
  DOI =          "https://doi.org/10.1007/978-3-642-14438-7_31",
  bibdate =      "Sat Dec 22 08:34:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/content/pdf/10.1007/978-3-642-14438-7_31",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-14438-7",
  book-URL =     "http://www.springerlink.com/content/978-3-642-14438-7",
}

%%% Article in International Journal of Parallel Programming 39(6) on
%%% an OpenMP implementation of an ALE moving-mesh technique.
%%% NOTE(review): the entry has no DOI field; its URL is an OpenURL
%%% resolver query, not a direct link.
@Article{Hussain:2011:PIA,
  author =       "Masroor Hussain and Muhammad Abid and Mushtaq Ahmad
                 and Ashfaq Khokhar and Arif Masud",
  title =        "A Parallel Implementation of {ALE} Moving Mesh
                 Technique for {FSI} Problems using {OpenMP}",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "39",
  number =       "6",
  pages =        "717--745",
  month =        dec,
  year =         "2011",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Tue Sep 6 21:08:54 MDT 2011",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=39&issue=6;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=39&issue=6&spage=717",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

%%% Article in Parallel Computing 37(9) on MPI and OpenMP on
%%% multi-core parallel systems.
@Article{Jin:2011:HPC,
  author =       "Haoqiang Jin and Dennis Jespersen and Piyush Mehrotra
                 and Rupak Biswas and Lei Huang and Barbara Chapman",
  title =        "High performance computing using {MPI} and {OpenMP} on
                 multi-core parallel systems",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "37",
  number =       "9",
  pages =        "562--575",
  month =        sep,
  year =         "2011",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2011.02.002",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Feb 4 15:17:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819111000159",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

%%% Article in Journal of Parallel and Distributed Computing 71(4) on
%%% connected component labeling with CUDA.  NOTE(review): the entry
%%% has neither a DOI nor an article-level URL field.
@Article{Kalentev:2011:CCL,
  author =       "Oleksandr Kalentev and Abha Rai and Stefan Kemnitz and
                 Ralf Schneider",
  title =        "Connected component labeling on a {$2$D} grid using
                 {CUDA}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "71",
  number =       "4",
  pages =        "615--620",
  month =        apr,
  year =         "2011",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Feb 25 19:11:50 MST 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

%%% Article in ACM SIGPLAN Notices 46(8) on presenting multiple GPUs as
%%% a single OpenCL compute device; the note field records that the
%%% issue is the PPoPP '11 conference proceedings.
@Article{Kim:2011:ASC,
  author =       "Jungwon Kim and Honggyu Kim and Joo Hwan Lee and
                 Jaejin Lee",
  title =        "Achieving a single compute device image in {OpenCL}
                 for multiple {GPUs}",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "8",
  pages =        "277--288",
  month =        aug,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2038037.1941591",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 14:04:45 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '11 Conference proceedings.",
  abstract =     "In this paper, we propose an OpenCL framework that
                 combines multiple GPUs and treats them as a single
                 compute device. Providing a single virtual compute
                 device image to the user makes an OpenCL application
                 written for a single GPU portable to the platform that
                 has multiple GPU devices. It also makes the application
                 exploit full computing power of the multiple GPU
                 devices and the total amount of GPU memories available
                 in the platform. Our OpenCL framework automatically
                 distributes at run-time the OpenCL kernel written for a
                 single GPU into multiple CUDA kernels that execute on
                 the multiple GPU devices. It applies a run-time memory
                 access range analysis to the kernel by performing a
                 sampling run and identifies an optimal workload
                 distribution for the kernel. To achieve a single
                 compute device image, the runtime maintains virtual
                 device memory that is allocated in the main memory. The
                 OpenCL runtime treats the memory as if it were the
                 memory of a single GPU device and keeps it consistent
                 to the memories of the multiple GPU devices. Our
                 OpenCL-C-to-C translator generates the sampling code
                 from the OpenCL kernel code and OpenCL-C-to-CUDA-C
                 translator generates the CUDA kernel code for the
                 distributed OpenCL kernel. We show the effectiveness of
                 our OpenCL framework by implementing the OpenCL runtime
                 and two source-to-source translators. We evaluate its
                 performance with a system that contains 8 GPUs using 11
                 OpenCL benchmark applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

%%% Article in Concurrency and Computation: Practice and Experience
%%% 23(18) on an integer count sort for CUDA GPUs.  The issue date
%%% (25 December 2011) is later than the recorded bibdate.
@Article{Kolonias:2011:DIE,
  author =       "Vasileios Kolonias and Artemios G. Voyiatzis and
                 George Goulas and Efthymios Housos",
  title =        "Design and implementation of an efficient integer
                 count sort in {CUDA GPUs}",
  journal =      j-CCPE,
  volume =       "23",
  number =       "18",
  pages =        "2365--2381",
  day =          "25",
  month =        dec,
  year =         "2011",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1776",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:09:01 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "22 Jun 2011",
}

%%% Article in Science of Computer Programming 76(2) on a formal
%%% specification of MPI 2.0.  NOTE(review): the entry has neither a
%%% DOI nor an article-level URL field.
@Article{Li:2011:FSM,
  author =       "Guodong Li and Robert Palmer and Michael DeLisi and
                 Ganesh Gopalakrishnan and Robert M. Kirby",
  title =        "Formal specification of {MPI 2.0}: {Case} study in
                 specifying a practical concurrent programming {API}",
  journal =      j-SCI-COMPUT-PROGRAM,
  volume =       "76",
  number =       "2",
  pages =        "65--81",
  day =          "1",
  month =        feb,
  year =         "2011",
  CODEN =        "SCPGD4",
  ISSN =         "0167-6423 (print), 1872-7964 (electronic)",
  ISSN-L =       "0167-6423",
  bibdate =      "Fri Apr 1 18:39:40 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/01676423",
  acknowledgement = ack-nhfb,
  fjournal =     "Science of Computer Programming",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01676423",
}

%%% Article in IEEE Transactions on Parallel and Distributed Systems
%%% 22(2) on MPI file domain partitioning.  NOTE(review): bibdate
%%% carries no time-zone token, unlike most neighbouring entries.
@Article{Liao:2011:DEM,
  author =       "Wei-keng Liao",
  title =        "Design and Evaluation of {MPI} File Domain
                 Partitioning Methods under Extent-Based File Locking
                 Protocol",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "22",
  number =       "2",
  pages =        "260--272",
  month =        feb,
  year =         "2011",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2010.74",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Feb 25 14:08:57 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

%%% Article in Parallel Computing 37(10--11) on CPU scaling driven by
%%% MPI communication regions.  For this double issue the month value
%%% concatenates the oct and nov macros with a \slash separator.
@Article{Lim:2011:ATC,
  author =       "Min Yeol Lim and Vincent W. Freeh and David K.
                 Lowenthal",
  title =        "Adaptive, transparent {CPU} scaling algorithms
                 leveraging inter-node {MPI} communication regions",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "37",
  number =       "10--11",
  pages =        "667--683",
  month =        oct # "\slash " # nov,
  year =         "2011",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2011.07.001",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Feb 4 15:17:36 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819111000871",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

%%% Article in IEEE/ACM Transactions on Computational Biology and
%%% Bioinformatics 8(6) on CUDA-BLASTP.  The umlaut in the third
%%% author's surname is written as a BibTeX special character.
@Article{Liu:2011:CBA,
  author =       "Weiguo Liu and Bertil Schmidt and Wolfgang
                 M{\"u}ller-Wittig",
  title =        "{CUDA-BLASTP}: Accelerating {BLASTP} on {CUDA}-Enabled
                 Graphics Hardware",
  journal =      j-TCBB,
  volume =       "8",
  number =       "6",
  pages =        "1678--1684",
  month =        nov,
  year =         "2011",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2011.33",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Sun Nov 6 06:45:50 MST 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J954",
}

%%% Article in Journal of Computational and Applied Mathematics 236(3)
%%% on pipelined parallel direct solvers with OpenMP.  NOTE(review):
%%% the entry has a URL but no DOI field.
@Article{Michailidis:2011:PDM,
  author =       "Panagiotis D. Michailidis and Konstantinos G.
                 Margaritis",
  title =        "Parallel direct methods for solving the system of
                 linear equations with pipelining on a multicore using
                 {OpenMP}",
  journal =      j-J-COMPUT-APPL-MATH,
  volume =       "236",
  number =       "3",
  pages =        "326--341",
  day =          "1",
  month =        sep,
  year =         "2011",
  CODEN =        "JCAMDI",
  ISSN =         "0377-0427 (print), 1879-1778 (electronic)",
  ISSN-L =       "0377-0427",
  bibdate =      "Sat Feb 25 13:24:37 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcomputapplmath2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0377042711004183",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational and Applied Mathematics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03770427",
}

%%% Article in Parallel Computing 37(6--7) on a hybrid MPI--OpenMP
%%% pseudospectral scheme.  For this double issue the month value
%%% concatenates the jun and jul macros with a \slash separator.
@Article{Mininni:2011:HMO,
  author =       "Pablo D. Mininni and Duane Rosenberg and Raghu Reddy
                 and Annick Pouquet",
  title =        "A hybrid {MPI--OpenMP} scheme for scalable parallel
                 pseudospectral computations for fluid turbulence",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "37",
  number =       "6--7",
  pages =        "316--326",
  month =        jun # "\slash " # jul,
  year =         "2011",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2011.05.004",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Feb 4 15:17:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819111000512",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Mokbel:2011:ASR,
  author =       "Mohammed F. Mokbel and Robert D. Kent and Michael
                 Wong",
  title =        "An Abstract Semantically Rich Compiler Collocative and
                 Interpretative Model for {OpenMP} Programs",
  journal =      j-COMP-J,
  volume =       "54",
  number =       "8",
  pages =        "1325--1343",
  month =        aug,
  year =         "2011",
  CODEN =        "CMPJA6",
  DOI =          "https://doi.org/10.1093/comjnl/bxr029",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Wed Aug 17 16:34:11 MDT 2011",
  bibsource =    "http://comjnl.oxfordjournals.org/content/54/8.toc;
                 http://www.math.utah.edu/pub/tex/bib/compj2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://comjnl.oxfordjournals.org/content/54/8/1325.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
  onlinedate =   "April 5, 2011",
}

@Article{Pennycook:2011:PAH,
  author =       "S. J. Pennycook and S. D. Hammond and S. A. Jarvis and
                 G. R. Mudalige",
  title =        "Performance analysis of a hybrid {MPI\slash CUDA}
                 implementation of the {NASLU} benchmark",
  journal =      j-SIGMETRICS,
  volume =       "38",
  number =       "4",
  pages =        "23--29",
  month =        mar,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1964218.1964223",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Apr 1 23:02:55 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
  remark =       "Special issue on the 1st international workshop on
                 performance modeling, benchmarking and simulation of
                 high performance computing systems (PMBS 10).",
}

@Article{Peters:2011:FPC,
  author =       "Hagen Peters and Ole Schulz-Hildebrandt and Norbert
                 Luttenberger",
  title =        "Fast in-place, comparison-based sorting with {CUDA}: a
                 study with bitonic sort",
  journal =      j-CCPE,
  volume =       "23",
  number =       "7",
  pages =        "681--693",
  month =        may,
  year =         "2011",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1686",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:55 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "14 Jan 2011",
}

@Article{Plimpton:2011:MML,
  author =       "Steven J. Plimpton and Karen D. Devine",
  title =        "{MapReduce} in {MPI} for large-scale graph
                 algorithms",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "37",
  number =       "9",
  pages =        "610--632",
  month =        sep,
  year =         "2011",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2011.02.004",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Feb 4 15:17:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819111000172",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Siegel:2011:AFV,
  author =       "Stephen F. Siegel and Timothy K. Zirkel",
  title =        "Automatic formal verification of {MPI}-based parallel
                 programs",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "8",
  pages =        "309--310",
  month =        aug,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2038037.1941603",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 14:04:45 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '11 Conference proceedings.",
  abstract =     "The Toolkit for Accurate Scientific Software (TASS) is
                 a suite of tools for the formal verification of
                 MPI-based parallel programs used in computational
                 science. TASS can verify various safety properties as
                 well as compare two programs for functional
                 equivalence. The TASS front end takes an integer $ n
                 \geq 1 $ and a C/MPI program, and constructs an
                 abstract model of the program with $n$ processes.
                 Procedures, structs, (multi-dimensional) arrays,
                 heap-allocated data, pointers, and pointer arithmetic
                 are all representable in a TASS model. The model is
                 then explored using symbolic execution and explicit
                 state space enumeration. A number of techniques are
                 used to reduce the time and memory consumed. A variety
                 of realistic MPI programs have been verified with TASS,
                 including Jacobi iteration and manager-worker type
                 programs, and some subtle defects have been discovered.
                 TASS is written in Java and is available from
                 \path=http://vsl.cis.udel.edu/tass= under the Gnu
                 Public License.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Sintorn:2011:EAF,
  author =       "Erik Sintorn and Ola Olsson and Ulf Assarsson",
  title =        "An efficient alias-free shadow algorithm for opaque
                 and transparent objects using per-triangle shadow
                 volumes",
  journal =      j-TOG,
  volume =       "30",
  number =       "6",
  pages =        "153:1--153:??",
  month =        dec,
  year =         "2011",
  CODEN =        "ATGRDF",
  DOI =          "https://doi.org/10.1145/2070781.2024187",
  ISSN =         "0730-0301 (print), 1557-7368 (electronic)",
  ISSN-L =       "0730-0301",
  bibdate =      "Mon Dec 19 15:59:18 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tog/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tog.bib",
  abstract =     "This paper presents a novel method for generating
                 pixel-accurate shadows from point light-sources in
                 real-time. The new method is able to quickly cull
                 pixels that are not in shadow and to trivially accept
                 large chunks of pixels thanks mainly to using the whole
                 triangle shadow volume as a primitive, instead of
                 rendering the shadow quads independently as in the
                 classic Shadow-Volume algorithm. Our CUDA
                 implementation outperforms z-fail consistently and
                 surpasses z-pass at high resolutions, although these
                 latter two are hardware accelerated, while inheriting
                 none of the robustness issues associated with these
                 methods. Another, perhaps even more important property
                 of our algorithm, is that it requires no pre-processing
                 or identification of silhouette edges and so robustly
                 and efficiently handles arbitrary triangle soups.",
  acknowledgement = ack-nhfb,
  articleno =    "153",
  fjournal =     "ACM Transactions on Graphics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J778",
}

@InProceedings{Smelyanskiy:2011:HPL,
  author =       "Mikhail Smelyanskiy and Karthikeyan Vaidyanathan and
                 Jee Choi and B{\'a}lint Jo{\'o} and Jatin Chhugani and
                 Michael A. Clark and Pradeep Dubey",
  title =        "High-performance lattice {QCD} for multi-core based
                 parallel systems using a cache-friendly hybrid
                 threaded-{MPI} approach",
  crossref =     "Lathrop:2011:SPI",
  pages =        "69:1--69:11",
  year =         "2011",
  DOI =          "https://doi.org/10.1145/2063384.2063477",
  bibdate =      "Fri Dec 16 11:05:47 MST 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib",
  acknowledgement = ack-nhfb,
  articleno =    "69",
}

@Article{Stpiczynski:2011:SKB,
  author =       "Przemys{\l}aw Stpiczy{\'n}ski and Joanna Potiopa",
  title =        "Solving a kind of boundary-value problem for ordinary
                 differential equations using {Fermi} --- The next
                 generation {CUDA} computing architecture",
  journal =      j-J-COMPUT-APPL-MATH,
  volume =       "236",
  number =       "3",
  pages =        "384--393",
  day =          "1",
  month =        sep,
  year =         "2011",
  CODEN =        "JCAMDI",
  ISSN =         "0377-0427 (print), 1879-1778 (electronic)",
  ISSN-L =       "0377-0427",
  bibdate =      "Sat Feb 25 13:24:37 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcomputapplmath2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0377042711004237",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational and Applied Mathematics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03770427",
}

@Article{Szalay:2011:FCD,
  author =       "Zs{\'o}fia Szalay and J{\'a}nos Rohonczy",
  title =        "Fast calculation of {DNMR} spectra on {CUDA}-enabled
                 graphics card",
  journal =      j-J-COMPUT-CHEM,
  volume =       "32",
  number =       "7",
  pages =        "1262--1270",
  month =        may,
  year =         "2011",
  CODEN =        "JCCHDD",
  DOI =          "https://doi.org/10.1002/jcc.21706",
  ISSN =         "0192-8651 (print), 1096-987X (electronic)",
  ISSN-L =       "0192-8651",
  bibdate =      "Thu Nov 29 14:55:32 MST 2012",
  bibsource =    "http://www.interscience.wiley.com/jpages/0192-8651;
                 http://www.math.utah.edu/pub/tex/bib/jcomputchem2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Chemistry",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1096-987X",
  onlinedate =   "29 Nov 2010",
}

@Article{vanderLaan:2011:AWL,
  author =       "Wladimir J. van der Laan and Andrei C. Jalba and Jos
                 B. T. M. Roerdink",
  title =        "Accelerating Wavelet Lifting on Graphics Hardware
                 Using {CUDA}",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "22",
  number =       "1",
  pages =        "132--146",
  month =        jan,
  year =         "2011",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2010.143",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Feb 25 14:08:57 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Wittenbrink:2011:FGG,
  author =       "Craig M. Wittenbrink and Emmett Kilgariff and Arjun
                 Prabhu",
  title =        "{Fermi GF100 GPU} Architecture",
  journal =      j-IEEE-MICRO,
  volume =       "31",
  number =       "2",
  pages =        "50--59",
  month =        mar # "\slash " # apr,
  year =         "2011",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2011.24",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Tue Apr 26 13:50:28 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "The Fermi GF100 is a GPU architecture that provides
                 several new capabilities beyond the Nvidia GT200 or
                 Tesla architecture. The Fermi architecture offers up to
                 512 CUDA cores and special features for gaming and
                 high-performance computing. This article describes the
                 GPU's new capabilities for tessellation, physics
                 processing, and computational graphics.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
  keywords =     "Hot Chips 22 conference proceedings",
}

@Article{Wong:2011:EMS,
  author =       "Hon-Cheng Wong and Un-Hong Wong and Xueshang Feng and
                 Zesheng Tang",
  title =        "Efficient magnetohydrodynamic simulations on graphics
                 processing units with {CUDA}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "182",
  number =       "10",
  pages =        "2132--2160",
  month =        oct,
  year =         "2011",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2011.05.011",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Feb 11 10:11:00 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465511001676",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Wu:2011:PCH,
  author =       "Xingfu Wu and Valerie Taylor",
  title =        "Performance characteristics of hybrid {MPI\slash
                 OpenMP} implementations of {NAS} parallel benchmarks
                 {SP} and {BT} on large-scale multicore supercomputers",
  journal =      j-SIGMETRICS,
  volume =       "38",
  number =       "4",
  pages =        "56--62",
  month =        mar,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1964218.1964228",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Apr 1 23:02:55 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
  remark =       "Special issue on the 1st international workshop on
                 performance modeling, benchmarking and simulation of
                 high performance computing systems (PMBS 10).",
}

@Article{Yang:2011:HCO,
  author =       "Chao-Tung Yang and Chih-Lin Huang and Cheng-Fang Lin",
  title =        "Hybrid {CUDA}, {OpenMP}, and {MPI} parallel
                 programming on multicore {GPU} clusters",
  journal =      j-COMP-PHYS-COMM,
  volume =       "182",
  number =       "1",
  pages =        "266--269",
  month =        jan,
  year =         "2011",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2010.06.035",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Feb 11 10:10:55 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465510002262",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Yang:2011:PBP,
  author =       "Chao-Tung Yang and Chao-Chin Wu and Jen-Hsiang Chang",
  title =        "Performance-based parallel loop self-scheduling using
                 hybrid {OpenMP} and {MPI} programming on multicore
                 {SMP} clusters",
  journal =      j-CCPE,
  volume =       "23",
  number =       "8",
  pages =        "721--744",
  day =          "10",
  month =        jun,
  year =         "2011",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1627",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:55 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "26 Sep 2010",
}

@Article{Yilmaz:2011:RMS,
  author =       "Erdal Yilmaz and Eray Molla and Cansin Yildiz and
                 Veysi Isler",
  title =        "Realistic modeling of spectator behavior for soccer
                 videogames with {CUDA}",
  journal =      j-COMPUTERS-AND-GRAPHICS,
  volume =       "35",
  number =       "6",
  pages =        "1063--1069",
  month =        dec,
  year =         "2011",
  CODEN =        "COGRD2",
  DOI =          "https://doi.org/10.1016/j.cag.2011.10.001",
  ISSN =         "0097-8493 (print), 1873-7684 (electronic)",
  ISSN-L =       "0097-8493",
  bibdate =      "Mon Feb 13 16:42:03 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compgraph.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/00978493",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0097849311001476",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers \& Graphics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00978493",
}

@InProceedings{Zhai:2011:CVH,
  author =       "Yan Zhai and Mingliang Liu and Jidong Zhai and
                 Xiaosong Ma and Wenguang Chen",
  title =        "Cloud versus in-house cluster: evaluating {Amazon}
                 cluster compute instances for running {MPI}
                 applications",
  crossref =     "ACM:2011:SSP",
  pages =        "11:1--11:10",
  year =         "2011",
  DOI =          "https://doi.org/10.1145/2063348.2063363",
  bibdate =      "Fri Dec 16 11:19:26 MST 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  keywords =     "Amazon EC2 CCI; IB cluster (InfiniBand)",
}

@Article{Zheng:2011:GLO,
  author =       "Mai Zheng and Vignesh T. Ravi and Feng Qin and Gagan
                 Agrawal",
  title =        "{GRace}: a low-overhead mechanism for detecting data
                 races in {GPU} programs",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "8",
  pages =        "135--146",
  month =        aug,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2038037.1941574",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 14:04:45 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '11 Conference proceedings.",
  abstract =     "In recent years, GPUs have emerged as an extremely
                 cost-effective means for achieving high performance.
                 Many application developers, including those with no
                 prior parallel programming experience, are now trying
                 to scale their applications using GPUs. While languages
                 like CUDA and OpenCL have eased GPU programming for
                 non-graphical applications, they are still explicitly
                 parallel languages. All parallel programmers,
                 particularly the novices, need tools that can help
                 ensuring the correctness of their programs. Like any
                 multithreaded environment, data races on GPUs can
                 severely affect the program reliability. Thus, tool
                 support for detecting race conditions can significantly
                 benefit GPU application developers. Existing approaches
                 for detecting data races on CPUs or GPUs have one or
                 more of the following limitations: (1) being ill-suited
                 for handling non-lock synchronization primitives on
                 GPUs; (2) lacking of scalability due to the state
                 explosion problem; (3) reporting many false positives
                 because of simplified modeling; and/or (4) incurring
                 prohibitive runtime and space overhead. In this paper,
                 we propose GRace, a new mechanism for detecting races
                 in GPU programs that combines static analysis with a
                 carefully designed dynamic checker for logging and
                 analyzing information at runtime. Our design utilizes
                 GPUs memory hierarchy to log runtime data accesses
                 efficiently. To improve the performance, GRace
                 leverages static analysis to reduce the number of
                 statements that need to be instrumented. Additionally,
                 by exploiting the knowledge of thread scheduling and
                 the execution model in the underlying GPUs, GRace can
                 accurately detect data races with no false positives
                 reported. Based on the above idea, we have built a
                 prototype of GRace with two schemes, i.e., GRace-stmt
                 and GRace-addr, for NVIDIA GPUs. Both schemes are
                 integrated with the same static analysis. We have
                 evaluated GRace-stmt and GRace-addr with three data
                 race bugs in three GPU kernel functions and also have
                 compared them with the existing approach, referred to
                 as B-tool. Our experimental results show that both
                 schemes of GRace are effective in detecting all
                 evaluated cases with no false positives, whereas B-tool
                 reports many false positives for one evaluated case. On
                 the one hand, GRace-addr incurs low runtime overhead,
                 i.e., 22--116\%, and low space overhead, i.e., 9--18MB,
                 for the evaluated kernels. On the other hand,
                 GRace-stmt offers more help in diagnosing data races
                 with larger overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Agathos:2012:TBE,
  author =       "Spiros N. Agathos and Panagiotis E. Hadjidoukas and
                 Vassilios V. Dimakopoulos",
  title =        "Task-Based Execution of Nested {OpenMP} Loops",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7312",
  pages =        "210--222",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-30961-8_16",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:19:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-30961-8_16/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-30961-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-30961-8",
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Amritkar:2012:OPF,
  author =       "Amit Amritkar and Danesh Tafti and Rui Liu and Rick
                 Kufrin and Barbara Chapman",
  title =        "{OpenMP} parallelism for fluid and fluid-particulate
                 systems",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "38",
  number =       "9",
  pages =        "501--517",
  month =        sep,
  year =         "2012",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2012.05.005",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Jul 30 14:28:54 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819112000476",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Andersch:2012:PPE,
  author =       "Michael Andersch and Chi Ching Chi and Ben Juurlink",
  title =        "Programming parallel embedded and consumer
                 applications in {OpenMP} superscalar",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "8",
  pages =        "281--282",
  month =        aug,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2370036.2145854",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Sep 12 12:11:57 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '12 conference proceedings.",
  abstract =     "In this paper, we evaluate the performance and
                 usability of the parallel programming model OpenMP
                 Superscalar (OmpSs), apply it to 10 different
                 benchmarks and compare its performance with
                 corresponding POSIX threads implementations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Misc{Anonymous:2012:CTC,
  author =       "Anonymous",
  title =        "{CUDA Toolkit 5.0 CURAND} Guide",
  howpublished = "Web document",
  year =         "2012",
  bibdate =      "Sat Feb 08 18:16:05 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://docs.nvidia.com/cuda/pdf/CURAND_Library.pdf",
  acknowledgement = ack-nhfb,
  keywords =     "random-number generator",
}

@Article{Baskaran:2012:ACO,
  author =       "Muthu Manikandan Baskaran and Nicolas Vasilache and
                 Benoit Meister and Richard Lethin",
  title =        "Automatic communication optimizations through memory
                 reuse strategies",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "8",
  pages =        "277--278",
  month =        aug,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2370036.2145852",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Sep 12 12:11:57 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '12 conference proceedings.",
  abstract =     "Modern parallel architectures are emerging with
                 sophisticated hardware consisting of hierarchically
                 placed parallel processors and memories. The properties
                 of memories in a system vary wildly, not only
                 quantitatively (size, latency, bandwidth, number of
                 banks) but also qualitatively (scratchpad, cache).
                 Along with the emergence of such architectures comes
                 the need for effectively utilizing the parallel
                 processors and properly managing data movement across
                 memories to improve memory bandwidth and hide data
                 transfer latency. In this paper, we describe some of
                 the high-level optimizations that are targeted at the
                 improvement of memory performance in the R-Stream
                 compiler, a high-level source-to-source automatic
                 parallelizing compiler. We direct our focus in this
                 paper on optimizing communications (data transfers) by
                 improving memory reuse at various levels of an explicit
                 memory hierarchy. This general concept is well-suited
                 to the hardware properties of GPGPUs, which is the
                 architecture that we concentrate on for this paper. We
                 apply our techniques and obtain performance improvement
                 on various stencil kernels including an important
                 iterative stencil kernel in seismic processing
                 applications where the performance is comparable to
                 that of the state-of-the-art implementation of the
                 kernel by a CUDA expert.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Bawidamann:2012:ETO,
  author =       "Uwe Bawidamann and Marco Nehmeier",
  title =        "Expression Templates and {OpenCL}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7204",
  pages =        "71--80",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-31500-8_8",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:26:14 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012c.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-31500-8_8/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-31500-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-31500-8",
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Berg:2012:FCL,
  author =       "Bernd A. Berg and Hao Wu",
  title =        "{Fortran} code for {$ {\rm SU}(3) $} lattice gauge
                 theory with and without {MPI} checkerboard
                 parallelization",
  journal =      j-COMP-PHYS-COMM,
  volume =       "183",
  number =       "10",
  pages =        "2145--2157",
  month =        oct,
  year =         "2012",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2012.03.021",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Thu Jun 28 15:53:26 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465512001269",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Bergstrom:2012:NDP,
  author          = {Lars Bergstrom and John Reppy},
  title           = {Nested data-parallelism on the {GPU}},
  journal         = j-SIGPLAN,
  volume          = {47},
  number          = {9},
  pages           = {247--258},
  month           = sep,
  year            = {2012},
  CODEN           = {SINODQ},
  DOI             = {https://doi.org/10.1145/2398856.2364563},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  bibdate         = {Thu Nov 15 16:40:19 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {Graphics processing units (GPUs) provide both memory
                     bandwidth and arithmetic performance far greater than
                     that available on CPUs but, because of their
                     Single-Instruction-Multiple-Data (SIMD) architecture,
                     they are hard to program. Most of the programs ported
                     to GPUs thus far use traditional data-level
                     parallelism, performing only operations that operate
                     uniformly over vectors. NESL is a first-order
                     functional language that was designed to allow
                     programmers to write irregular-parallel programs ---
                     such as parallel divide-and-conquer algorithms --- for
                     wide-vector parallel computers. This paper presents our
                     port of the NESL implementation to work on GPUs and
                     provides empirical evidence that nested
                     data-parallelism (NDP) on GPUs significantly
                     outperforms CPU-based implementations and matches or
                     beats newer GPU languages that support only flat
                     parallelism. While our performance does not match that
                     of hand-tuned CUDA programs, we argue that the
                     notational conciseness of NESL is worth the loss in
                     performance. This work provides the first language
                     implementation that directly supports NDP on a GPU.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {ICFP '12 conference proceedings.},
}

@Article{Berka:2012:PET,
  author          = {Tobias Berka and Helge Hagenauer and Marian
                     Vajter{\v{s}}ic},
  title           = {Portable Explicit Threading and Concurrent Programming
                     for {MPI} Applications},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {7204},
  pages           = {81--90},
  year            = {2012},
  CODEN           = {LNCSD9},
  DOI             = {https://doi.org/10.1007/978-3-642-31500-8_9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Wed Dec 19 15:26:14 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012c.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-31500-8_9/},
  acknowledgement = ack-nhfb,
  book-DOI        = {https://doi.org/10.1007/978-3-642-31500-8},
  book-URL        = {http://www.springerlink.com/content/978-3-642-31500-8},
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Betts:2012:GVG,
  author          = {Adam Betts and Nathan Chong and Alastair Donaldson and
                     Shaz Qadeer and Paul Thomson},
  title           = {{GPUVerify}: a verifier for {GPU} kernels},
  journal         = j-SIGPLAN,
  volume          = {47},
  number          = {10},
  pages           = {113--132},
  month           = oct,
  year            = {2012},
  CODEN           = {SINODQ},
  DOI             = {https://doi.org/10.1145/2398857.2384625},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  bibdate         = {Thu Nov 15 16:40:23 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {We present a technique for verifying race- and
                     divergence-freedom of GPU kernels that are written in
                     mainstream kernel programming languages such as OpenCL
                     and CUDA. Our approach is founded on a novel formal
                     operational semantics for GPU programming termed
                     synchronous, delayed visibility (SDV) semantics. The
                     SDV semantics provides a precise definition of barrier
                     divergence in GPU kernels and allows kernel
                     verification to be reduced to analysis of a sequential
                     program, thereby completely avoiding the need to reason
                     about thread interleavings, and allowing existing
                     modular techniques for program verification to be
                     leveraged. We describe an efficient encoding for data
                     race detection and propose a method for automatically
                     inferring loop invariants required for verification. We
                     have implemented these techniques as a practical
                     verification tool, GPUVerify, which can be applied
                     directly to OpenCL and CUDA source code. We evaluate
                     GPUVerify with respect to a set of 163 kernels drawn
                     from public and commercial sources. Our evaluation
                     demonstrates that GPUVerify is capable of efficient,
                     automatic verification of a large number of real-world
                     kernels.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {OOPSLA '12 conference proceedings.},
}

@Article{Bihari:2012:CIT,
  author          = {Barna L. Bihari and Michael Wong and Amy Wang and
                     Bronis R. de Supinski and Wang Chen},
  title           = {A Case for Including Transactions in {OpenMP} {II}:
                     Hardware Transactional Memory},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {7312},
  pages           = {44--58},
  year            = {2012},
  CODEN           = {LNCSD9},
  DOI             = {https://doi.org/10.1007/978-3-642-30961-8_4},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Wed Dec 19 15:19:49 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-30961-8_4/},
  acknowledgement = ack-nhfb,
  book-DOI        = {https://doi.org/10.1007/978-3-642-30961-8},
  book-URL        = {http://www.springerlink.com/content/978-3-642-30961-8},
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Blattner:2012:PSC,
  author          = {Timothy Blattner and Shiming Yang},
  title           = {Performance study on {CUDA GPUs} for parallelizing the
                     local ensemble transformed {Kalman} filter algorithm},
  journal         = j-CCPE,
  volume          = {24},
  number          = {2},
  pages           = {167--177},
  month           = feb,
  year            = {2012},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.1859},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Wed Apr 4 09:18:00 MDT 2012},
  bibsource       = {http://www.interscience.wiley.com/jpages/1532-0626;
                     http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency and Computation: Practice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {17 Oct 2011},
}

@Article{Broquedis:2012:LEO,
  author =       "Fran{\c{c}}ois Broquedis and Thierry Gautier and
                 Vincent Danjean",
  title =        "{libKOMP}, an Efficient {OpenMP} Runtime System for
                 Both Fork-Join and Data Flow Paradigms",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7312",
  pages =        "102--115",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-30961-8_8",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:19:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-30961-8_8/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-30961-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-30961-8",
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bruning:2012:MFT,
  author          = {Ulrich Br{\"u}ning},
  title           = {{MPI} Functions and Their Impact on Interconnect
                     Hardware},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {7490},
  pages           = {10--10},
  year            = {2012},
  CODEN           = {LNCSD9},
  DOI             = {https://doi.org/10.1007/978-3-642-33518-1_2},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Wed Dec 19 15:23:42 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/accesspage/chapter/10.1007/978-3-642-33518-1_2},
  acknowledgement = ack-nhfb,
  book-DOI        = {https://doi.org/10.1007/978-3-642-33518-1},
  book-URL        = {http://www.springerlink.com/content/978-3-642-33518-1},
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Bureddy:2012:OGM,
  author          = {D. Bureddy and H. Wang and A. Venkatesh and S. Potluri
                     and D. K. Panda},
  title           = {{OMB-GPU}: a Micro-Benchmark Suite for Evaluating
                     {MPI} Libraries on {GPU} Clusters},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {7490},
  pages           = {110--120},
  year            = {2012},
  CODEN           = {LNCSD9},
  DOI             = {https://doi.org/10.1007/978-3-642-33518-1_16},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Wed Dec 19 15:23:42 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-33518-1_16/},
  acknowledgement = ack-nhfb,
  book-DOI        = {https://doi.org/10.1007/978-3-642-33518-1},
  book-URL        = {http://www.springerlink.com/content/978-3-642-33518-1},
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Busa:2012:ACO,
  author          = {J{\'a}n {Busa, Jr.} and Shura Hayryan and Ming-Chya Wu
                     and J{\'a}n Busa and Chin-Kun Hu},
  title           = {{ARVO-CL}: the {OpenCL} version of the {ARVO} package
                     --- An efficient tool for computing the accessible
                     surface area and the excluded volume of proteins via
                     analytical equations},
  journal         = j-COMP-PHYS-COMM,
  volume          = {183},
  number          = {11},
  pages           = {2494--2497},
  month           = nov,
  year            = {2012},
  CODEN           = {CPHCBZ},
  DOI             = {https://doi.org/10.1016/j.cpc.2012.04.019},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  bibdate         = {Fri Jul 27 07:00:54 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0010465512001580},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Bustamam:2012:FPM,
  author          = {Alhadi Bustamam and Kevin Burrage and Nicholas A.
                     Hamilton},
  title           = {Fast Parallel {Markov} Clustering in Bioinformatics
                     Using Massively Parallel Computing on {GPU} with {CUDA}
                     and {ELLPACK-R} Sparse Format},
  journal         = j-TCBB,
  volume          = {9},
  number          = {3},
  pages           = {679--692},
  month           = may,
  year            = {2012},
  CODEN           = {ITCBCY},
  DOI             = {https://doi.org/10.1109/TCBB.2011.68},
  ISSN            = {1545-5963 (print), 1557-9964 (electronic)},
  ISSN-L          = {1545-5963},
  bibdate         = {Thu Apr 19 17:58:10 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/tcbb.bib},
  abstract        = {Markov clustering (MCL) is becoming a key algorithm
                     within bioinformatics for determining clusters in
                     networks. However, with increasing vast amount of data
                     on biological networks, performance and scalability
                     issues are becoming a critical limiting factor in
                     applications. Meanwhile, GPU computing, which uses CUDA
                     tool for implementing a massively parallel computing
                     environment in the GPU card, is becoming a very
                     powerful, efficient, and low-cost option to achieve
                     substantial performance gains over CPU approaches. The
                     use of on-chip memory on the GPU is efficiently
                     lowering the latency time, thus, circumventing a major
                     issue in other parallel computing environments, such as
                     MPI. We introduce a very fast Markov clustering
                     algorithm using CUDA (CUDA-MCL) to perform parallel
                     sparse matrix-matrix computations and parallel sparse
                     Markov matrix normalizations, which are at the heart of
                     MCL. We utilized ELLPACK-R sparse format to allow the
                     effective and fine-grain massively parallel processing
                     to cope with the sparse nature of interaction networks
                     data sets in bioinformatics applications. As the
                     results show, CUDA-MCL is significantly faster than the
                     original MCL running on CPU. Thus, large-scale parallel
                     computation on off-the-shelf desktop-machines, that
                     were previously only possible on supercomputing
                     architectures, can significantly change the way
                     bioinformaticians and biologists deal with their
                     data.},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE/ACM Transactions on Computational Biology and
                     Bioinformatics},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J954},
}

@Article{Cabarle:2012:SNP,
  author          = {Francis George C. Cabarle and Henry Adorna and Miguel
                     A. Mart{\'\i}nez},
  title           = {A Spiking Neural {P} System Simulator Based on
                     {CUDA}},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {7184},
  pages           = {87--103},
  year            = {2012},
  CODEN           = {LNCSD9},
  DOI             = {https://doi.org/10.1007/978-3-642-28024-5_8},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Wed Dec 19 15:25:48 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012b.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-28024-5_8/},
  acknowledgement = ack-nhfb,
  book-DOI        = {https://doi.org/10.1007/978-3-642-28024-5},
  book-URL        = {http://www.springerlink.com/content/978-3-642-28024-5},
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Calotoiu:2012:PID,
  author          = {Alexandru Calotoiu and Christian Siebert and Felix
                     Wolf},
  title           = {Pattern-Independent Detection of Manual Collectives in
                     {MPI} Programs},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {7484},
  pages           = {28--39},
  year            = {2012},
  CODEN           = {LNCSD9},
  DOI             = {https://doi.org/10.1007/978-3-642-32820-6_5},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Wed Dec 19 15:23:34 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-32820-6_5/},
  acknowledgement = ack-nhfb,
  book-DOI        = {https://doi.org/10.1007/978-3-642-32820-6},
  book-URL        = {http://www.springerlink.com/content/978-3-642-32820-6},
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Cecilia:2012:CSC,
  author          = {Jos{\'e} Mar{\'\i}a Cecilia and Jos{\'e} Manuel
                     Garc{\'\i}a and Manuel Ujald{\'o}n},
  title           = {{CUDA $2$D} Stencil Computations for the {Jacobi}
                     Method},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {7133},
  pages           = {173--183},
  year            = {2012},
  CODEN           = {LNCSD9},
  DOI             = {https://doi.org/10.1007/978-3-642-28151-8_17},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Wed Dec 19 15:24:46 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012b.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-28151-8_17/},
  acknowledgement = ack-nhfb,
  book-DOI        = {https://doi.org/10.1007/978-3-642-28151-8},
  book-URL        = {http://www.springerlink.com/content/978-3-642-28151-8},
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Chen:2012:PUA,
  author          = {Yifeng Chen and Xiang Cui and Hong Mei},
  title           = {{PARRAY}: a unifying array representation for
                     heterogeneous parallelism},
  journal         = j-SIGPLAN,
  volume          = {47},
  number          = {8},
  pages           = {171--180},
  month           = aug,
  year            = {2012},
  CODEN           = {SINODQ},
  DOI             = {https://doi.org/10.1145/2370036.2145838},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  bibdate         = {Wed Sep 12 12:11:57 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  note            = {PPOPP '12 conference proceedings.},
  abstract        = {This paper introduces a programming interface called
                     PARRAY (or Parallelizing ARRAYs) that supports
                     system-level succinct programming for heterogeneous
                     parallel systems like GPU clusters. The current
                     practice of software development requires combining
                     several low-level libraries like Pthread, OpenMP, CUDA
                     and MPI. Achieving productivity and portability is hard
                     with different numbers and models of GPUs. PARRAY
                     extends mainstream C programming with novel array types
                     of distinct features: (1) the dimensions of an array
                     type are nested in a tree, conceptually reflecting the
                     memory hierarchy; (2) the definition of an array type
                     may contain references to other array types, allowing
                     sophisticated array types to be created for
                     parallelization; (3) threads also form arrays that
                     allow programming in a
                     Single-Program-Multiple-Codeblock (SPMC) style to unify
                     various sophisticated communication patterns. This
                     leads to shorter, more portable and maintainable
                     parallel codes, while the programmer still has control
                     over performance-related features necessary for deep
                     manual optimization. Although the source-to-source code
                     generator only faithfully generates low-level library
                     calls according to the type information, higher-level
                     programming and automatic performance optimization are
                     still possible through building libraries of
                     sub-programs on top of PARRAY. The case study on
                     cluster FFT illustrates a simple 30-line code that 2x
                     outperforms Intel Cluster MKL on the Tianhe-1A system
                     with 7168 Fermi GPUs and 14336 CPUs.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
}

@Article{Chevitarese:2012:STN,
  author          = {Daniel Salles Chevitarese and Dilza Szwarcman and
                     Marley Vellasco},
  title           = {Speeding Up the Training of Neural Networks with
                     {CUDA} Technology},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {7267},
  pages           = {30--38},
  year            = {2012},
  CODEN           = {LNCSD9},
  DOI             = {https://doi.org/10.1007/978-3-642-29347-4_4},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Wed Dec 19 15:18:50 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012d.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-29347-4_4/},
  acknowledgement = ack-nhfb,
  book-DOI        = {https://doi.org/10.1007/978-3-642-29347-4},
  book-URL        = {http://www.springerlink.com/content/978-3-642-29347-4},
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{Collingbourne:2012:STO,
  author          = {Peter Collingbourne and Cristian Cadar and Paul H. J.
                     Kelly},
  title           = {Symbolic Testing of {OpenCL} Code},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {7261},
  pages           = {203--218},
  year            = {2012},
  CODEN           = {LNCSD9},
  DOI             = {https://doi.org/10.1007/978-3-642-34188-5_18},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Wed Dec 19 15:18:42 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012d.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-34188-5_18/},
  acknowledgement = ack-nhfb,
  book-DOI        = {https://doi.org/10.1007/978-3-642-34188-5},
  book-URL        = {http://www.springerlink.com/content/978-3-642-34188-5},
  fjournal        = {Lecture Notes in Computer Science},
}

@InProceedings{Cui:2012:OOB,
  author          = {Zheng Cui and Lei Xia and Patrick G. Bridges and Peter
                     A. Dinda and John R. Lange},
  title           = {Optimizing overlay-based virtual networking through
                     optimistic interrupts and cut-through forwarding},
  crossref        = {Hollingsworth:2012:SPI},
  pages           = {99:1--99:??},
  year            = {2012},
  bibdate         = {Thu Nov 15 07:38:35 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib},
  URL             = {http://conferences.computer.org/sc/2012/papers/1000a029.pdf},
  abstract        = {Overlay-based virtual networking provides a powerful
                     model for realizing virtual distributed and parallel
                     computing systems with strong isolation, portability,
                     and recoverability properties. However, in extremely
                     high throughput and low latency networks, such overlays
                     can suffer from bandwidth and latency limitations,
                     which is of particular concern if we want to apply the
                     model in HPC environments. Through careful study of an
                     existing very high performance overlay-based virtual
                     network system, we have identified two core issues
                     limiting performance: delayed and/or excessive virtual
                     interrupt delivery into guests, and copies between host
                     and guest data buffers done during encapsulation. We
                     respond with two novel optimizations: optimistic,
                     timer-free virtual interrupt injection, and zero-copy
                     cut-through data forwarding. These optimizations
                     improve the latency and bandwidth of the overlay
                     network on 10 Gbps interconnects, resulting in
                     near-native performance for a wide range of
                     microbenchmarks and MPI application benchmarks.},
  acknowledgement = ack-nhfb,
  articleno       = {99},
}

@Article{Danalis:2012:MCT,
  author          = {Anthony Danalis},
  title           = {{MPI} and Compiler Technology: a Love-Hate
                     Relationship},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {7490},
  pages           = {12--13},
  year            = {2012},
  CODEN           = {LNCSD9},
  DOI             = {https://doi.org/10.1007/978-3-642-33518-1_4},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  bibdate         = {Wed Dec 19 15:23:42 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/accesspage/chapter/10.1007/978-3-642-33518-1_4},
  acknowledgement = ack-nhfb,
  book-DOI        = {https://doi.org/10.1007/978-3-642-33518-1},
  book-URL        = {http://www.springerlink.com/content/978-3-642-33518-1},
  fjournal        = {Lecture Notes in Computer Science},
}

@Article{delaAsuncion:2012:MCI,
  author          = {Marc de la Asunci{\'o}n and Jos{\'e} M. Mantas and
                     Manuel J. Castro and E. D. Fern{\'a}ndez-Nieto},
  title           = {An {MPI-CUDA} implementation of an improved {Roe}
                     method for two-layer shallow water systems},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {72},
  number          = {9},
  pages           = {1065--1072},
  month           = sep,
  year            = {2012},
  CODEN           = {JPDCER},
  DOI             = {https://doi.org/10.1016/j.jpdc.2011.07.012},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  bibdate         = {Fri Jul 27 06:43:44 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sciencedirect.com/science/journal/07437315},
  URL             = {http://www.sciencedirect.com/science/article/pii/S074373151100147X},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Deshpande:2012:AGC,
  author          = {Vivek Deshpande and Xing Wu and Frank Mueller},
  title           = {Auto-generation of communication benchmark traces},
  journal         = j-SIGMETRICS,
  volume          = {40},
  number          = {2},
  pages           = {99--105},
  month           = sep,
  year            = {2012},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2381056.2381078},
  ISSN            = {0163-5999 (print), 1557-9484 (electronic)},
  ISSN-L          = {0163-5999},
  bibdate         = {Fri Nov 9 11:06:40 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigmetrics.bib},
  abstract        = {Benchmarks are essential for evaluating HPC hardware
                     and software for petascale machines and beyond. But
                     benchmark creation is a tedious manual process. As a
                     result, benchmarks tend to lag behind the development
                     of complex scientific codes. Our work automates the
                     creation of communication benchmarks. Given an MPI
                     application, we utilize ScalaTrace, a lossless and
                     scalable framework to trace communication operations
                     and execution time while abstracting away the
                     computations. A single trace file that reflects the
                     behavior of all nodes is subsequently expanded to C
                     source code by a novel code generator. This resulting
                     benchmark code is compact, portable, human-readable,
                     and accurately reflects the original application's
                     communication characteristics and performance.
                     Experimental results demonstrate that generated source
                     code of benchmarks preserves both the communication
                     patterns and the run-time behavior of the original
                     application. Such automatically generated benchmarks
                     not only shorten the transition from application
                     development to benchmark extraction but also facilitate
                     code obfuscation, which is essential for benchmark
                     extraction from commercial and restricted
                     applications.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGMETRICS Performance Evaluation Review},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J618},
}

@Article{Deuzeman:2012:LMP,
  author          = {Albert Deuzeman and Siebren Reker and Carsten Urbach
                     and {ETM Collaboration}},
  title           = {{Lemon}: An {MPI} parallel {I/O} library for data
                     encapsulation using {LIME}},
  journal         = j-COMP-PHYS-COMM,
  volume          = {183},
  number          = {6},
  pages           = {1321--1335},
  month           = jun,
  year            = {2012},
  CODEN           = {CPHCBZ},
  DOI             = {https://doi.org/10.1016/j.cpc.2012.01.016},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  bibdate         = {Wed Feb 29 07:07:40 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0010465512000318},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Diaz:2012:CCF,
  author =       "Castro D{\'\i}az, M. J. and E. Fern{\'a}ndez-Nieto",
  title =        "A Class of Computationally Fast First Order Finite
                 Volume Solvers: {PVM} Methods",
  journal =      j-SIAM-J-SCI-COMP,
  volume =       "34",
  number =       "4",
  pages =        "A2173--A2196",
  month =        "????",
  year =         "2012",
  CODEN =        "SJOCE3",
  DOI =          "https://doi.org/10.1137/100795280",
  ISSN =         "1064-8275 (print), 1095-7197 (electronic)",
  ISSN-L =       "1064-8275",
  bibdate =      "Tue Oct 30 14:49:05 MDT 2012",
  bibsource =    "http://epubs.siam.org/sam-bin/dbq/toc/SISC/34/4;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/siamjscicomput.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "SIAM Journal on Scientific Computing",
  journal-URL =  "http://epubs.siam.org/sisc",
  onlinedate =   "January 2012",
}

@Article{Didelot:2012:IMC,
  author          = {Sylvain Didelot and Patrick Carribault and Marc
                     P{\'e}rache and William Jalby},
  title           = {Improving {MPI} Communication Overlap with
                     Collaborative Polling},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2012},
  volume          = {7490},
  pages           = {37--46},
  DOI             = {https://doi.org/10.1007/978-3-642-33518-1_9},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-33518-1_9/},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  fjournal        = {Lecture Notes in Computer Science},
  book-DOI        = {https://doi.org/10.1007/978-3-642-33518-1},
  book-URL        = {http://www.springerlink.com/content/978-3-642-33518-1},
  bibdate         = {Wed Dec 19 15:23:42 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Dinan:2012:EMC,
  author          = {James Dinan and David Goodell and William Gropp and
                     Rajeev Thakur and Pavan Balaji},
  title           = {Efficient Multithreaded Context {ID} Allocation in
                     {MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2012},
  volume          = {7490},
  pages           = {57--66},
  DOI             = {https://doi.org/10.1007/978-3-642-33518-1_11},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-33518-1_11/},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  fjournal        = {Lecture Notes in Computer Science},
  book-DOI        = {https://doi.org/10.1007/978-3-642-33518-1},
  book-URL        = {http://www.springerlink.com/content/978-3-642-33518-1},
  bibdate         = {Wed Dec 19 15:23:42 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Du:2012:COT,
  author          = {Peng Du and Rick Weber and Piotr Luszczek and
                     Stanimire Tomov and Gregory Peterson and Jack
                     Dongarra},
  title           = {From {CUDA} to {OpenCL}: Towards a
                     performance-portable solution for multi-platform {GPU}
                     programming},
  journal         = j-PARALLEL-COMPUTING,
  year            = {2012},
  month           = aug,
  volume          = {38},
  number          = {8},
  pages           = {391--407},
  DOI             = {https://doi.org/10.1016/j.parco.2011.10.002},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0167819111001335},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
  bibdate         = {Wed Jun 20 17:04:05 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Dziubak:2012:OOI,
  author          = {Tomasz Dziubak and Jacek Matulewski},
  title           = {An object-oriented implementation of a solver of the
                     time-dependent {Schr{\"o}dinger} equation using the
                     {CUDA} technology},
  journal         = j-COMP-PHYS-COMM,
  year            = {2012},
  month           = mar,
  volume          = {183},
  number          = {3},
  pages           = {800--812},
  DOI             = {https://doi.org/10.1016/j.cpc.2011.11.026},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0010465511003948},
  CODEN           = {CPHCBZ},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
  bibdate         = {Sat Feb 11 10:11:02 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Eichenberger:2012:DOT,
  author          = {Alexandre E. Eichenberger and Christian Terboven and
                     Michael Wong and Dieter an Mey},
  title           = {The Design of {OpenMP} Thread Affinity},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2012},
  volume          = {7312},
  pages           = {15--28},
  DOI             = {https://doi.org/10.1007/978-3-642-30961-8_2},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-30961-8_2/},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  fjournal        = {Lecture Notes in Computer Science},
  book-DOI        = {https://doi.org/10.1007/978-3-642-30961-8},
  book-URL        = {http://www.springerlink.com/content/978-3-642-30961-8},
  bibdate         = {Wed Dec 19 15:19:49 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{ElZein:2012:GOC,
  author          = {Ahmed H. {El Zein} and Alistair P. Rendell},
  title           = {Generating optimal {CUDA} sparse matrix--vector
                     product implementations for evolving {GPU} hardware},
  journal         = j-CCPE,
  year            = {2012},
  month           = jan,
  volume          = {24},
  number          = {1},
  pages           = {3--13},
  DOI             = {https://doi.org/10.1002/cpe.1732},
  CODEN           = {CCPEBO},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  fjournal        = {Concurrency and Computation: Practice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {17 Apr 2011},
  bibdate         = {Mon Jan 16 12:11:17 MST 2012},
  bibsource       = {http://www.interscience.wiley.com/jpages/1532-0626;
                     http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Fiala:2012:DCS,
  author          = {David Fiala and Frank Mueller and Christian Engelmann
                     and Rolf Riesen and Kurt Ferreira and Ron Brightwell},
  title           = {Detection and correction of silent data corruption for
                     large-scale high-performance computing},
  crossref        = {Hollingsworth:2012:SPI},
  year            = {2012},
  pages           = {78:1--78:??},
  articleno       = {78},
  URL             = {http://conferences.computer.org/sc/2012/papers/1000a046.pdf},
  abstract        = {Faults have become the norm rather than the exception
                     for high-end computing clusters. Exacerbating this
                     situation, some of these faults remain undetected,
                     manifesting themselves as silent errors that allow
                     applications to compute incorrect results. This paper
                     studies the potential for redundancy to detect and
                     correct soft errors in MPI message-passing applications
                     while investigating the challenges inherent to
                     detecting soft errors within MPI applications by
                     providing transparent MPI redundancy. By assuming a
                     model wherein corruption in application data manifests
                     itself by producing differing MPI messages between
                     replicas, we study the best suited protocols for
                     detecting and correcting corrupted MPI messages. Using
                     our fault injector, we observe that even a single error
                     can have profound effects on applications by causing a
                     cascading pattern of corruption which in most cases
                     spreads to all other processes. Results indicate that
                     our consistency protocols can successfully protect
                     applications experiencing even high rates of silent
                     data corruption.},
  bibdate         = {Thu Nov 15 07:38:35 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib},
  acknowledgement = ack-nhfb,
}

@Article{Filgueira:2012:DCD,
  author          = {Rosa Filgueira and Jes{\'u}s Carretero and David E.
                     Singh and Alejandro Calder{\'o}n and Alberto
                     N{\'u}{\~n}ez},
  title           = {{Dynamic--CoMPI}: dynamic optimization techniques for
                     {MPI} parallel applications},
  journal         = j-J-SUPERCOMPUTING,
  year            = {2012},
  month           = jan,
  volume          = {59},
  number          = {1},
  pages           = {361--391},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=59&issue=1&spage=361},
  CODEN           = {JOSUED},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
  bibdate         = {Tue Dec 13 15:25:33 MST 2011},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=59&issue=1;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Garcia:2012:DLB,
  author          = {Marta Garcia and Julita Corbalan and Rosa Maria Badia
                     and Jesus Labarta},
  title           = {A Dynamic Load Balancing Approach with
                     {SMPSuperscalar} and {MPI}},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2012},
  volume          = {7174},
  pages           = {10--23},
  DOI             = {https://doi.org/10.1007/978-3-642-30397-5_2},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-30397-5_2/},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  fjournal        = {Lecture Notes in Computer Science},
  book-DOI        = {https://doi.org/10.1007/978-3-642-30397-5},
  book-URL        = {http://www.springerlink.com/content/978-3-642-30397-5},
  bibdate         = {Wed Dec 19 15:25:38 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012b.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Garland:2012:DUP,
  author          = {Michael Garland and Manjunath Kudlur and Yili Zheng},
  title           = {Designing a unified programming model for
                     heterogeneous machines},
  crossref        = {Hollingsworth:2012:SPI},
  year            = {2012},
  pages           = {67:1--67:??},
  articleno       = {67},
  URL             = {http://conferences.computer.org/sc/2012/papers/1000a064.pdf},
  abstract        = {While high-efficiency machines are increasingly
                     embracing heterogeneous architectures and massive
                     multithreading, contemporary mainstream programming
                     languages reflect a mental model in which processing
                     elements are homogeneous, concurrency is limited, and
                     memory is a flat undifferentiated pool of storage.
                     Moreover, the current state of the art in programming
                     heterogeneous machines tends towards using separate
                     programming models, such as OpenMP and CUDA, for
                     different portions of the machine. Both of these
                     factors make programming emerging heterogeneous
                     machines unnecessarily difficult. We describe the
                     design of the Phalanx programming model, which seeks to
                     provide a unified programming model for heterogeneous
                     machines. It provides constructs for bulk parallelism,
                     synchronization, and data placement which operate
                     across the entire machine. Our prototype implementation
                     is able to launch and coordinate work on both CPU and
                     GPU processors within a single node, and by leveraging
                     the GASNet runtime, is able to run across all the nodes
                     of a distributed-memory machine.},
  bibdate         = {Thu Nov 15 07:38:35 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ghosh:2012:RAA,
  author          = {Sudeep Ghosh and Jason Hiser and Jack W. Davidson},
  title           = {Replacement attacks against {VM}-protected
                     applications},
  journal         = j-SIGPLAN,
  year            = {2012},
  month           = jul,
  volume          = {47},
  number          = {7},
  pages           = {203--214},
  DOI             = {https://doi.org/10.1145/2365864.2151051},
  CODEN           = {SINODQ},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  note            = {VEE '12 conference proceedings.},
  abstract        = {Process-level virtualization is increasingly being
                     used to enhance the security of software applications
                     from reverse engineering and unauthorized modification
                     (called software protection). Process-level virtual
                     machines (PVMs) can safeguard the application code at
                     run time and hamper the adversary's ability to launch
                     dynamic attacks on the application. This dynamic
                     protection, combined with its flexibility, ease in
                     handling legacy systems and low performance overhead,
                     has made process-level virtualization a popular
                     approach for providing software protection. While there
                     has been much research on using process-level
                     virtualization to provide such protection, there has
                     been less research on attacks against PVM-protected
                     software. In this paper, we describe an attack on
                     applications protected using process-level
                     virtualization, called a replacement attack. In a
                     replacement attack, the adversary replaces the
                     protecting PVM with an attack VM thereby rendering the
                     application vulnerable to analysis and modification. We
                     present a general description of the replacement attack
                     methodology and two attack implementations against a
                     protected application using freely available tools. The
                     generality and simplicity of replacement attacks
                     demonstrates that there is a strong need to develop
                     techniques that meld applications more tightly to the
                     protecting PVM to prevent such attacks.},
  bibdate         = {Thu Sep 6 10:01:03 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gong:2012:OCN,
  author =       "Yifan Gong and Bingsheng He and Jianlong Zhong",
  title =        "An overview of {CMPI}: network performance aware {MPI}
                 in the cloud",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "8",
  pages =        "297--298",
  month =        aug,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2370036.2145862",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Sep 12 12:11:57 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPOPP '12 conference proceedings.",
  abstract =     "Cloud computing enables users to perform distributed
                 computing tasks on many virtual machines, without
                 owning a physical cluster. Recently, various
                 distributed computing tasks such as scientific
                 applications are being moved from supercomputers and
                 private clusters to public clouds. Message passing
                 interface (MPI) is a key and common component in
                 distributed computing tasks. The virtualized computing
                 environment of the public cloud hides the network
                 topology information from the users, and existing
                 topology-aware optimizations for MPI are no longer
                 feasible in the cloud environment. We propose a network
                 performance aware MPI library named CMPI. CMPI embraces
                 a new model for capturing the network performance among
                 different virtual machines in the cloud. Based on the
                 network performance model, we develop novel network
                 performance aware algorithms for communication
                 operations. This poster gives an overview of CMPI
                 design, and presents some preliminary results on
                 collective operations such as broadcast. We demonstrate
                 the effectiveness of our network performance aware
                 optimizations on Amazon EC2.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Gravvanis:2012:SFD,
  author          = {G. A. Gravvanis and C. K. Filelis-Papadopoulos and K.
                     M. Giannoutakis},
  title           = {Solving finite difference linear systems on {GPUs}:
                     {CUDA} based Parallel Explicit Preconditioned
                     Biconjugate Conjugate Gradient type Methods},
  journal         = j-J-SUPERCOMPUTING,
  year            = {2012},
  month           = sep,
  volume          = {61},
  number          = {3},
  pages           = {590--604},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=61&issue=3&spage=590},
  CODEN           = {JOSUED},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
  bibdate         = {Fri Oct 26 07:41:53 MDT 2012},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=61&issue=3;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gropp:2012:AMI,
  author =       "William Gropp and Ewing Lusk and Rajeev Thakur",
  title =        "Advanced {MPI} Including New {MPI-3} Features",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7490",
  pages =        "14--14",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-33518-1_5",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:23:42 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-33518-1_5/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-33518-1",
  book-URL =     "http://www.springerlink.com/content/978-3-642-33518-1",
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Gropp:2012:MBW,
  author          = {William Gropp},
  title           = {{MPI 3} and Beyond: Why {MPI} Is Successful and What
                     Challenges It Faces},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2012},
  volume          = {7490},
  pages           = {1--9},
  DOI             = {https://doi.org/10.1007/978-3-642-33518-1_1},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-33518-1_1/},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  fjournal        = {Lecture Notes in Computer Science},
  book-DOI        = {https://doi.org/10.1007/978-3-642-33518-1},
  book-URL        = {http://www.springerlink.com/content/978-3-642-33518-1},
  bibdate         = {Wed Dec 19 15:23:42 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hermanns:2012:SDM,
  author          = {Marc-Andr{\'e} Hermanns and Markus Geimer and Bernd
                     Mohr and Felix Wolf},
  title           = {Scalable detection of {MPI-2} remote memory access
                     inefficiency patterns},
  journal         = j-IJHPCA,
  year            = {2012},
  month           = aug,
  volume          = {26},
  number          = {3},
  pages           = {227--236},
  DOI             = {https://doi.org/10.1177/1094342011406758},
  URL             = {http://hpc.sagepub.com/content/26/3/227.full.pdf+html},
  CODEN           = {IHPCFL},
  ISSN            = {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L          = {1094-3420},
  fjournal        = {International Journal of High Performance Computing
                     Applications},
  journal-URL     = {http://hpc.sagepub.com/content/by/year},
  onlinedate      = {June 8, 2011},
  bibdate         = {Thu Nov 8 11:31:14 MST 2012},
  bibsource       = {http://hpc.sagepub.com/content/26/3.toc;
                     http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Hilbrich:2012:MRE,
  author =       "Tobias Hilbrich and Joachim Protze and Martin Schulz
                 and Bronis R. de Supinski and Matthias S. M{\"u}ller",
  title =        "{MPI} runtime error detection with {MUST}: advances in
                 deadlock detection",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "30:1--30:??",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a010.pdf",
  abstract =     "The widely used Message Passing Interface (MPI) is
                 complex and rich. As a result, application developers
                 require automated tools to avoid and to detect MPI
                 programming errors. We present the Marmot Umpire
                 Scalable Tool (MUST) that detects such errors with
                 significantly increased scalability. We present
                 improvements to our graph-based deadlock detection
                 approach for MPI, which cover future MPI extensions.
                 Our enhancements also check complex MPI constructs that
                 no previous graph-based detection approach handled
                 correctly. Finally, we present optimizations for the
                 processing of MPI operations that reduce runtime
                 deadlock detection overheads. Existing approaches often
                 require $ O(p) $ analysis time per MPI operation, for
                 $p$ processes. We empirically observe that our
                 improvements lead to sub-linear or better analysis time
                 per operation for a wide range of real world
                 applications.",
  acknowledgement = ack-nhfb,
  articleno =    "30",
}

@Article{Hoefler:2012:LMO,
  author          = {Torsten Hoefler and James Dinan and Darius Buntinas
                     and Pavan Balaji and Brian W. Barrett},
  title           = {Leveraging {MPI}'s One-Sided Communication Interface
                     for Shared-Memory Programming},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2012},
  volume          = {7490},
  pages           = {132--141},
  DOI             = {https://doi.org/10.1007/978-3-642-33518-1_18},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-33518-1_18/},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  fjournal        = {Lecture Notes in Computer Science},
  book-DOI        = {https://doi.org/10.1007/978-3-642-33518-1},
  book-URL        = {http://www.springerlink.com/content/978-3-642-33518-1},
  bibdate         = {Wed Dec 19 15:23:42 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Hoefler:2012:OPC,
  author          = {Torsten Hoefler and Timo Schneider},
  title           = {Optimization principles for collective neighborhood
                     communications},
  crossref        = {Hollingsworth:2012:SPI},
  year            = {2012},
  pages           = {98:1--98:??},
  articleno       = {98},
  URL             = {http://conferences.computer.org/sc/2012/papers/1000a028.pdf},
  abstract        = {Many scientific applications operate in a
                     bulk-synchronous mode of iterative communication and
                     computation steps. Even though the communication steps
                     happen at the same logical time, important patterns
                     such as stencil computations cannot be expressed as
                     collective communications in MPI. We demonstrate how
                     neighborhood collective operations allow to specify
                     arbitrary collective communication relations during
                     run-time and enable optimizations similar to
                     traditional collective calls. We show a number of
                     optimization opportunities and algorithms for different
                     communication scenarios. We also show how users can
                     assert constraints that provide additional optimization
                     opportunities in a portable way. We demonstrate the
                     utility of all described optimizations in a highly
                     optimized implementation of neighborhood collective
                     operations. Our communication and protocol
                     optimizations result in a performance improvement of up
                     to a factor of two for small stencil communications. We
                     found that, for some patterns, our optimization
                     heuristics automatically generate communication
                     schedules that are comparable to hand-tuned
                     collectives. With those optimizations in place, we are
                     able to accelerate arbitrary collective communication
                     patterns, such as regular and irregular stencils with
                     optimization methods for collective communications. We
                     expect that our methods will influence the design of
                     future MPI libraries and provide a significant
                     performance benefit on large-scale systems.},
  bibdate         = {Thu Nov 15 07:38:35 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hori:2012:EKL,
  author          = {Atsushi Hori and Toyohisa Kameyama and Yuichi Tsujita
                     and Mitaro Namiki and Yutaka Ishikawa},
  title           = {An Efficient Kernel-Level Blocking {MPI}
                     Implementation},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2012},
  volume          = {7490},
  pages           = {153--162},
  DOI             = {https://doi.org/10.1007/978-3-642-33518-1_20},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-33518-1_20/},
  CODEN           = {LNCSD9},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  fjournal        = {Lecture Notes in Computer Science},
  book-DOI        = {https://doi.org/10.1007/978-3-642-33518-1},
  book-URL        = {http://www.springerlink.com/content/978-3-642-33518-1},
  bibdate         = {Wed Dec 19 15:23:42 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hormati:2012:SPS,
  author          = {Amir H. Hormati and Mehrzad Samadi and Mark Woh and
                     Trevor Mudge and Scott Mahlke},
  title           = {{Sponge}: portable stream programming on graphics
                     engines},
  journal         = j-SIGPLAN,
  year            = {2012},
  month           = apr,
  volume          = {47},
  number          = {4},
  pages           = {381--392},
  DOI             = {https://doi.org/10.1145/2248487.1950409},
  CODEN           = {SINODQ},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {ASPLOS '12 conference proceedings.},
  abstract        = {Graphics processing units (GPUs) provide a low cost
                     platform for accelerating high performance
                     computations. The introduction of new programming
                     languages, such as CUDA and OpenCL, makes GPU
                     programming attractive to a wide variety of
                     programmers. However, programming GPUs is still a
                     cumbersome task for two primary reasons: tedious
                     performance optimizations and lack of portability.
                     First, optimizing an algorithm for a specific GPU is a
                     time-consuming task that requires a thorough
                     understanding of both the algorithm and the underlying
                     hardware. Unoptimized CUDA programs typically only
                     achieve a small fraction of the peak GPU performance.
                     Second, GPU code lacks efficient portability as code
                     written for one GPU can be inefficient when executed on
                     another. Moving code from one GPU to another while
                     maintaining the desired performance is a non-trivial
                     task often requiring significant modifications to
                     account for the hardware differences. In this work, we
                     propose Sponge, a compilation framework for GPUs using
                     synchronous data flow streaming languages. Sponge is
                     capable of performing a wide variety of optimizations
                     to generate efficient code for graphics engines. Sponge
                     alleviates the problems associated with current GPU
                     programming methods by providing portability across
                     different generations of GPUs and CPUs, and a better
                     abstraction of the hardware details, such as the memory
                     hierarchy and threading model. Using streaming, we
                     provide a write-once software paradigm and rely on the
                     compiler to automatically create optimized CUDA code
                     for a wide variety of GPU targets. Sponge's compiler
                     optimizations improve the performance of the baseline
                     CUDA implementations by an average of 3.2x.},
  bibdate         = {Thu Jun 7 08:15:03 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  acknowledgement = ack-nhfb,
}

%%% PLDI '12 paper on the Lime language, SIGPLAN Notices 47(6).  The
%%% author list was corrected; the citation label is left unchanged.
@Article{Hosking:2012:CHL,
  author =       "Christophe Dubach and Perry Cheng and Rodric Rabbah
                 and David F. Bacon and Stephen J. Fink",
  title =        "Compiling a high-level language for {GPUs}: (via
                 language support for architectures and compilers)",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "6",
  pages =        "1--12",
  month =        jun,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2345156.2254066",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 6 16:31:49 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PLDI '12 proceedings.",
  abstract =     "Languages such as OpenCL and CUDA offer a standard
                 interface for general-purpose programming of GPUs.
                 However, with these languages, programmers must
                 explicitly manage numerous low-level details involving
                 communication and synchronization. This burden makes
                 programming GPUs difficult and error-prone, rendering
                 these powerful devices inaccessible to most
                 programmers. We desire a higher-level programming model
                 that makes GPUs more accessible while also effectively
                 exploiting their computational power. This paper
                 presents features of Lime, a new Java-compatible
                 language targeting heterogeneous systems, that allow an
                 optimizing compiler to generate high quality GPU code.
                 The key insight is that the language type system
                 enforces isolation and immutability invariants that
                 allow the compiler to optimize for a GPU without heroic
                 compiler analysis. Our compiler attains GPU speedups
                 between 75\% and 140\% of the performance of native
                 OpenCL code.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "The citation label retains the name Hosking for
                 compatibility with existing citations; the paper's
                 authors are Dubach, Cheng, Rabbah, Bacon, and Fink.",
}

%%% Parallel Computing 38(1--2), 2012.  The month value joins the jan
%%% and feb macros with "\slash " for the double issue.  No abstract
%%% field is recorded for this entry.
@Article{Hursey:2012:AFA,
  author =       "Joshua Hursey and Richard L. Graham",
  title =        "Analyzing fault aware collective performance in a
                 process fault tolerant {MPI}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "38",
  number =       "1--2",
  pages =        "15--25",
  month =        jan # "\slash " # feb,
  year =         "2012",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2011.10.010",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Feb 4 15:17:36 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819111001414",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

%%% Edited book (editor field only, no author).  The edition value is
%%% the name "Jade" rather than an ordinal.  The tableofcontents field
%%% lists six parts and 36 chapters, separated by "\\".
@Book{Hwu:2012:GCG,
  editor =       "Wen-mei Hwu",
  title =        "{GPU} computing gems",
  publisher =    "Morgan Kaufmann",
  address =      "Boston, MA",
  edition =      "Jade",
  pages =        "xvi + 541 + 16",
  year =         "2012",
  ISBN =         "0-12-385963-8 (hardback)",
  ISBN-13 =      "978-0-12-385963-1 (hardback)",
  LCCN =         "T385 .G6875 2012",
  bibdate =      "Sat Feb 8 18:16:05 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/elefunt.bib;
                 http://www.math.utah.edu/pub/tex/bib/matlab.bib;
                 http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  series =       "Applications of GPU computing series",
  abstract =     "Since the introduction of CUDA in 2007, more than 100
                 million computers with CUDA capable GPUs have been
                 shipped to end users. GPU computing application
                 developers can now expect their application to have a
                 mass market. With the introduction of OpenCL in 2010,
                 researchers can now expect to develop GPU applications
                 that can run on hardware from multiple vendors.",
  acknowledgement = ack-nhfb,
  subject =      "Graphics processing units; Programming; Imaging
                 systems; Computer graphics; Image processing; Digital
                 techniques",
  tableofcontents = "Part 1: Parallel Algorithms and Data Structures ---
                 Paulius Micikevicius, NVIDIA \\
                 1 Large-Scale GPU Search \\
                 2 Edge v. Node Parallelism for Graph Centrality Metrics
                 \\
                 3 Optimizing parallel prefix operations for the Fermi
                 architecture \\
                 4 Building an Efficient Hash Table on the GPU \\
                 5 An Efficient CUDA Algorithm for the Maximum Network
                 Flow Problem \\
                 6 On Improved Memory Access Patterns for Cellular
                 Automata Using CUDA \\
                 7 Fast Minimum Spanning Tree Computation on Large
                 Graphs \\
                 8 Fast in-place sorting with CUDA based on bitonic sort
                 \\
                 Part 2: Numerical Algorithms --- Frank Jargstorff,
                 NVIDIA \\
                 9 Interval Arithmetic in CUDA \\
                 10 Approximating the erfinv Function \\
                 11 A Hybrid Method for Solving Tridiagonal Systems on
                 the GPU \\
                 12 LU Decomposition in CULA \\
                 13 GPU Accelerated Derivative-free Optimization \\
                 Part 3: Engineering Simulation --- Peng Wang, NVIDIA
                 \\
                 14 Large-scale gas turbine simulations on GPU clusters
                 \\
                 15 GPU acceleration of rarefied gas dynamic simulations
                 \\
                 16 Assembly of Finite Element Methods on Graphics
                 Processors \\
                 17 CUDA implementation of Vertex-Centered, Finite
                 Volume CFD methods on Unstructured Grids with Flow
                 Control Applications \\
                 18 Solving Wave Equations on Unstructured Geometries
                 \\
                 19 Fast electromagnetic integral equation solvers on
                 graphics processing units (GPUs) \\
                 Part 4: Interactive Physics for Games and Engineering
                 Simulation --- Richard Tonge, NVIDIA \\
                 20 Solving Large Multi-Body Dynamics Problems on the
                 GPU \\
                 21 Implicit FEM Solver in CUDA \\
                 22 Real-time Adaptive GPU multi-agent path planning \\
                 Part 5: Computational Finance --- Thomas Bradley,
                 NVIDIA \\
                 23 High performance finite difference PDE solvers on
                 GPUs for financial option pricing \\
                 24 Identifying and Mitigating Credit Risk using
                 Large-scale Economic Capital Simulations \\
                 25 Financial Market Value-at-Risk Estimation using the
                 Monte Carlo Method \\
                 Part 6: Programming Tools and Techniques --- Cliff
                 Wooley, NVIDIA \\
                 26 Thrust: A Productivity-Oriented Library for CUDA \\
                 27 GPU Scripting and Code Generation with PyCUDA \\
                 28 Jacket: GPU Powered MATLAB Acceleration \\
                 29 Accelerating Development and Execution Speed with
                 Just In Time GPU Code Generation \\
                 30 GPU Application Development, Debugging, and
                 Performance Tuning with GPU Ocelot \\
                 31 Abstraction for AoS and SoA Layout in C++ \\
                 32 Processing Device Arrays with C++ Metaprogramming
                 \\
                 33 GPU Metaprogramming: A Case Study in
                 Biologically-Inspired Machine Vision \\
                 34 A Hybridization Methodology for High-Performance
                 Linear Algebra Software for GPUs \\
                 35 Dynamic Load Balancing using Work-Stealing \\
                 36 Applying software-managed caching and CPU/GPU task
                 scheduling for accelerating dynamic workloads",
}

%%% Two-page PPOPP '12 paper, SIGPLAN Notices 47(8).  A stray footnote
%%% marker was removed from the first word of the abstract.
@Article{Jiang:2012:OSP,
  author =       "Lei Jiang and Pragneshkumar B. Patel and George
                 Ostrouchov and Ferdinand Jamitzky",
  title =        "{OpenMP}-style parallelism in data-centered multicore
                 computing with {R}",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "8",
  pages =        "335--336",
  month =        aug,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2370036.2145882",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Sep 12 12:11:57 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPOPP '12 conference proceedings.",
  abstract =     "R is a domain specific language widely used for
                 data analysis by the statistics community as well as by
                 researchers in finance, biology, social sciences, and
                 many other disciplines. As R programs are linked to
                 input data, the exponential growth of available data
                 makes high-performance computing with R imperative. To
                 ease the process of writing parallel programs in R,
                 code transformation from a sequential program to a
                 parallel version would bring much convenience to R
                 users. In this paper, we present our work in
                 semi-automatic parallelization of R codes with
                 user-added OpenMP-style pragmas. While such pragmas are
                 used at the frontend, we take advantage of multiple
                 parallel backends with different R packages. We provide
                 flexibility for importing parallelism with plug-in
                 components, impose built-in MapReduce for data
                 processing, and also maintain code reusability. We
                 illustrate the advantage of the on-the-fly mechanisms
                 which can lead to significant applications in
                 data-centered parallel computing.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

%%% Lecture Notes in Computer Science chapter recorded as an Article,
%%% with the series as the journal and the LNCS number as the volume.
%%% book-DOI and book-URL identify the containing volume.  No number,
%%% month, or abstract field is recorded.
@Article{Johnson:2012:FOL,
  author =       "Tim Johnson and Pierre Fite-Georgel and Rahul Raguram
                 and Jan-Michael Frahm",
  title =        "Fast Organization of Large Photo Collections Using
                 {CUDA}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "6554",
  pages =        "463--476",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-35740-4_36",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Dec 24 08:20:14 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/content/pdf/10.1007/978-3-642-35740-4_36",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-35740-4",
  book-URL =     "http://www.springerlink.com/content/978-3-642-35740-4",
  fjournal =     "Lecture Notes in Computer Science",
}

%%% ACM SIGARCH Computer Architecture News 40(5); the note field ties
%%% the article to the HEART '12 conference proceedings.  The title
%%% brace-protects Smith--Waterman; the abstract spells it in lowercase.
@Article{Kakimoto:2012:PCG,
  author =       "Takeshi Kakimoto and Keisuke Dohi and Yuichiro Shibata
                 and Kiyoshi Oguri",
  title =        "Performance comparison of {GPU} programming frameworks
                 with the striped {Smith--Waterman} algorithm",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "5",
  pages =        "70--75",
  month =        dec,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2460216.2460229",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sun May 5 09:49:56 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "HEART '12 conference proceedings.",
  abstract =     "This paper evaluates and discusses how different GPU
                 programming frameworks affect the performance obtained
                 from GPU acceleration of the striped smith-waterman
                 algorithm used for biological sequence alignment. A
                 total of 6 GPU implementations of the algorithm on
                 NVIDIA GT200b and AMD RV870 using the CUDA and the
                 OpenCL frameworks are compared to analyze cons and pros
                 of explicit descriptions for architecture specific
                 hardware mechanisms in the code. The evaluation results
                 show that the primitive descriptions with the CUDA are
                 still efficient especially for small size data, while
                 better instruction scheduling and optimizations are
                 carried out by the OpenCL compiler. On the other hand,
                 the combination of OpenCL and RV870 which provides a
                 relatively simple view of the architecture is efficient
                 for the large data size.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% The Journal of Supercomputing 62(1).  This entry has no DOI or
%%% abstract field; the URL field is a SpringerLink OpenURL query built
%%% from the ISSN, volume, issue, and starting page.
@Article{Kanal:2012:MMC,
  author =       "M. E. Kanal and M. Demiralp",
  title =        "A modified method of calculating {High Dimensional
                 Model Representation (HDMR) Terms} for parallelization
                 with {MPI} and {CUDA}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "62",
  number =       "1",
  pages =        "199--213",
  month =        oct,
  year =         "2012",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Fri Oct 26 07:42:33 MDT 2012",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=62&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=62&issue=1&spage=199",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

%%% The Journal of Supercomputing 59(2).  This entry has no DOI or
%%% abstract field; the URL field is a SpringerLink OpenURL query built
%%% from the ISSN, volume, issue, and starting page.
@Article{Kanal:2012:PAI,
  author =       "M. E. Kanal",
  title =        "Parallel algorithm on inversion for adjacent
                 pentadiagonal matrices with {MPI}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "59",
  number =       "2",
  pages =        "1071--1078",
  month =        feb,
  year =         "2012",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Fri Apr 6 17:44:43 MDT 2012",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=59&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=59&issue=2&spage=1071",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

%%% Lecture Notes in Computer Science chapter recorded as an Article,
%%% with the series as the journal and the LNCS number as the volume.
%%% book-DOI and book-URL identify the containing volume.  No number,
%%% month, or abstract field is recorded.
@Article{Karrenberg:2012:IPO,
  author =       "Ralf Karrenberg and Sebastian Hack",
  title =        "Improving Performance of {OpenCL} on {CPUs}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7210",
  pages =        "1--20",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-28652-0_1",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:26:22 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012c.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-28652-0_1/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-28652-0",
  book-URL =     "http://www.springerlink.com/content/978-3-642-28652-0",
  fjournal =     "Lecture Notes in Computer Science",
}

%%% Two-page PPOPP '12 paper, SIGPLAN Notices 47(8).  The title writes
%%% the CPU/GPU pair with \slash inside a brace group, which also
%%% protects the capitals from style recasing.
@Article{Kim:2012:OUP,
  author =       "Jungwon Kim and Sangmin Seo and Jun Lee and Jeongho
                 Nah and Gangwon Jo and Jaejin Lee",
  title =        "{OpenCL} as a unified programming model for
                 heterogeneous {CPU\slash GPU} clusters",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "8",
  pages =        "299--300",
  month =        aug,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2370036.2145863",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Sep 12 12:11:57 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPOPP '12 conference proceedings.",
  abstract =     "In this paper, we propose an OpenCL framework for
                 heterogeneous CPU/GPU clusters, and show that the
                 framework achieves both high performance and ease of
                 programming. The framework provides an illusion of a
                 single system for the user. It allows the application
                 to utilize multiple heterogeneous compute devices, such
                 as multicore CPUs and GPUs, in a remote node as if they
                 were in a local node. No communication API, such as the
                 MPI library, is required in the application source. We
                 implement the OpenCL framework and evaluate its
                 performance on a heterogeneous CPU/GPU cluster that
                 consists of one host node and nine compute nodes using
                 eleven OpenCL benchmark applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

%%% Two-page PPOPP '12 paper, SIGPLAN Notices 47(8), on converting
%%% hand-written packing code into MPI Datatypes (per the abstract).
@Article{Kjolstad:2012:ADG,
  author =       "Fredrik Kjolstad and Torsten Hoefler and Marc Snir",
  title =        "Automatic datatype generation and optimization",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "8",
  pages =        "327--328",
  month =        aug,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2370036.2145878",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Sep 12 12:11:57 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPOPP '12 conference proceedings.",
  abstract =     "Many high performance applications spend considerable
                 time packing noncontiguous data into contiguous
                 communication buffers. MPI Datatypes provide an
                 alternative by describing noncontiguous data layouts.
                 This allows sophisticated hardware to retrieve data
                 directly from application data structures. However,
                 packing codes in real-world applications are often
                 complex and specifying equivalent datatypes is
                 difficult, time-consuming, and error prone. We present
                 an algorithm that automates the transformation. We have
                 implemented the algorithm in a tool that transforms
                 packing code to MPI Datatypes, and evaluated it by
                 transforming 90 packing codes from the NAS Parallel
                 Benchmarks. The transformation allows easy porting of
                 applications to new machines that benefit from
                 datatypes, thus improving programmer productivity.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

%%% Lecture Notes in Computer Science chapter recorded as an Article,
%%% with the series as the journal and the LNCS number as the volume.
%%% The asterisk in the brace group OpenMP* is part of the title as
%%% recorded.  No number, month, or abstract field is recorded.
@Article{Klemm:2012:EOV,
  author =       "Michael Klemm and Alejandro Duran and Xinmin Tian and
                 Hideki Saito and Diego Caballero",
  title =        "Extending {OpenMP*} with Vector Constructs for Modern
                 Multicore {SIMD} Architectures",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7312",
  pages =        "59--72",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-30961-8_5",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:19:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-30961-8_5/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-30961-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-30961-8",
  fjournal =     "Lecture Notes in Computer Science",
}

%%% Parallel Computing 38(3), 2012.  The first author's umlaut is
%%% written as a braced accent command in the author field, while the
%%% citation label uses the plain ASCII spelling.  No abstract field.
@Article{Klockner:2012:PPS,
  author =       "Andreas Kl{\"o}ckner and Nicolas Pinto and Yunsup Lee
                 and Bryan Catanzaro and Paul Ivanov and Ahmed Fasih",
  title =        "{PyCUDA} and {PyOpenCL}: a scripting-based approach to
                 {GPU} run-time code generation",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "38",
  number =       "3",
  pages =        "157--174",
  month =        mar,
  year =         "2012",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2011.09.001",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Feb 4 15:17:36 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819111001281",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

%%% PPOPP '12 paper, SIGPLAN Notices 47(8), on translating OpenMP
%%% programs to run on clusters (per the abstract).  The percent sign
%%% in the abstract is backslash-escaped for TeX.
@Article{Kwon:2012:HAO,
  author =       "Okwan Kwon and Fahed Jubair and Rudolf Eigenmann and
                 Samuel Midkiff",
  title =        "A hybrid approach of {OpenMP} for clusters",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "8",
  pages =        "75--84",
  month =        aug,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2370036.2145827",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Sep 12 12:11:57 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPOPP '12 conference proceedings.",
  abstract =     "We present the first fully automated compiler-runtime
                 system that successfully translates and executes OpenMP
                 shared-address-space programs on laboratory-size
                 clusters, for the complete set of regular, repetitive
                 applications in the NAS Parallel Benchmarks. We
                 introduce a hybrid compiler-runtime translation scheme.
                 Compared to previous work, this scheme features a new
                 runtime data flow analysis and new compiler techniques
                 for improving data affinity and reducing communication
                 costs. We present and discuss the performance of our
                 translated programs, and compare them with the
                 performance of the MPI, HPF and UPC versions of the
                 benchmarks. The results show that our translated
                 programs achieve 75\% of the hand-coded MPI programs,
                 on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

%%% Communications of the ACM 55(5), May 2012.  The abstract sets the
%%% multiplication signs in the speedup figures in TeX math mode.
@Article{Lashuk:2012:MPA,
  author =       "Ilya Lashuk and Aparna Chandramowlishwaran and Harper
                 Langston and Tuan-Anh Nguyen and Rahul Sampath and
                 Aashay Shringarpure and Richard Vuduc and Lexing Ying
                 and Denis Zorin and George Biros",
  title =        "A massively parallel adaptive fast multipole method on
                 heterogeneous architectures",
  journal =      j-CACM,
  volume =       "55",
  number =       "5",
  pages =        "101--109",
  month =        may,
  year =         "2012",
  CODEN =        "CACMA2",
  DOI =          "https://doi.org/10.1145/2160718.2160740",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Wed May 9 07:19:14 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/cacm/;
                 http://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib;
                 http://www.math.utah.edu/pub/tex/bib/cacm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "We describe a parallel fast multipole method (FMM) for
                 highly nonuniform distributions of particles. We employ
                 both distributed memory parallelism (via MPI) and
                 shared memory parallelism (via OpenMP and GPU
                 acceleration) to rapidly evaluate two-body
                 nonoscillatory potentials in three dimensions on
                 heterogeneous high performance computing architectures.
                 We have performed scalability tests with up to 30
                 billion particles on 196,608 cores on the
                 AMD/CRAY-based Jaguar system at ORNL. On a GPU-enabled
                 system (NSF's Keeneland at Georgia Tech/ORNL), we
                 observed 30$ \times $ speedup over a single core CPU
                 and 7$ \times $ speedup over a multicore CPU
                 implementation. By combining GPUs with MPI, we achieve
                 less than 10 ns/particle and six digits of accuracy for
                 a run with 48 million nonuniformly distributed
                 particles on 192 GPUs.",
  acknowledgement = ack-nhfb,
  fjournal =     "Communications of the ACM",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J79",
}

%%% Conference paper with no booktitle of its own; it cross-references
%%% Hollingsworth:2012:SPI, which is not visible in this part of the
%%% file.  Pages use the article-number form 23:1--23:??, where the
%%% "??" presumably marks an unknown final page (verify against the
%%% proceedings).
@InProceedings{Lee:2012:EED,
  author =       "Seyong Lee and Jeffrey S. Vetter",
  title =        "Early evaluation of directive-based {GPU} programming
                 models for productive exascale computing",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "23:1--23:??",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a051.pdf",
  abstract =     "Graphics Processing Unit (GPU)-based parallel computer
                 architectures have shown increased popularity as a
                 building block for high performance computing, and
                 possibly for future Exascale computing. However, their
                 programming complexity remains as a major hurdle for
                 their widespread adoption. To provide better
                 abstractions for programming GPU architectures,
                 researchers and vendors have proposed several
                 directive-based GPU programming models. These
                 directive-based models provide different levels of
                 abstraction, and required different levels of
                 programming effort to port and optimize applications.
                 Understanding these differences among these new models
                 provides valuable insights on their applicability and
                 performance potential. In this paper, we evaluate
                 existing directive-based models by porting thirteen
                 application kernels from various scientific domains to
                 use CUDA GPUs, which, in turn, allows us to identify
                 important issues in the functionality, scalability,
                 tunability, and debuggability of the existing models.
                 Our evaluation shows that directive-based models can
                 achieve reasonable performance, compared to
                 hand-written GPU codes.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
}

%%% Self-contained conference entry (no crossref).  The editor, pages,
%%% and LCCN values consist only of question marks, apparently as
%%% placeholders for data not yet known; the remark field says the
%%% publisher offers only lecture slides.
@InProceedings{Lee:2012:SMO,
  author =       "Jaejin Lee",
  editor =       "????",
  booktitle =    "{ATIP '12: Proceedings of the ATIP\slash A*CRC
                 Workshop on Accelerator Technologies for
                 High-Performance Computing: Does Asia Lead the Way?}",
  title =        "{SnuCL} and an {MPI $+$ OpenCL} implementation of
                 {HPL} on heterogeneous {CPU\slash GPU} clusters",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "??--??",
  year =         "2012",
  ISBN =         "1-4503-1644-1",
  ISBN-13 =      "978-1-4503-1644-6",
  LCCN =         "????",
  bibdate =      "Wed Nov 14 11:00:18 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  remark =       "Publisher has only PDF of 28 lecture slides",
}

%%% Conference paper with no booktitle of its own; it cross-references
%%% Hollingsworth:2012:SPI, which is not visible in this part of the
%%% file.  Pages use the article-number form 15:1--15:??, where the
%%% "??" presumably marks an unknown final page (verify against the
%%% proceedings).
@InProceedings{Levesque:2012:HEA,
  author =       "John M. Levesque and Ramanan Sankaran and Ray Grout",
  title =        "Hybridizing {S3D} into an exascale application using
                 {OpenACC}: an approach for moving to multi-petaflops
                 and beyond",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "15:1--15:??",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a040.pdf",
  abstract =     "Hybridization is the process of converting an
                 application with a single level of parallelism to an
                 application with multiple levels of parallelism. Over
                 the past 15 years a majority of the applications that
                 run on High Performance Computing systems have employed
                 MPI for all of the parallelism within the application.
                 In the Peta-Exascale computing regime, effective
                 utilization of the hardware requires multiple levels of
                 parallelism matched to the macro architecture of the
                 system to achieve good performance. A hybridized code
                 base is performance portable when sufficient
                 parallelism is expressed in an architecture agnostic
                 form to achieve good performance on a range of
                 available systems. The hybridized S3D code is
                 performance portable across today's leading many core
                 and GPU accelerated systems. The OpenACC framework
                 allows a unified code base to be deployed for either
                 (Manycore CPU or Manycore CPU+GPU) while permitting
                 architecture specific optimizations to expose new
                 dimensions of parallelism to be utilized.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
}

%%% SC 2012 conference paper (article 29) on symbolic race analysis
%%% of CUDA programs; the booktitle and publisher data are inherited
%%% through the crossref to Hollingsworth:2012:SPI.
@InProceedings{Li:2012:PFA,
  author =       "Peng Li and Guodong Li and Ganesh Gopalakrishnan",
  title =        "Parametric flows: automated behavior equivalencing for
                 symbolic analysis of races in {CUDA} programs",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "29:1--29:??",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a009.pdf",
  abstract =     "The growing scale of concurrency requires automated
                 abstraction techniques to cut down the effort in
                 concurrent system analysis. In this paper, we show that
                 the high degree of behavioral symmetry present in GPU
                 programs allows CUDA race detection to be dramatically
                 simplified through abstraction. Our abstraction
                 technique is one of automatically creating parametric
                 flows---control-flow equivalence classes of threads
                 that diverge in the same manner---and checking for data
                 races only across a pair of threads per parametric
                 flow. We have implemented this approach as an extension
                 of our recently proposed GKLEE symbolic analysis
                 framework and show that all our previous results are
                 dramatically improved in that (i) the parametric
                 flow-based analysis takes far less time, and (ii)
                 because of the much higher scalability of the analysis,
                 we can detect even more data race situations that were
                 previously missed by GKLEE because it was forced to
                 downscale examples to limit analysis complexity.
                 Moreover, the parametric flow-based analysis is
                 applicable to other programs with SPMD models.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
}

%%% Lima et al. (2012), Journal of Parallel and Distributed Computing
%%% 72(5):678--692: OpenMP-based algorithms for Kronecker descriptors.
@article{Lima:2012:PEO,
  author          = {Antonio M. Lima and Marco A. S. Netto and Thais Webber
                     and Ricardo M. Czekster and Cesar A. F. {De Rose} and
                     Paulo Fernandes},
  title           = {Performance evaluation of {OpenMP}-based algorithms
                     for handling {Kronecker} descriptors},
  journal         = j-J-PAR-DIST-COMP,
  year            = {2012},
  month           = may,
  volume          = {72},
  number          = {5},
  pages           = {678--692},
  DOI             = {https://doi.org/10.1016/j.jpdc.2012.02.001},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0743731512000354},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  CODEN           = {JPDCER},
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Mar 28 08:37:48 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.sciencedirect.com/science/journal/07437315},
}

%%% Ling, Benkrid and Hamada (2012), ACM SIGARCH Computer Architecture
%%% News 40(5):52--57: phylogenetic analysis on CUDA-compatible GPUs
%%% (HEART '12 proceedings issue).
@article{Ling:2012:HPP,
  author          = {Cheng Ling and Khaled Benkrid and Tsuyoshi Hamada},
  title           = {High performance phylogenetic analysis on
                     {CUDA}-compatible {GPUs}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2012},
  month           = dec,
  volume          = {40},
  number          = {5},
  pages           = {52--57},
  DOI             = {https://doi.org/10.1145/2460216.2460226},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  CODEN           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J89},
  note            = {HEART '12 conference proceedings.},
  abstract        = {The operation of phylogenetic analysis aims to
                     investigate the evolution and relationships among
                     species. It is widely used in the fields of system
                     biology and comparative genomics. However, phylogenetic
                     analysis is also a computationally intensive operation
                     as the number of tree topology grows in a factorial way
                     with the number of species involved. Therefore, due to
                     the large number of species in the real world, the
                     computational burden has largely thwarted phylogenetic
                     reconstruction. In this paper, we describe the detailed
                     GPU-based multi-threaded design and implementation of a
                     Markov Chain Monte Carlo (MCMC) maximum likelihood
                     algorithm for phylogenetic analysis on a set of aligned
                     nucleotide sequences. The implementation is based on
                     the framework of the most widely used phylogenetic
                     analysis tool, namely MrBayes. The proposed approach
                     resulted in 6x-8x speed-up on an NVidia Geforce 460 GTX
                     GPU compared to an optimized GPP-based software
                     implementation running on a desktop computer with a
                     single Intel Xeon 2.53 GHz CPU and 6.0 GB RAM.},
  acknowledgement = ack-nhfb,
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigarch.bib},
}

%%% Maheo et al. (2012), Lecture Notes in Computer Science
%%% 7312:254--257: adaptive OpenMP for large NUMA nodes.
@article{Maheo:2012:AOL,
  author          = {Aur{\`e}le Mah{\'e}o and Souad Kolia{\"\i} and Patrick
                     Carribault and Marc P{\'e}rache and William Jalby},
  title           = {Adaptive {OpenMP} for Large {NUMA} Nodes},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2012},
  volume          = {7312},
  pages           = {254--257},
  DOI             = {https://doi.org/10.1007/978-3-642-30961-8_20},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-30961-8_20/},
  book-DOI        = {https://doi.org/10.1007/978-3-642-30961-8},
  book-URL        = {http://www.springerlink.com/content/978-3-642-30961-8},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  CODEN           = {LNCSD9},
  fjournal        = {Lecture Notes in Computer Science},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Dec 19 15:19:49 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Mainland (2012), ACM SIGPLAN Notices 47(9):311--322: MetaHaskell,
%%% heterogeneous metaprogramming with multiple object languages
%%% (ICFP '12 proceedings issue, recorded in the remark field).
@Article{Mainland:2012:EHM,
  author =       "Geoffrey Mainland",
  title =        "Explicitly heterogeneous metaprogramming with
                 {MetaHaskell}",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "9",
  pages =        "311--322",
  month =        sep,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2398856.2364572",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Nov 15 16:40:19 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Languages with support for metaprogramming, like
                 MetaOCaml, offer a principled approach to code
                 generation by guaranteeing that well-typed metaprograms
                 produce well-typed programs. However, many problem
                 domains where metaprogramming can fruitfully be applied
                 require generating code in languages like C, CUDA, or
                 assembly. Rather than resorting to ad-hoc code
                 generation techniques, these applications should be
                 directly supported by explicitly heterogeneous
                 metaprogramming languages. We present MetaHaskell, an
                 extension of Haskell 98 that provides modular syntactic
                 and type system support for type safe metaprogramming
                 with multiple object languages. Adding a new object
                 language to MetaHaskell requires only minor
                 modifications to the host language to support
                 type-level quantification over object language types
                 and propagation of type equality constraints. We
                 demonstrate the flexibility of our approach through
                 three object languages: a core ML language, a linear
                 variant of the core ML language, and a subset of C. All
                 three languages support metaprogramming with open terms
                 and guarantee that well-typed MetaHaskell programs will
                 only produce closed object terms that are well-typed.
                 The essence of MetaHaskell is captured in a type system
                 for a simplified metalanguage. MetaHaskell, as well as
                 all three object languages, are fully implemented in
                 the mhc bytecode compiler.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ICFP '12 conference proceedings.",
}

%%% Malits et al. (2012), ACM Transactions on Architecture and Code
%%% Optimization 8(4), article 29: GPGPU scheduling limits for
%%% control-flow-bound CUDA applications.  The "????" CODEN and the
%%% "29:??" final page are placeholders for values not recorded here.
@Article{Malits:2012:ELG,
  author =       "Roman Malits and Evgeny Bolotin and Avinoam Kolodny
                 and Avi Mendelson",
  title =        "Exploring the limits of {GPGPU} scheduling in control
                 flow bound applications",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "29:1--29:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086708",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "GPGPUs are optimized for graphics, for that reason the
                 hardware is optimized for massively data parallel
                 applications characterized by predictable memory access
                 patterns and little control flow. For such
                 applications, e.g., matrix multiplication, GPGPU based
                 system can achieve very high performance. However, many
                 general purpose data parallel applications are
                 characterized as having intensive control flow and
                 unpredictable memory access patterns. Optimizing the
                 code in such problems for current hardware is often
                 ineffective and even impractical since it exhibits low
                 hardware utilization leading to relatively low
                 performance. This work tracks the root causes of
                 execution inefficacies when running control flow
                 intensive CUDA applications on NVIDIA GPGPU hardware.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Marongiu and Benini (2012), IEEE Transactions on Computers
%%% 61(2):222--236: OpenMP compiler for distributed scratchpad memory.
%%% NOTE(review): the bibdate stamp carries no time zone, unlike the
%%% neighbouring entries; left verbatim pending confirmation.
@article{Marongiu:2012:OCE,
  author          = {Andrea Marongiu and Luca Benini},
  title           = {An {OpenMP} Compiler for Efficient Use of Distributed
                     Scratchpad Memory in {MPSoCs}},
  journal         = j-IEEE-TRANS-COMPUT,
  year            = {2012},
  month           = feb,
  volume          = {61},
  number          = {2},
  pages           = {222--236},
  DOI             = {https://doi.org/10.1109/TC.2010.199},
  ISSN            = {0018-9340 (print), 1557-9956 (electronic)},
  ISSN-L          = {0018-9340},
  CODEN           = {ITCOB4},
  fjournal        = {IEEE Transactions on Computers},
  journal-URL     = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Jan 13 17:55:10 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Martins and Rangel (2012), Lecture Notes in Computer Science
%%% 7409:168--178: phylogenetic distance computation using CUDA.
@article{Martins:2012:PDC,
  author          = {Wellington S. Martins and Thiago F. Rangel},
  title           = {Phylogenetic Distance Computation Using {CUDA}},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2012},
  volume          = {7409},
  pages           = {168--178},
  DOI             = {https://doi.org/10.1007/978-3-642-31927-3_15},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-31927-3_15/},
  book-DOI        = {https://doi.org/10.1007/978-3-642-31927-3},
  book-URL        = {http://www.springerlink.com/content/978-3-642-31927-3},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  CODEN           = {LNCSD9},
  fjournal        = {Lecture Notes in Computer Science},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Dec 19 15:21:56 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012g.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Massetto, Sato and Li (2012), The Journal of Supercomputing
%%% 60(1):87--116: interoperable MPI in heterogeneous HPC systems.
%%% NOTE(review): this entry records no DOI field; the URL is an
%%% OpenURL query -- confirm it still resolves.
@article{Massetto:2012:NSB,
  author          = {Francisco Isidro Massetto and Liria Matsumoto Sato and
                     Kuan-Ching Li},
  title           = {A novel strategy for building interoperable {MPI}
                     environment in heterogeneous high performance systems},
  journal         = j-J-SUPERCOMPUTING,
  year            = {2012},
  month           = apr,
  volume          = {60},
  number          = {1},
  pages           = {87--116},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=60&issue=1&spage=87},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  CODEN           = {JOSUED},
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Apr 6 17:45:24 MDT 2012},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=60&issue=1;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Mehta, Gabriel and Chapman (2012), Lecture Notes in Computer
%%% Science 7312:1--14: parallel I/O interfaces for OpenMP.
@article{Mehta:2012:SPE,
  author          = {Kshitij Mehta and Edgar Gabriel and Barbara Chapman},
  title           = {Specification and Performance Evaluation of Parallel
                     {I/O} Interfaces for {OpenMP}},
  journal         = j-LECT-NOTES-COMP-SCI,
  year            = {2012},
  volume          = {7312},
  pages           = {1--14},
  DOI             = {https://doi.org/10.1007/978-3-642-30961-8_1},
  URL             = {http://link.springer.com/chapter/10.1007/978-3-642-30961-8_1/},
  book-DOI        = {https://doi.org/10.1007/978-3-642-30961-8},
  book-URL        = {http://www.springerlink.com/content/978-3-642-30961-8},
  ISSN            = {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L          = {0302-9743},
  CODEN           = {LNCSD9},
  fjournal        = {Lecture Notes in Computer Science},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Dec 19 15:19:49 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Mittal et al. (2012), ACM SIGPLAN Notices 47(8):315--316:
%%% collective algorithms for sub-communicators (PPOPP '12
%%% proceedings issue).
@article{Mittal:2012:CAS,
  author          = {Anshul Mittal and Nikhil Jain and Thomas George and
                     Yogish Sabharwal and Sameer Kumar},
  title           = {Collective algorithms for sub-communicators},
  journal         = j-SIGPLAN,
  year            = {2012},
  month           = aug,
  volume          = {47},
  number          = {8},
  pages           = {315--316},
  DOI             = {https://doi.org/10.1145/2370036.2145872},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  CODEN           = {SINODQ},
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  note            = {PPOPP '12 conference proceedings.},
  abstract        = {Collective communication over a group of processors is
                     an integral and time consuming component in many HPC
                     applications. Many modern day supercomputers are based
                     on torus interconnects. On such systems, for an
                     irregular communicator comprising of a subset of
                     processors, the algorithms developed so far are not
                     contention free in general and hence non-optimal. In
                     this paper, we present a novel contention-free
                     algorithm to perform collective operations over a
                     subset of processors in a torus network. We also extend
                     previous work on regular communicators to handle
                     special cases of irregular communicators that occur
                     frequently in parallel scientific applications. For the
                     generic case where multiple node disjoint
                     sub-communicators communicate simultaneously in a
                     loosely synchronous fashion, we propose a novel
                     cooperative approach to route the data for individual
                     sub-communicators without contention. Empirical results
                     demonstrate that our algorithms outperform the
                     optimized MPI collective implementation on IBM's Blue
                     Gene/P supercomputer for large data sizes and random
                     node distributions.},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Sep 12 12:11:57 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
}

%%% Mueller et al. (2012), Lecture Notes in Computer Science
%%% 7312:223--236: the SPEC OMP2012 OpenMP benchmark suite.
%%% NOTE(review): only five authors are listed here; confirm the
%%% complete author list against the DOI before relying on it.
@Article{Muller:2012:SOA,
  author =       "Matthias S. M{\"u}ller and John Baron and William C.
                 Brantley and Huiyu Feng and Daniel Hackenberg",
  title =        "{SPEC OMP2012}---An Application Benchmark Suite for
                 Parallel Systems Using {OpenMP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7312",
  pages =        "223--236",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-30961-8_17",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:19:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-30961-8_17/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-30961-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-30961-8",
  fjournal =     "Lecture Notes in Computer Science",
}

%%% Neuberger, Sieben and Swift (2012), International Journal of
%%% Parallel Programming 40(4):443--464: MPI self-submitting job queue.
%%% NOTE(review): this entry records no DOI field; the URL is an
%%% OpenURL query -- confirm it still resolves.
@article{Neuberger:2012:MIS,
  author          = {John M. Neuberger and N{\'a}ndor Sieben and James W.
                     Swift},
  title           = {An {MPI} Implementation of a Self-Submitting Parallel
                     Job Queue},
  journal         = j-INT-J-PARALLEL-PROG,
  year            = {2012},
  month           = aug,
  volume          = {40},
  number          = {4},
  pages           = {443--464},
  URL             = {http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=40&issue=4&spage=443},
  ISSN            = {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L          = {0885-7458},
  CODEN           = {IJPPE5},
  fjournal        = {International Journal of Parallel Programming},
  journal-URL     = {http://link.springer.com/journal/10766},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Oct 26 07:12:55 MDT 2012},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=40&issue=4;
                     http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Ng et al. (2012), ACM SIGARCH Computer Architecture News
%%% 40(5):22--27: session types for reconfigurable programming,
%%% compared against MPI (HEART '12 proceedings issue).
@article{Ng:2012:STT,
  author          = {Nicholas Ng and Nobuko Yoshida and Xin Yu Niu and Kuen
                     Hung Tsoi},
  title           = {Session types: towards safe and fast reconfigurable
                     programming},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2012},
  month           = dec,
  volume          = {40},
  number          = {5},
  pages           = {22--27},
  DOI             = {https://doi.org/10.1145/2460216.2460221},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  CODEN           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J89},
  note            = {HEART '12 conference proceedings.},
  abstract        = {This paper introduces a new programming framework
                     based on the theory of session types for safe,
                     reconfigurable parallel designs. We apply the session
                     type theory to C and Java programming languages and
                     demonstrate that the session-based languages can offer
                     a clear and tractable framework to describe
                     communications between parallel components and
                     guarantee communication-safety and deadlock-freedom by
                     compile-time type checking. Many representative
                     communication topologies such as a ring or
                     scatter-gather can be programmed and verified in
                     session-based programming languages. Case studies
                     involving N-body simulation and Kmeans clustering are
                     used to illustrate the session-based programming style
                     and to demonstrate that the session-based languages
                     perform competitively against MPI counterparts in an
                     FPGA-based heterogeneous cluster, as well as the
                     potential of integrating them with FPGA acceleration.},
  acknowledgement = ack-nhfb,
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigarch.bib},
}

%%% SC 2012 conference paper (article 39) on Bamboo, a
%%% source-to-source translator for MPI C programs; proceedings data
%%% are inherited through the crossref to Hollingsworth:2012:SPI.
@inproceedings{Nguyen:2012:BTM,
  author          = {Tan Nguyen and Pietro Cicotti and Eric Bylaska and Dan
                     Quinlan and Scott B. Baden},
  title           = {{Bamboo}: translating {MPI} applications to a
                     latency-tolerant, data-driven form},
  crossref        = {Hollingsworth:2012:SPI},
  year            = {2012},
  pages           = {39:1--39:??},
  articleno       = {39},
  URL             = {http://conferences.computer.org/sc/2012/papers/1000a032.pdf},
  abstract        = {We present Bamboo, a custom source-to-source
                     translator that transforms MPI C source into a
                     data-driven form that automatically overlaps
                     communication with available computation. Running on up
                     to 98304 processors of NERSC's Hopper system, we
                     observe that Bamboo's overlap capability speeds up MPI
                     implementations of a 3D Jacobi iterative solver and
                     Cannon's matrix multiplication. Bamboo's generated code
                     meets or exceeds the performance of hand optimized MPI,
                     which includes split-phase coding, the method
                     classically employed to hide communication. We achieved
                     our results with only modest amounts of programmer
                     annotation and no intrusive reprogramming of the
                     original application source.},
  acknowledgement = ack-nhfb,
  bibdate         = {Thu Nov 15 07:38:35 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib},
}

%%% Nguyen and Pingali (2012), ACM SIGPLAN Notices 47(4):333--344:
%%% synthesizing concurrent schedulers for irregular algorithms
%%% (ASPLOS '12 proceedings issue, recorded in the remark field).
@article{Nguyen:2012:SCS,
  author          = {Donald Nguyen and Keshav Pingali},
  title           = {Synthesizing concurrent schedulers for irregular
                     algorithms},
  journal         = j-SIGPLAN,
  year            = {2012},
  month           = apr,
  volume          = {47},
  number          = {4},
  pages           = {333--344},
  DOI             = {https://doi.org/10.1145/2248487.1950404},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  CODEN           = {SINODQ},
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {ASPLOS '12 conference proceedings.},
  abstract        = {Scheduling is the assignment of tasks or activities to
                     processors for execution, and it is an important
                     concern in parallel programming. Most prior work on
                     scheduling has focused either on static scheduling of
                     applications in which the dependence graph is known at
                     compile-time or on dynamic scheduling of independent
                     loop iterations such as in OpenMP. In irregular
                     algorithms, dependences between activities are complex
                     functions of runtime values so these algorithms are not
                     amenable to compile-time analysis nor do they consist
                     of independent activities. Moreover, the amount of work
                     can vary dramatically with the scheduling policy. To
                     handle these complexities, implementations of irregular
                     algorithms employ carefully handcrafted,
                     algorithm-specific schedulers but these schedulers are
                     themselves parallel programs, complicating the parallel
                     programming problem further. In this paper, we present
                     a flexible and efficient approach for specifying and
                     synthesizing scheduling policies for irregular
                     algorithms. We develop a simple compositional
                     specification language and show how it can concisely
                     encode scheduling policies in the literature. Then, we
                     show how to synthesize efficient parallel schedulers
                     from these specifications. We evaluate our approach for
                     five irregular algorithms on three multicore
                     architectures and show that (1) the performance of some
                     algorithms can improve by orders of magnitude with the
                     right scheduling policy, and (2) for the same policy,
                     the overheads of our synthesized schedulers are
                     comparable to those of fixed-function schedulers.},
  acknowledgement = ack-nhfb,
  bibdate         = {Thu Jun 7 08:15:03 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
}

%%% Nobari et al. (2012), ACM SIGPLAN Notices 47(8):205--214:
%%% parallel minimum spanning forest computation in CUDA
%%% (PPOPP '12 proceedings issue).
@article{Nobari:2012:SPM,
  author          = {Sadegh Nobari and Thanh-Tung Cao and Panagiotis Karras
                     and St{\'e}phane Bressan},
  title           = {Scalable parallel minimum spanning forest
                     computation},
  journal         = j-SIGPLAN,
  year            = {2012},
  month           = aug,
  volume          = {47},
  number          = {8},
  pages           = {205--214},
  DOI             = {https://doi.org/10.1145/2370036.2145842},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  CODEN           = {SINODQ},
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  note            = {PPOPP '12 conference proceedings.},
  abstract        = {The proliferation of data in graph form calls for the
                     development of scalable graph algorithms that exploit
                     parallel processing environments. One such problem is
                     the computation of a graph's minimum spanning forest
                     (MSF). Past research has proposed several parallel
                     algorithms for this problem, yet none of them scales to
                     large, high-density graphs. In this paper we propose a
                     novel, scalable, parallel MSF algorithm for undirected
                     weighted graphs. Our algorithm leverages Prim's
                     algorithm in a parallel fashion, concurrently expanding
                     several subsets of the computed MSF. Our effort focuses
                     on minimizing the communication among different
                     processors without constraining the local growth of a
                     processor's computed subtree. In effect, we achieve a
                     scalability that previous approaches lacked. We
                     implement our algorithm in CUDA, running on a GPU and
                     study its performance using real and synthetic, sparse
                     as well as dense, structured and unstructured graph
                     data. Our experimental study demonstrates that our
                     algorithm outperforms the previous state-of-the-art
                     GPU-based MSF algorithm, while being several orders of
                     magnitude faster than sequential CPU-based
                     algorithms.},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Sep 12 12:11:57 MDT 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
}

%%% Notz, Pawlowski and Sutherland (2012), ACM Transactions on
%%% Mathematical Software 39(1), article 1 (pages 1:1--1:21):
%%% graph-based design for multiphysics PDE software.
@article{Notz:2012:GBS,
  author          = {Patrick K. Notz and Roger P. Pawlowski and James C.
                     Sutherland},
  title           = {Graph-Based Software Design for Managing Complexity
                     and Enabling Concurrency in Multiphysics {PDE}
                     Software},
  journal         = j-TOMS,
  year            = {2012},
  month           = nov,
  volume          = {39},
  number          = {1},
  pages           = {1:1--1:21},
  articleno       = {1},
  DOI             = {https://doi.org/10.1145/2382585.2382586},
  ISSN            = {0098-3500 (print), 1557-7295 (electronic)},
  ISSN-L          = {0098-3500},
  CODEN           = {ACMSCU},
  fjournal        = {ACM Transactions on Mathematical Software (TOMS)},
  journal-URL     = {http://dl.acm.org/pub.cfm?id=J782},
  abstract        = {Multiphysics simulation software is plagued by
                     complexity stemming from nonlinearly coupled systems of
                     Partial Differential Equations (PDEs). Such software
                     typically supports many models, which may require
                     different transport equations, constitutive laws, and
                     equations of state. Strong coupling and a multiplicity
                     of models leads to complex algorithms (i.e., the
                     properly ordered sequence of steps to assemble a
                     discretized set of coupled PDEs) and rigid software.
                     This work presents a design strategy that shifts focus
                     away from high-level algorithmic concerns to low-level
                     data dependencies. Mathematical expressions are
                     represented as software objects that directly expose
                     data dependencies. The entire system of expressions
                     forms a directed acyclic graph and the high-level
                     assembly algorithm is generated automatically through
                     standard graph algorithms. This approach makes problems
                     with complex dependencies entirely tractable, and
                     removes virtually all logic from the algorithm itself.
                     Changes are highly localized, allowing developers to
                     implement models without detailed understanding of any
                     algorithms (i.e., the overall assembly process).
                     Furthermore, this approach complements existing
                     MPI-based frameworks and can be implemented within them
                     easily. Finally, this approach enables algorithmic
                     parallelization via threads. By exposing dependencies
                     in the algorithm explicitly, thread-based parallelism
                     is implemented through algorithm decomposition,
                     providing a basis for exploiting parallelism
                     independent from domain decomposition approaches.},
  acknowledgement = ack-nhfb,
  bibdate         = {Thu Dec 6 07:36:30 MST 2012},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/toms.bib},
}

@InProceedings{Nukada:2012:SMG,
  author =       "Akira Nukada and Kento Sato and Satoshi Matsuoka",
  title =        "Scalable multi-{GPU} {$3$-D} {FFT} for {TSUBAME 2.0}
                 supercomputer",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "44:1--44:??",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a044.pdf",
  abstract =     "For scalable 3-D FFT computation using multiple GPUs,
                 efficient all-to-all communication between GPUs is the
                 most important factor in good performance.
                 Implementations with point-to-point MPI library
                 functions and CUDA memory copy APIs typically exhibit
                 very large overheads especially for small message sizes
                 in all-to-all communications between many nodes. We
                 propose several schemes to minimize the overheads,
                 including employment of lower-level API of InfiniBand
                 to effectively overlap intra- and inter-node
                 communication, as well as auto-tuning strategies to
                 control scheduling and determine rail assignments. As a
                 result we achieve very good strong scalability as well
                 as good performance, up to 4.8 TFLOPS using 256 nodes
                 of TSUBAME 2.0 Supercomputer (768 GPUs) in double
                 precision.",
  acknowledgement = ack-nhfb,
  articleno =    "44",
}

@Article{OBroin:2012:OIS,
  author =       "Cathal {{\'O} Broin} and L. A. A. Nikolopoulos",
  title =        "An {OpenCL} implementation for the solution of the
                 time-dependent {Schr{\"o}dinger} equation on {GPUs} and
                 {CPUs}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "183",
  number =       "10",
  pages =        "2071--2080",
  month =        oct,
  year =         "2012",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2012.05.009",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Thu Jun 28 15:53:26 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465512001774",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Oh:2012:MOO,
  author =       "Kwang Jin Oh and Ji Hoon Kang and Hun Joo Myung",
  title =        "{mm\_par2.0}: An object-oriented molecular dynamics
                 simulation program parallelized using a hierarchical
                 scheme with {MPI} and {OPENMP}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "183",
  number =       "2",
  pages =        "440--441",
  month =        feb,
  year =         "2012",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2011.08.023",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Feb 11 10:11:01 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465511003407",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Oliveira:2012:CCO,
  author =       "Rafael Sachetto Oliveira and Bernardo Martins Rocha
                 and Ronan Mendon{\c{c}}a Amorim",
  title =        "Comparing {CUDA}, {OpenCL} and {OpenGL}
                 Implementations of the Cardiac Monodomain Equations",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7204",
  pages =        "111--120",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-31500-8_12",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:26:14 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012c.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-31500-8_12/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-31500-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-31500-8",
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Olivier:2012:CMW,
  author =       "Stephen L. Olivier and Bronis R. de Supinski and
                 Martin Schulz and Jan F. Prins",
  title =        "Characterizing and mitigating work time inflation in
                 task parallel programs",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "65:1--65:??",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a066.pdf",
  abstract =     "Task parallelism raises the level of abstraction in
                 shared memory parallel programming to simplify the
                 development of complex applications. However, task
                 parallel applications can exhibit poor performance due
                 to thread idleness, scheduling overheads, and work time
                 inflation --- additional time spent by threads in a
                 multithreaded computation beyond the time required to
                 perform the same work in a sequential computation. We
                 identify the contributions of each factor to lost
                 efficiency in various task parallel OpenMP applications
                 and diagnose the causes of work time inflation in those
                 applications. Increased data access latency can cause
                 significant work time inflation in NUMA systems. Our
                 locality framework for task parallel OpenMP programs
                 mitigates this cause of work time inflation. Our
                 extensions to the Qthreads library demonstrate that
                 locality-aware scheduling can improve performance up to
                 3X compared to the Intel OpenMP task scheduler.",
  acknowledgement = ack-nhfb,
  articleno =    "65",
}

@Article{Olivier:2012:OTS,
  author =       "Stephen L. Olivier and Allan K. Porterfield and Kyle
                 B. Wheeler and Michael Spiegel and Jan F. Prins",
  title =        "{OpenMP} task scheduling strategies for multicore
                 {NUMA} systems",
  journal =      j-IJHPCA,
  volume =       "26",
  number =       "2",
  pages =        "110--124",
  month =        may,
  year =         "2012",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342011434065",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Thu Nov 8 11:31:13 MST 2012",
  bibsource =    "http://hpc.sagepub.com/content/26/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/26/2/110.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
  onlinedate =   "February 7, 2012",
}

@Article{Perla:2012:PAH,
  author =       "Francesca Perla and Paolo Zanetti",
  title =        "Performance Analysis of an Hybrid {MPI\slash OpenMP}
                 {ALM} Software for Life Insurance Policies on
                 Multi-core Architectures",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7312",
  pages =        "250--253",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-30961-8_19",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:19:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-30961-8_19/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-30961-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-30961-8",
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Preissl:2012:CSS,
  author =       "Robert Preissl and Theodore M. Wong and Pallab Datta
                 and Myron Flickner and Raghavendra Singh and Steven K.
                 Esser and William P. Risk and Horst D. Simon and
                 Dharmendra S. Modha",
  title =        "{Compass}: a scalable simulator for an architecture
                 for cognitive computing",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "54:1--54:??",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a085.pdf",
  abstract =     "Inspired by the function, power, and volume of the
                 organic brain, we are developing TrueNorth, a novel
                 modular, non-von Neumann, ultra-low power, compact
                 architecture. TrueNorth consists of a scalable network
                 of neurosynaptic cores, with each core containing
                 neurons, dendrites, synapses, and axons. To set sail
                 for TrueNorth, we developed Compass, a multi-threaded,
                 massively parallel functional simulator and a parallel
                 compiler that maps a network of long-distance pathways
                 in the macaque monkey brain to TrueNorth. We
                 demonstrate near-perfect weak scaling on a 16 rack
                 IBM\reg{} Blue Gene\reg{}/Q (262144 CPUs, 256 TB
                 memory), achieving an unprecedented scale of 256
                 million neurosynaptic cores containing 65 billion
                 neurons and 16 trillion synapses running only 388X
                 slower than real time with an average spiking rate of
                 8.1 Hz. By using emerging PGAS communication
                 primitives, we also demonstrate 2X better real-time
                 performance over MPI primitives on a 4 rack Blue Gene/P
                 (16384 CPUs, 16 TB memory).",
  acknowledgement = ack-nhfb,
  articleno =    "54",
}

@Article{Puzniakowski:2012:TOI,
  author =       "Tadeusz Pu{\'z}niakowski and Marek A. Bednarczyk",
  title =        "Towards an {OpenCL} Implementation of `Genetic
                 Algorithms' on {GPUs}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7053",
  pages =        "190--203",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-25261-7_15",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:23:16 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-25261-7_15/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-25261-7",
  book-URL =     "http://www.springerlink.com/content/978-3-642-25261-7",
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Qiu:2012:PWM,
  author =       "Judy Qiu and Seung-Hee Bae",
  title =        "Performance of {Windows} multicore systems on
                 threading and {MPI}",
  journal =      j-CCPE,
  volume =       "24",
  number =       "1",
  pages =        "14--28",
  month =        jan,
  year =         "2012",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1762",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Jan 16 12:11:17 MST 2012",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "23 May 2011",
}

@InProceedings{Rietmann:2012:FAS,
  author =       "Max Rietmann and Peter Messmer and Tarje Nissen-Meyer
                 and Daniel Peter and Piero Basini and Dimitri
                 Komatitsch and Olaf Schenk and Jeroen Tromp and Lapo
                 Boschi and Domenico Giardini",
  title =        "Forward and adjoint simulations of seismic wave
                 propagation on emerging large-scale {GPU}
                 architectures",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "38:1--38:??",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a104.pdf",
  abstract =     "Computational seismology is an area of wide
                 sociological and economic impact, ranging from
                 earthquake risk assessment to subsurface imaging and
                 oil and gas exploration. At the core of these
                 simulations is the modeling of wave propagation in a
                 complex medium. Here we report on the extension of the
                 high-order finite-element seismic wave simulation
                 package SPECFEM3D to support the largest scale hybrid
                 and homogeneous supercomputers. Starting from an
                 existing highly tuned MPI code, we migrated to a CUDA
                 version. In order to be of immediate impact to the
                 science mission of computational seismologists, we had
                 to port the entire production package, rather than just
                 individual kernels. One of the challenges in
                 parallelizing finite element codes is the potential for
                 race conditions during the assembly phase. We therefore
                 investigated different methods such as mesh coloring or
                 atomic updates on the GPU. In order to achieve strong
                 scaling, we needed to ensure good overlap of data
                 motion at all levels, including internode and
                 host-accelerator transfers. Finally we carefully tuned
                 the GPU implementation. The new MPI/CUDA solver
                 exhibits excellent scalability and achieves speedup on
                 a node-to-node basis over the carefully tuned
                 equivalent multi-core MPI solver. To demonstrate the
                 performance of both the forward and adjoint
                 functionality, we present two case studies run on the
                 Cray XE6 CPU and Cray XK6 GPU architectures up to 896
                 nodes: (1) focusing on most commonly used forward
                 simulations, we simulate seismic wave propagation
                 generated by earthquakes in Turkey, and (2) testing the
                 most complex seismic inversion type of the package, we
                 use ambient seismic noise to image 3-D crust and mantle
                 structure beneath western Europe.",
  acknowledgement = ack-nhfb,
  articleno =    "38",
}

@Article{Royuela:2012:ASO,
  author =       "Sara Royuela and Alejandro Duran and Chunhua Liao and
                 Daniel J. Quinlan",
  title =        "Auto-scoping for {OpenMP} Tasks",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7312",
  pages =        "29--43",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-30961-8_3",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:19:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-30961-8_3/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-30961-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-30961-8",
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Rubio-Largo:2012:UMO,
  author =       "{\'A}lvaro Rubio-Largo and Miguel A.
                 Vega-Rodr{\'\i}guez and Juan A. G{\'o}mez-Pulido",
  title =        "Using a Multiobjective {OpenMP+MPI DE} for the Static
                 {RWA} Problem",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "6927",
  pages =        "224--231",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-27549-4_29",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Dec 24 07:13:54 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/content/pdf/10.1007/978-3-642-27549-4_29",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-27549-4",
  book-URL =     "http://www.springerlink.com/content/978-3-642-27549-4",
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Sabne:2012:ECO,
  author =       "Amit Sabne and Putt Sakdhnagool and Rudolf Eigenmann",
  title =        "Effects of Compiler Optimizations in {OpenMP} to
                 {CUDA} Translation",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7312",
  pages =        "169--181",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-30961-8_13",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:19:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-30961-8_13/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-30961-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-30961-8",
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Samadi:2012:AIA,
  author =       "Mehrzad Samadi and Amir Hormati and Mojtaba Mehrara
                 and Janghaeng Lee and Scott Mahlke",
  title =        "Adaptive input-aware compilation for graphics
                 engines",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "6",
  pages =        "13--22",
  month =        jun,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2345156.2254067",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 6 16:31:49 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PLDI '12 proceedings.",
  abstract =     "While graphics processing units (GPUs) provide
                 low-cost and efficient platforms for accelerating high
                 performance computations, the tedious process of
                 performance tuning required to optimize applications is
                 an obstacle to wider adoption of GPUs. In addition to
                 the programmability challenges posed by GPU's complex
                 memory hierarchy and parallelism model, a well-known
                 application design problem is target portability across
                 different GPUs. However, even for a single GPU target,
                 changing a program's input characteristics can make an
                 already-optimized implementation of a program perform
                 poorly. In this work, we propose Adaptic, an adaptive
                 input-aware compilation system to tackle this
                 important, yet overlooked, input portability problem.
                 Using this system, programmers develop their
                 applications in a high-level streaming language and let
                 Adaptic undertake the difficult task of input portable
                 optimizations and code generation. Several input-aware
                 optimizations are introduced to make efficient use of
                 the memory hierarchy and customize thread composition.
                 At runtime, a properly optimized version of the
                 application is executed based on the actual program
                 input. We perform a head-to-head comparison between the
                 Adaptic generated and hand-optimized CUDA programs. The
                 results show that Adaptic is capable of generating
                 codes that can perform on par with their hand-optimized
                 counterparts over certain input ranges and outperform
                 them when the input falls out of the hand-optimized
                 programs' ``comfort zone''. Furthermore, we show that
                 input-aware results are sustainable across different
                 GPU targets making it possible to write and optimize
                 applications once and run them anywhere.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Santos:2012:ICC,
  author =       "Bruno F. L. Santos and Hendrik T. Macedo",
  title =        "Improving {CUDA{\TM} C\slash C++} encoding readability
                 to foster parallel application development",
  journal =      j-SIGSOFT,
  volume =       "37",
  number =       "1",
  pages =        "1--5",
  month =        jan,
  year =         "2012",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/2088883.2088897",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:16:09 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigsoft2010.bib",
  abstract =     "Graphical Processing Units (GPUs) have recently been
                 used to enable parallel application development. The
                 most prominent initiative has been provided by
                 NVIDIA{\TM} with the so-called CUDA{\TM} architecture,
                 designed to GeForce{\TM} graphic cards. However, even
                 with CUDA C-like programming language, parallel
                 codification remains somewhat awkward if compared to
                 sequential codification. The programmer still has to
                 deal with low-level hardware details such as generation
                 and synchronization of threads and GPU tracks and
                 sectors. In this paper, we propose a
                 programmer-friendly interface for CUDA-C programming,
                 in such a way that most hardware details are hidden
                 from the programmer. We show how code readability is
                 improved without undermining parallel execution
                 performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Article{Satake:2012:OGA,
  author =       "Shin-ichi Satake and Hajime Yoshimori and Takayuki
                 Suzuki",
  title =        "Optimizations of a {GPU} accelerated heat conduction
                 equation by a programming of {CUDA Fortran} from an
                 analysis of a {PTX} file",
  journal =      j-COMP-PHYS-COMM,
  volume =       "183",
  number =       "11",
  pages =        "2376--2385",
  month =        nov,
  year =         "2012",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2012.06.005",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Fri Jul 27 07:00:54 MDT 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465512002068",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@InProceedings{Schindewolf:2012:WSA,
  author =       "Martin Schindewolf and Barna Bihari and John
                 Gyllenhaal and Martin Schulz and Amy Wang and Wolfgang
                 Karl",
  title =        "What scientific applications can benefit from hardware
                 transactional memory?",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "90:1--90:??",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a073.pdf",
  abstract =     "Achieving efficient and correct synchronization of
                 multiple threads is a difficult and error-prone task at
                 small scale and, as we march towards extreme scale
                 computing, will be even more challenging when the
                 resulting application is supposed to utilize millions
                 of cores efficiently. Transactional Memory (TM) is a
                 promising technique to ease the burden on the
                 programmer, but only recently has become available on
                 commercial hardware in the new Blue Gene/Q system and
                 hence the real benefit for realistic applications has
                 not been studied yet. This paper presents the first
                 performance results of TM embedded into OpenMP on a
                 prototype system of BG/Q and characterizes code
                 properties that will likely lead to benefits when
                 augmented with TM primitives. We first study the
                 influence of thread count, environment variables and
                 memory layout on TM performance and identify code
                 properties that will yield performance gains with TM.
                 Second, we evaluate the combination of OpenMP with
                 multiple synchronization primitives on top of MPI to
                 determine suitable task to thread ratios per node.
                 Finally, we condense our findings into a set of best
                 practices. These are applied to a Monte Carlo Benchmark
                 and a Smoothed Particle Hydrodynamics method. In both
                 cases an optimized TM version, executed with 64 threads
                 on one node, outperforms a simple TM implementation.
                 MCB with optimized TM yields a speedup of 27.45 over
                 baseline.",
  acknowledgement = ack-nhfb,
  articleno =    "90",
}

@Article{Schmidl:2012:PAT,
  author =       "Dirk Schmidl and Peter Philippen and Daniel Lorenz and
                 Christian R{\"o}ssel and Markus Geimer",
  title =        "Performance Analysis Techniques for Task-Based
                 {OpenMP} Applications",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7312",
  pages =        "196--209",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-30961-8_15",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:19:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-30961-8_15/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-30961-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-30961-8",
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Schneider:2012:MAC,
  author =       "Timo Schneider and Robert Gerstenberger and Torsten
                 Hoefler",
  title =        "Micro-applications for Communication Data Access
                 Patterns and {MPI} Datatypes",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7490",
  pages =        "121--131",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-33518-1_17",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:23:42 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-33518-1_17/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-33518-1",
  book-URL =     "http://www.springerlink.com/content/978-3-642-33518-1",
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Sehrish:2012:RFS,
  author =       "Saba Sehrish and Jun Wang",
  title =        "{Reduced Function Set Abstraction (RFSA)} for
                 {MPI-IO}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "59",
  number =       "1",
  pages =        "131--146",
  month =        jan,
  year =         "2012",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Tue Dec 13 15:25:33 MST 2011",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=59&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=59&issue=1&spage=131",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Shan:2012:OAA,
  author =       "Hongzhang Shan and Erich Strohmaier and James Amundson
                 and Eric G. Stern",
  title =        "Optimizing the Advanced Accelerator Simulation
                 Framework {Synergia} Using {OpenMP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "7312",
  pages =        "140--153",
  year =         "2012",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-30961-8_11",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Dec 19 15:19:49 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-642-30961-8_11/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-642-30961-8",
  book-URL =     "http://www.springerlink.com/content/978-3-642-30961-8",
  fjournal =     "Lecture Notes in Computer Science",
}

@article{Shan:2012:PEH,
  author = {Hongzhang Shan and Nicholas J. Wright and John Shalf
    and Katherine Yelick and Marcus Wagner and Nathan
    Wichmann},
  title = {A preliminary evaluation of the hardware acceleration
    of the {Cray Gemini} interconnect for {PGAS} languages
    and comparison with {MPI}},
  journal = j-SIGMETRICS,
  volume = {40},
  number = {2},
  pages = {92--98},
  month = sep,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2381056.2381077},
  issn = {0163-5999 (print), 1557-9484 (electronic)},
  issn-l = {0163-5999},
  bibdate = {Fri Nov 9 11:06:40 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sigmetrics.bib},
  abstract = {The Gemini interconnect on the Cray XE6 platform
    provides for lightweight remote direct memory access
    (RDMA) between nodes, which is useful for implementing
    partitioned global address space (PGAS) languages like
    UPC and Co-Array Fortran. In this paper, we perform a
    study of Gemini performance using a set of
    communication microbenchmarks and compare the
    performance of one-sided communication in PGAS
    languages with two-sided MPI. Our results demonstrate
    the performance benefits of the PGAS model on Gemini
    hardware, showing in what circumstances and by how much
    one-sided communication outperforms two-sided in terms
    of messaging rate, aggregate bandwidth, and computation
    and communication overlap capability. For example, for
    8-byte and 2KB messages the one-sided messaging rate is
    5 and 10 times greater respectively than the two-sided
    one. The study also reveals important information about
    how to optimize one-sided Gemini communication.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGMETRICS Performance Evaluation Review},
  journal-url = {http://portal.acm.org/toc.cfm?id=J618},
}

@article{Sharma:2012:SRP,
  author = {Subodh Sharma and Ganesh Gopalakrishnan},
  title = {A Sound Reduction of Persistent-Sets for Deadlock
    Detection in {MPI} Applications},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {7498},
  pages = {194--209},
  year = {2012},
  coden = {LNCSD9},
  doi = {https://doi.org/10.1007/978-3-642-33296-8_15},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Dec 19 15:23:52 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-33296-8_15/},
  acknowledgement = ack-nhfb,
  book-doi = {https://doi.org/10.1007/978-3-642-33296-8},
  book-url = {http://www.springerlink.com/content/978-3-642-33296-8},
  fjournal = {Lecture Notes in Computer Science},
}

@article{Shi:2012:VGA,
  author = {Lin Shi and Hao Chen and Jianhua Sun and Kenli Li},
  title = {{vCUDA}: {GPU}-Accelerated High-Performance Computing
    in Virtual Machines},
  journal = j-IEEE-TRANS-COMPUT,
  volume = {61},
  number = {6},
  pages = {804--816},
  month = jun,
  year = {2012},
  coden = {ITCOB4},
  doi = {https://doi.org/10.1109/TC.2011.112},
  issn = {0018-9340 (print), 1557-9956 (electronic)},
  issn-l = {0018-9340},
  bibdate = {Fri Jul 27 08:32:31 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/super.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Computers},
  journal-url = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
}

@inproceedings{Speck:2012:MST,
  author = {R. Speck and D. Ruprecht and R. Krause and M. Emmett
    and M. Minion and M. Winkel and P. Gibbon},
  title = {A massively space-time parallel {$N$}-body solver},
  crossref = {Hollingsworth:2012:SPI},
  pages = {92:1--92:??},
  year = {2012},
  bibdate = {Thu Nov 15 07:38:35 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib},
  url = {http://conferences.computer.org/sc/2012/papers/1000a083.pdf},
  abstract = {We present a novel space-time parallel version of the
    Barnes--Hut tree code pepc using pfasst, the Parallel
    Full Approximation Scheme in Space and Time. The naive
    use of increasingly more processors for a fixed-size
    N-body problem is prone to saturate as soon as the
    number of unknowns per core becomes too small. To
    overcome this intrinsic strong-scaling limit, we
    introduce temporal parallelism on top of pepc's
    existing hybrid MPI/PThreads spatial decomposition.
    Here, we use pfasst which is based on a combination of
    the iterations of the parallel-in-time algorithm
    parareal with the sweeps of spectral deferred
    correction (SDC) schemes. By combining these sweeps
    with multiple space-time discretization levels, pfasst
    relaxes the theoretical bound on parallel efficiency in
    parareal. We present results from runs on up to 262,144
    cores on the IBM Blue Gene/P installation JUGENE,
    demonstrating that the space-time parallel code
    provides speedup beyond the saturation of the purely
    space-parallel approach.},
  acknowledgement = ack-nhfb,
  articleno = {92},
}

@article{Steinberger:2012:SDS,
  author = {Markus Steinberger and Bernhard Kainz and Bernhard
    Kerbl and Stefan Hauswiesner and Michael Kenzel and
    Dieter Schmalstieg},
  title = {{Softshell}: dynamic scheduling on {GPUs}},
  journal = j-TOG,
  volume = {31},
  number = {6},
  pages = {161:1--161:??},
  month = nov,
  year = {2012},
  coden = {ATGRDF},
  doi = {https://doi.org/10.1145/2366145.2366180},
  issn = {0730-0301 (print), 1557-7368 (electronic)},
  issn-l = {0730-0301},
  bibdate = {Thu Nov 15 16:10:28 MST 2012},
  bibsource = {http://www.acm.org/pubs/contents/journals/tog/;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/tog.bib},
  abstract = {In this paper we present Softshell, a novel execution
    model for devices composed of multiple processing cores
    operating in a single instruction, multiple data
    fashion, such as graphics processing units (GPUs). The
    Softshell model is intuitive and more flexible than the
    kernel-based adaption of the stream processing model,
    which is currently the dominant model for general
    purpose GPU computation. Using the Softshell model,
    algorithms with a relatively low local degree of
    parallelism can execute efficiently on massively
    parallel architectures. Softshell has the following
    distinct advantages: (1) work can be dynamically issued
    directly on the device, eliminating the need for
    synchronization with an external source, i.e., the CPU;
    (2) its three-tier dynamic scheduler supports arbitrary
    scheduling strategies, including dynamic priorities and
    real-time scheduling; and (3) the user can influence,
    pause, and cancel work already submitted for parallel
    execution. The Softshell processing model thus brings
    capabilities to GPU architectures that were previously
    only known from operating-system designs and reserved
    for CPU programming. As a proof of our claims, we
    present a publicly available implementation of the
    Softshell processing model realized on top of CUDA. The
    benchmarks of this implementation demonstrate that our
    processing model is easy to use and also performs
    substantially better than the state-of-the-art
    kernel-based processing model for problems that have
    been difficult to parallelize in the past.},
  acknowledgement = ack-nhfb,
  articleno = {161},
  fjournal = {ACM Transactions on Graphics},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J778},
}

@article{Strzodka:2012:DLO,
  author = {Robert Strzodka},
  title = {Data layout optimization for multi-valued containers
    in {OpenCL}},
  journal = j-J-PAR-DIST-COMP,
  volume = {72},
  number = {9},
  pages = {1073--1082},
  month = sep,
  year = {2012},
  coden = {JPDCER},
  doi = {https://doi.org/10.1016/j.jpdc.2011.10.012},
  issn = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l = {0743-7315},
  bibdate = {Fri Jul 27 06:43:44 MDT 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.sciencedirect.com/science/journal/07437315},
  url = {http://www.sciencedirect.com/science/article/pii/S0743731511002115},
  acknowledgement = ack-nhfb,
  fjournal = {Journal of Parallel and Distributed Computing},
  journal-url = {http://www.sciencedirect.com/science/journal/07437315},
}

@article{Su:2012:CPB,
  author = {ChunYi Su and Dong Li and Dimitrios S. Nikolopoulos
    and Matthew Grove and Kirk Cameron and Bronis R. de
    Supinski},
  title = {Critical path-based thread placement for {NUMA}
    systems},
  journal = j-SIGMETRICS,
  volume = {40},
  number = {2},
  pages = {106--112},
  month = sep,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2381056.2381079},
  issn = {0163-5999 (print), 1557-9484 (electronic)},
  issn-l = {0163-5999},
  bibdate = {Fri Nov 9 11:06:40 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sigmetrics.bib},
  abstract = {Multicore multiprocessors use a Non Uniform Memory
    Architecture (NUMA) to improve their scalability.
    However, NUMA introduces performance penalties due to
    remote memory accesses. Without efficiently managing
    data layout and thread mapping to cores, scientific
    applications may suffer performance loss, even if they
    are optimized for NUMA. In this paper, we present
    algorithms and a runtime system that optimize the
    execution of OpenMP applications on NUMA architectures.
    By collecting information from hardware counters, the
    runtime system directs thread placement and reduces
    performance penalties by minimizing the critical path
    of OpenMP parallel regions. The runtime system uses a
    scalable algorithm that derives placement decisions
    with negligible overhead. We evaluate our algorithms
    and the runtime system with four NPB applications
    implemented in OpenMP. On average the algorithms
    achieve between 8.13\% and 25.68\% performance
    improvement, compared to the default Linux thread
    placement scheme. The algorithms miss the optimal
    thread placement in only 8.9\% of the cases.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGMETRICS Performance Evaluation Review},
  journal-url = {http://portal.acm.org/toc.cfm?id=J618},
}

@inproceedings{Subramoni:2012:DSI,
  author = {H. Subramoni and S. Potluri and K. Kandalla and B.
    Barth and J. Vienne and J. Keasler and K. Tomko and K.
    Schulz and A. Moody and D. K. Panda},
  title = {Design of a scalable {InfiniBand} topology service to
    enable network-topology-aware placement of processes},
  crossref = {Hollingsworth:2012:SPI},
  pages = {70:1--70:??},
  year = {2012},
  bibdate = {Thu Nov 15 07:38:35 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib},
  url = {http://conferences.computer.org/sc/2012/papers/1000a076.pdf},
  abstract = {Over the last decade, InfiniBand has become an
    increasingly popular interconnect for deploying modern
    super-computing systems. However, there exists no
    detection service that can discover the underlying
    network topology in a scalable manner and expose this
    information to runtime libraries and users of the high
    performance computing systems in a convenient way. In
    this paper, we design a novel and scalable method to
    detect the InfiniBand network topology by using
    Neighbor-Joining techniques (NJ). To the best of our
    knowledge, this is the first instance where the
    neighbor joining algorithm has been applied to solve
    the problem of detecting InfiniBand network topology.
    We also design a network-topology-aware MPI library
    that takes advantage of the network topology service.
    The library places processes taking part in the MPI job
    in a network-topology-aware manner with the dual aim of
    increasing intra-node communication and reducing the
    long distance inter-node communication across the
    InfiniBand fabric.},
  acknowledgement = ack-nhfb,
  articleno = {70},
}

@article{Sumimoto:2012:MCL,
  author = {Shinji Sumimoto},
  title = {The {MPI Communication Library} for the {K} Computer:
    Its Design and Implementation},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {7490},
  pages = {11--11},
  year = {2012},
  coden = {LNCSD9},
  doi = {https://doi.org/10.1007/978-3-642-33518-1_3},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Dec 19 15:23:42 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/accesspage/chapter/10.1007/978-3-642-33518-1_3},
  acknowledgement = ack-nhfb,
  book-doi = {https://doi.org/10.1007/978-3-642-33518-1},
  book-url = {http://www.springerlink.com/content/978-3-642-33518-1},
  fjournal = {Lecture Notes in Computer Science},
}

@article{Tahan:2012:ITC,
  author = {Oussama Tahan and Mats Brorsson and Mohamed Shawky},
  title = {Introducing Task Cancellation to {OpenMP}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {7312},
  pages = {73--87},
  year = {2012},
  coden = {LNCSD9},
  doi = {https://doi.org/10.1007/978-3-642-30961-8_6},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Dec 19 15:19:49 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-30961-8_6/},
  acknowledgement = ack-nhfb,
  book-doi = {https://doi.org/10.1007/978-3-642-30961-8},
  book-url = {http://www.springerlink.com/content/978-3-642-30961-8},
  fjournal = {Lecture Notes in Computer Science},
}

@article{Tahan:2012:UDT,
  author = {Oussama Tahan and Mohamed Shawky},
  title = {Using Dynamic Task Level Redundancy for {OpenMP} Fault
    Tolerance},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {7179},
  pages = {25--36},
  year = {2012},
  coden = {LNCSD9},
  doi = {https://doi.org/10.1007/978-3-642-28293-5_3},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Dec 19 15:25:42 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs2012b.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-28293-5_3/},
  acknowledgement = ack-nhfb,
  book-doi = {https://doi.org/10.1007/978-3-642-28293-5},
  book-url = {http://www.springerlink.com/content/978-3-642-28293-5},
  fjournal = {Lecture Notes in Computer Science},
}

@article{Tao:2012:UGA,
  author = {Jian Tao and Marek Blazewicz and Steven R. Brandt},
  title = {Using {GPU}'s to accelerate stencil-based computation
    kernels for the development of large scale scientific
    applications on heterogeneous systems},
  journal = j-SIGPLAN,
  volume = {47},
  number = {8},
  pages = {287--288},
  month = aug,
  year = {2012},
  coden = {SINODQ},
  doi = {https://doi.org/10.1145/2370036.2145857},
  issn = {0362-1340 (print), 1523-2867 (print), 1558-1160
    (electronic)},
  issn-l = {0362-1340},
  bibdate = {Wed Sep 12 12:11:57 MDT 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  note = {PPOPP '12 conference proceedings.},
  abstract = {We present CaCUDA --- a GPGPU kernel abstraction and a
    parallel programming framework for developing highly
    efficient large scale scientific applications using
    stencil computations on hybrid CPU/GPU architectures.
    CaCUDA is built upon the Cactus computational toolkit,
    an open source problem solving environment designed for
    scientists and engineers. Due to the flexibility and
    extensibility of the Cactus toolkit, the addition of a
    GPGPU programming framework required no changes to the
    Cactus infrastructure, guaranteeing that existing
    features and modules will continue to work without
    modification. CaCUDA was tested and benchmarked using a
    3D CFD code based on a finite difference discretization
    of Navier--Stokes equations.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGPLAN Notices},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J706},
}

@article{Terboven:2012:AOT,
  author = {Christian Terboven and Dirk Schmidl and Tim Cramer and
    Dieter an Mey},
  title = {Assessing {OpenMP} Tasking Implementations on {NUMA}
    Architectures},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {7312},
  pages = {182--195},
  year = {2012},
  coden = {LNCSD9},
  doi = {https://doi.org/10.1007/978-3-642-30961-8_14},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Dec 19 15:19:49 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-30961-8_14/},
  acknowledgement = ack-nhfb,
  book-doi = {https://doi.org/10.1007/978-3-642-30961-8},
  book-url = {http://www.springerlink.com/content/978-3-642-30961-8},
  fjournal = {Lecture Notes in Computer Science},
}

@article{Thibault:2012:AIF,
  author = {Julien C. Thibault and Inanc Senocak},
  title = {Accelerating incompressible flow computations with a
    {Pthreads--CUDA} implementation on small-footprint
    multi-{GPU} platforms},
  journal = j-J-SUPERCOMPUTING,
  volume = {59},
  number = {2},
  pages = {693--719},
  month = feb,
  year = {2012},
  coden = {JOSUED},
  issn = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l = {0920-8542},
  bibdate = {Fri Apr 6 17:44:43 MDT 2012},
  bibsource = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=59&issue=2;
    http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=59&issue=2&spage=693},
  acknowledgement = ack-nhfb,
  fjournal = {The Journal of Supercomputing},
  journal-url = {http://link.springer.com/journal/11227},
}

@article{Thoman:2012:AOL,
  author = {Peter Thoman and Herbert Jordan and Simone Pellegrini
    and Thomas Fahringer},
  title = {Automatic {OpenMP} Loop Scheduling: a Combined
    Compiler and Runtime Approach},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {7312},
  pages = {88--101},
  year = {2012},
  coden = {LNCSD9},
  doi = {https://doi.org/10.1007/978-3-642-30961-8_7},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Dec 19 15:19:49 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-30961-8_7/},
  acknowledgement = ack-nhfb,
  book-doi = {https://doi.org/10.1007/978-3-642-30961-8},
  book-url = {http://www.springerlink.com/content/978-3-642-30961-8},
  fjournal = {Lecture Notes in Computer Science},
}

@inproceedings{Thorson:2012:SUF,
  author = {Greg Thorson and Michael Woodacre},
  title = {{SGI UV2}: a fused computation and data analysis
    machine},
  crossref = {Hollingsworth:2012:SPI},
  pages = {105:1--105:??},
  year = {2012},
  bibdate = {Thu Nov 15 07:38:35 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib},
  url = {http://conferences.computer.org/sc/2012/papers/1000a080.pdf},
  abstract = {UV2 is SGI's second generation data fusion system. UV2
    was designed to meet the latest challenges facing users
    in computation and data analysis. Its unique ability to
    perform both functions on a single platform enables
    efficient, easy to manage workflows. This platform has
    a hybrid infrastructure, leveraging the latest
    Intel\reg{} EP processors providing industry leading
    computational power. Due to its high bandwidth,
    extremely low latency NUMALink\reg{}6 (NL6)
    interconnect, plus vectorized synchronization and data
    movement, UV2 provides industry leading data intensive
    capability. It supports a single operating system (OS)
    image up to 64TB and 4K threads. Multiple OS images can
    be deployed on a single NL6 fabric, which has a single
    flat address space up to 8PB and 256K threads. These
    capabilities allow for extreme performance on a broad
    range of programming models and languages including
    OpenMP[1], MPI, UPC[2], CAF[3] and SHMEM. The
    architecture, implementation and performance of UV2 are
    detailed.},
  acknowledgement = ack-nhfb,
  articleno = {105},
}

@article{Traff:2012:AUE,
  author = {Jesper Larsson Tr{\"a}ff},
  title = {Alternative, uniformly expressive and more scalable
    interfaces for collective communication in {MPI}},
  journal = j-PARALLEL-COMPUTING,
  volume = {38},
  number = {1--2},
  pages = {26--36},
  month = jan # {\slash } # feb,
  year = {2012},
  coden = {PACOEJ},
  doi = {https://doi.org/10.1016/j.parco.2011.10.009},
  issn = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l = {0167-8191},
  bibdate = {Sat Feb 4 15:17:36 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0167819111001402},
  acknowledgement = ack-nhfb,
  fjournal = {Parallel Computing},
  journal-url = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Traff:2012:MTM,
  author = {Jesper Larsson Tr{\"a}ff},
  title = {{{\tt mpicroscope}}: Towards an {MPI} Benchmark Tool
    for Performance Guideline Verification},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {7490},
  pages = {100--109},
  year = {2012},
  coden = {LNCSD9},
  doi = {https://doi.org/10.1007/978-3-642-33518-1_15},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Dec 19 15:23:42 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-33518-1_15/},
  acknowledgement = ack-nhfb,
  book-doi = {https://doi.org/10.1007/978-3-642-33518-1},
  book-url = {http://www.springerlink.com/content/978-3-642-33518-1},
  fjournal = {Lecture Notes in Computer Science},
}

@article{Tsutsui:2012:AMG,
  author = {Shigeyoshi Tsutsui},
  title = {{ACO} on Multiple {GPUs} with {CUDA} for Faster
    Solution of {QAPs}},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {7492},
  pages = {174--184},
  year = {2012},
  coden = {LNCSD9},
  doi = {https://doi.org/10.1007/978-3-642-32964-7_18},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Dec 19 15:23:44 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-32964-7_18/},
  acknowledgement = ack-nhfb,
  book-doi = {https://doi.org/10.1007/978-3-642-32964-7},
  book-url = {http://www.springerlink.com/content/978-3-642-32964-7},
  fjournal = {Lecture Notes in Computer Science},
}

@article{Tu:2012:PAO,
  author = {Bibo Tu and Jianping Fan and Jianfeng Zhan and
    Xiaofang Zhao},
  title = {Performance analysis and optimization of {MPI}
    collective operations on multi-core clusters},
  journal = j-J-SUPERCOMPUTING,
  volume = {60},
  number = {1},
  pages = {141--162},
  month = apr,
  year = {2012},
  coden = {JOSUED},
  issn = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l = {0920-8542},
  bibdate = {Fri Apr 6 17:45:24 MDT 2012},
  bibsource = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=60&issue=1;
    http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=60&issue=1&spage=141},
  acknowledgement = ack-nhfb,
  fjournal = {The Journal of Supercomputing},
  journal-url = {http://link.springer.com/journal/11227},
}

%%% Title reads 3D, as published in Computing in Science and Engineering
%%% 14(3); an earlier version of this entry carried a stray digit (43D).
@Article{Unat:2012:AFD,
  author =       "Didem Unat and Jun Zhou and Yifeng Cui and Scott B.
                 Baden and Xing Cai",
  title =        "Accelerating a {3D} Finite-Difference Earthquake
                 Simulation with a {C-to-CUDA} Translator",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "14",
  number =       "3",
  pages =        "48--59",
  month =        may # "\slash " # jun,
  year =         "2012",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2012.44",
  ISSN =         "1521-9615 (print), 1558-366X (electronic)",
  ISSN-L =       "1521-9615",
  bibdate =      "Thu Apr 26 17:01:57 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

@article{Urena:2012:IMI,
  author = {Isa{\'\i}as A. Compr{\'e}s Ure{\~n}a and Michael
    Riepen and Michael Konow and Michael Gerndt},
  title = {Invasive {MPI} on {Intel}'s Single-Chip Cloud
    Computer},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {7179},
  pages = {74--85},
  year = {2012},
  coden = {LNCSD9},
  doi = {https://doi.org/10.1007/978-3-642-28293-5_7},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Dec 19 15:25:42 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs2012b.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-28293-5_7/},
  acknowledgement = ack-nhfb,
  book-doi = {https://doi.org/10.1007/978-3-642-28293-5},
  book-url = {http://www.springerlink.com/content/978-3-642-28293-5},
  fjournal = {Lecture Notes in Computer Science},
}

@article{Wang:2012:OVT,
  author = {Cheng Wang and Sunita Chandrasekaran and Barbara
    Chapman},
  title = {An {OpenMP 3.1} Validation Testsuite},
  journal = j-LECT-NOTES-COMP-SCI,
  volume = {7312},
  pages = {237--249},
  year = {2012},
  coden = {LNCSD9},
  doi = {https://doi.org/10.1007/978-3-642-30961-8_18},
  issn = {0302-9743 (print), 1611-3349 (electronic)},
  issn-l = {0302-9743},
  bibdate = {Wed Dec 19 15:19:49 MST 2012},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lncs2012e.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-30961-8_18/},
  acknowledgement = ack-nhfb,
  book-doi = {https://doi.org/10.1007/978-3-642-30961-8},
  book-url = {http://www.springerlink.com/content/978-3-642-30961-8},
  fjournal = {Lecture Notes in Computer Science},
}

@article{Wei:2012:OLL,
  author =       {Zheng Wei and Joseph Jaja},
  title =        {Optimization of Linked List Prefix Computations on
                 Multithreaded {GPUs} Using {CUDA}},
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       {22},
  number =       {4},
  pages =        {1250012},
  month =        dec,
  year =         {2012},
  coden =        {PPLTEE},
  doi =          {https://doi.org/10.1142/S0129626412500120},
  issn =         {0129-6264 (print), 1793-642X (electronic)},
  issn-l =       {0129-6264},
  bibdate =      {Sat Jun 22 15:54:17 MDT 2013},
  bibsource =    {http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Processing Letters},
  journal-url =  {http://www.worldscientific.com/loi/ppl},
}

@article{Wu:2012:DPL,
  author =       {Chao-Chin Wu and Chao-Tung Yang and Kuan-Chou Lai and
                 Po-Hsun Chiu},
  title =        {Designing parallel loop self-scheduling schemes using
                 the hybrid {MPI} and {OpenMP} programming model for
                 multi-core grid systems},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {59},
  number =       {1},
  pages =        {42--60},
  month =        jan,
  year =         {2012},
  coden =        {JOSUED},
  issn =         {0920-8542 (print), 1573-0484 (electronic)},
  issn-l =       {0920-8542},
  bibdate =      {Tue Dec 13 15:25:33 MST 2011},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=59&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=59&issue=1&spage=42},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-url =  {http://link.springer.com/journal/11227},
}

@article{Wu:2012:PCH,
  author =       {Xingfu Wu and Valerie Taylor},
  title =        {Performance Characteristics of Hybrid {MPI\slash
                 OpenMP} Implementations of {NAS Parallel Benchmarks}
                 {SP} and {BT} on Large-Scale Multicore Clusters},
  journal =      j-COMP-J,
  volume =       {55},
  number =       {2},
  pages =        {154--167},
  month =        feb,
  year =         {2012},
  coden =        {CMPJA6},
  doi =          {https://doi.org/10.1093/comjnl/bxr063},
  issn =         {0010-4620 (print), 1460-2067 (electronic)},
  issn-l =       {0010-4620},
  bibdate =      {Thu Feb 2 09:12:17 MST 2012},
  bibsource =    {http://comjnl.oxfordjournals.org/content/55/2.toc;
                 http://www.math.utah.edu/pub/tex/bib/compj2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://comjnl.oxfordjournals.org/content/55/2/154.full.pdf+html},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Journal},
  journal-url =  {http://comjnl.oxfordjournals.org/},
  onlinedate =   {July 18, 2011},
}

@article{Wu:2012:UHM,
  author =       {Chao-Chin Wu and Lien-Fu Lai and Chao-Tung Yang and
                 Po-Hsun Chiu},
  title =        {Using hybrid {MPI} and {OpenMP} programming to
                 optimize communications in parallel loop
                 self-scheduling schemes for multicore {PC} clusters},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {60},
  number =       {1},
  pages =        {31--61},
  month =        apr,
  year =         {2012},
  coden =        {JOSUED},
  issn =         {0920-8542 (print), 1573-0484 (electronic)},
  issn-l =       {0920-8542},
  bibdate =      {Fri Apr 6 17:45:24 MDT 2012},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=60&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=60&issue=1&spage=31},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-url =  {http://link.springer.com/journal/11227},
}

@article{Yoshinaga:2012:DBM,
  author =       {Kazumi Yoshinaga and Yuichi Tsujita and Atsushi Hori
                 and Mikiko Sato and Mitaro Namiki},
  title =        {Delegation-Based {MPI} Communications for a Hybrid
                 Parallel Computer with Many-Core Architecture},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {7490},
  pages =        {47--56},
  year =         {2012},
  coden =        {LNCSD9},
  doi =          {https://doi.org/10.1007/978-3-642-33518-1_10},
  issn =         {0302-9743 (print), 1611-3349 (electronic)},
  issn-l =       {0302-9743},
  bibdate =      {Wed Dec 19 15:23:42 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://link.springer.com/chapter/10.1007/978-3-642-33518-1_10/},
  acknowledgement = ack-nhfb,
  book-doi =     {https://doi.org/10.1007/978-3-642-33518-1},
  book-url =     {http://www.springerlink.com/content/978-3-642-33518-1},
  fjournal =     {Lecture Notes in Computer Science},
}

@article{Yu:2012:SCC,
  author =       {Fang Yu and Shun-Ching Yang and Farn Wang and
                 Guan-Cheng Chen and Che-Chang Chan},
  title =        {Symbolic consistency checking of {OpenMP} parallel
                 programs},
  journal =      j-SIGPLAN,
  volume =       {47},
  number =       {5},
  pages =        {139--148},
  month =        may,
  year =         {2012},
  coden =        {SINODQ},
  doi =          {https://doi.org/10.1145/2345141.2248438},
  issn =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  issn-l =       {0362-1340},
  bibdate =      {Mon Aug 6 16:31:46 MDT 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {LCTES '12 proceedings.},
  abstract =     {We present a symbolic approach for checking
                 consistency of OpenMP parallel programs. A parallel
                 program is consistent if it yields the same result as
                 its sequential version despite the execution order
                 among threads. We find race conditions of an OpenMP
                 parallel program, construct the formal model of its
                 raced segments under relaxed memory models, and perform
                 guided symbolic simulation to search consistency
                 violations. The simulation terminates when (1) a
                 witness has been found (the program is inconsistent),
                 or (2) all reachable states have been explored (the
                 program is consistent). We have developed the tool
                 Pathg by incorporating Omega library to solve race
                 constraints and Red symbolic simulator to perform
                 guided search. We show that Pathg can prove consistency
                 of programs, identify races that modern OpenMP checkers
                 failed to report, and find inconsistency witnesses
                 effectively against benchmarks from the OpenMP Source
                 Code Repository and the NAS Parallel benchmark suite.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
}

@article{Yuan:2012:PCS,
  author =       {Zhiyong Yuan and Weixin Si and Xiangyun Liao and
                 Zhaoliang Duan and Yihua Ding and Jianhui Zhao},
  title =        {Parallel computing of {$3$D} smoking simulation based
                 on {OpenCL} heterogeneous platform},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {61},
  number =       {1},
  pages =        {84--102},
  month =        jul,
  year =         {2012},
  coden =        {JOSUED},
  doi =          {https://doi.org/10.1007/s11227-011-0652-y},
  issn =         {0920-8542 (print), 1573-0484 (electronic)},
  issn-l =       {0920-8542},
  bibdate =      {Fri Oct 26 07:41:32 MDT 2012},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=61&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=61&issue=1&spage=84},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-url =  {http://link.springer.com/journal/11227},
}

@article{Zahavi:2012:FTR,
  author =       {Eitan Zahavi},
  title =        {Fat-tree routing and node ordering providing
                 contention free traffic for {MPI} global collectives},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {72},
  number =       {11},
  pages =        {1423--1432},
  month =        nov,
  year =         {2012},
  coden =        {JPDCER},
  doi =          {https://doi.org/10.1016/j.jpdc.2012.01.018},
  issn =         {0743-7315 (print), 1096-0848 (electronic)},
  issn-l =       {0743-7315},
  bibdate =      {Wed Sep 12 12:11:36 MDT 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315},
  url =          {http://www.sciencedirect.com/science/article/pii/S0743731512000305},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-url =  {http://www.sciencedirect.com/science/journal/07437315},
}

@article{Zhao:2012:ASO,
  author =       {Xin Zhao and Gopalakrishnan Santhanaraman and William
                 Gropp},
  title =        {Adaptive Strategy for One-Sided Communication in
                 {MPICH2}},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {7490},
  pages =        {16--26},
  year =         {2012},
  coden =        {LNCSD9},
  doi =          {https://doi.org/10.1007/978-3-642-33518-1_7},
  issn =         {0302-9743 (print), 1611-3349 (electronic)},
  issn-l =       {0302-9743},
  bibdate =      {Wed Dec 19 15:23:42 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs2012h.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://link.springer.com/chapter/10.1007/978-3-642-33518-1_7/},
  acknowledgement = ack-nhfb,
  book-doi =     {https://doi.org/10.1007/978-3-642-33518-1},
  book-url =     {http://www.springerlink.com/content/978-3-642-33518-1},
  fjournal =     {Lecture Notes in Computer Science},
}

@article{Zhou:2012:DFD,
  author =       {Xu Zhou and Kai Lu and Xicheng Lu and Xiaoping Wang
                 and Baohua Fan},
  title =        {{dMPI}: Facilitating Debugging of {MPI} Programs via
                 Deterministic Message Passing},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {7513},
  pages =        {172--179},
  year =         {2012},
  coden =        {LNCSD9},
  doi =          {https://doi.org/10.1007/978-3-642-35606-3_20},
  issn =         {0302-9743 (print), 1611-3349 (electronic)},
  issn-l =       {0302-9743},
  bibdate =      {Wed Dec 19 15:24:06 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs2012i.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://link.springer.com/chapter/10.1007/978-3-642-35606-3_20/},
  acknowledgement = ack-nhfb,
  book-doi =     {https://doi.org/10.1007/978-3-642-35606-3},
  book-url =     {http://www.springerlink.com/content/978-3-642-35606-3},
  fjournal =     {Lecture Notes in Computer Science},
}

@article{Zhu:2012:CDS,
  author =       {Ke Zhu and Matthias Butenuth and Pablo d'Angelo},
  title =        {Comparison of Dense Stereo Using {CUDA}},
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       {6554},
  pages =        {398--410},
  year =         {2012},
  coden =        {LNCSD9},
  doi =          {https://doi.org/10.1007/978-3-642-35740-4_31},
  issn =         {0302-9743 (print), 1611-3349 (electronic)},
  issn-l =       {0302-9743},
  bibdate =      {Mon Dec 24 08:20:14 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs2012a.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://link.springer.com/content/pdf/10.1007/978-3-642-35740-4_31},
  acknowledgement = ack-nhfb,
  book-doi =     {https://doi.org/10.1007/978-3-642-35740-4},
  book-url =     {http://www.springerlink.com/content/978-3-642-35740-4},
  fjournal =     {Lecture Notes in Computer Science},
}

@article{Augusto:2013:APG,
  author =       {Douglas A. Augusto and Helio J. C. Barbosa},
  title =        {Accelerated parallel genetic programming tree
                 evaluation with {OpenCL}},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {73},
  number =       {1},
  pages =        {86--100},
  month =        jan,
  year =         {2013},
  coden =        {JPDCER},
  doi =          {https://doi.org/10.1016/j.jpdc.2012.01.012},
  issn =         {0743-7315 (print), 1096-0848 (electronic)},
  issn-l =       {0743-7315},
  bibdate =      {Sat Nov 17 07:06:13 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315},
  url =          {http://www.sciencedirect.com/science/article/pii/S074373151200024X},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-url =  {http://www.sciencedirect.com/science/journal/07437315},
}

@article{Bach:2013:LQB,
  author =       {Matthias Bach and Volker Lindenstruth and Owe
                 Philipsen and Christopher Pinke},
  title =        {{Lattice QCD} based on {OpenCL}},
  journal =      j-COMP-PHYS-COMM,
  volume =       {184},
  number =       {9},
  pages =        {2042--2052},
  month =        sep,
  year =         {2013},
  coden =        {CPHCBZ},
  issn =         {0010-4655 (print), 1879-2944 (electronic)},
  issn-l =       {0010-4655},
  bibdate =      {Mon Aug 26 14:34:22 MDT 2013},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://www.sciencedirect.com/science/article/pii/S0010465513001288},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-url =  {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Bai:2013:SLA,
  author =       {Mingze Bai and Shixin Sun and Hong Tang and Yusheng
                 Dou and Glenn V. Lo},
  title =        {An {SPMD}-Like Algorithm for Parallelizing Molecular
                 Dynamics Using {OpenMP}},
  journal =      j-COMPUT-SCI-ENG,
  volume =       {15},
  number =       {4},
  pages =        {48--56},
  month =        jul # "\slash " # aug,
  year =         {2013},
  coden =        {CSENFA},
  doi =          {https://doi.org/10.1109/MCSE.2012.66},
  issn =         {1521-9615 (print), 1558-366X (electronic)},
  issn-l =       {1521-9615},
  bibdate =      {Tue Dec 3 15:39:06 2013},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Computing in Science and Engineering},
  journal-url =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992},
}

@Article{Barkati:2013:SPA,
  author =       "Karim Barkati and Pierre Jouvelot",
  title =        "Synchronous programming in audio processing: a lookup
                 table oscillator case study",
  journal =      j-COMP-SURV,
  volume =       "46",
  number =       "2",
  pages =        "24:1--24:??",
  month =        nov,
  year =         "2013",
  CODEN =        "CMSVAN",
  DOI =          "https://doi.org/10.1145/2543581.2543591",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Thu Feb 6 07:35:29 MST 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/surveys/;
                 http://www.math.utah.edu/pub/tex/bib/compsurv.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "The adequacy of a programming language to a given
                 software project or application domain is often
                 considered a key factor of success in software
                 development and engineering, even though little
                 theoretical or practical information is readily
                 available to help make an informed decision. In this
                 article, we address a particular version of this issue
                 by comparing the adequacy of general-purpose
                 synchronous programming languages to more
                 Domain-Specific Languages (DSLs) in the field of
                 computer music. More precisely, we implemented and
                 tested the same lookup table oscillator example
                 program, one of the most classical algorithms for sound
                 synthesis, using a selection of significant synchronous
                 programming languages, half of which designed as
                 specific music languages---Csound, Pure Data,
                 SuperCollider, ChucK, Faust---and the other half being
                 general synchronous formalisms---Signal, Lustre, Esterel,
                 Lucid Synchrone and C with the OpenMP Stream Extension
                 (Matlab/Octave is used for the initial specification).
                 The advantages of these two approaches are discussed,
                 providing insights to language designers and possibly
                 software developers of both communities regarding
                 programming languages design for the audio domain.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Computing Surveys",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J204",
}

@article{Berka:2013:CPC,
  author =       {Tobias Berka and Giorgos Kollias and Helge Hagenauer
                 and Marian Vajter{\v{s}}ic and Ananth Grama},
  title =        {Concurrent programming constructs for parallel {MPI}
                 applications},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {63},
  number =       {2},
  pages =        {385--406},
  month =        feb,
  year =         {2013},
  coden =        {JOSUED},
  doi =          {https://doi.org/10.1007/s11227-011-0739-5},
  issn =         {0920-8542 (print), 1573-0484 (electronic)},
  issn-l =       {0920-8542},
  bibdate =      {Mon Apr 1 14:50:44 MDT 2013},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=63&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://link.springer.com/article/10.1007/s11227-011-0739-5},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-url =  {http://link.springer.com/journal/11227},
}

@Article{Bland:2013:EUL,
  author =       "Wesley Bland and Aurelien Bouteiller and Thomas
                 Herault and Joshua Hursey and others",
  title =        "An evaluation of {User-Level Failure Mitigation}
                 support in {MPI}",
  journal =      j-COMPUTING,
  volume =       "95",
  number =       "12",
  pages =        "1171--1184",
  month =        dec,
  year =         "2013",
  CODEN =        "CMPTA2",
  DOI =          "https://doi.org/10.1007/s00607-013-0331-3",
  ISSN =         "0010-485X (print), 1436-5057 (electronic)",
  ISSN-L =       "0010-485X",
  bibdate =      "Wed Jan 29 10:10:11 MST 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=95&issue=12;
                 http://www.math.utah.edu/pub/tex/bib/computing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s00607-013-0331-3",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing",
  journal-URL =  "http://link.springer.com/journal/607",
}

@article{Bland:2013:PFR,
  author =       {Wesley Bland and Aurelien Bouteiller and Thomas
                 Herault and George Bosilca and Jack Dongarra},
  title =        {Post-failure recovery of {MPI} communication
                 capability: Design and rationale},
  journal =      j-IJHPCA,
  volume =       {27},
  number =       {3},
  pages =        {244--254},
  month =        aug,
  year =         {2013},
  coden =        {IHPCFL},
  doi =          {https://doi.org/10.1177/1094342013488238},
  issn =         {1094-3420 (print), 1741-2846 (electronic)},
  issn-l =       {1094-3420},
  bibdate =      {Fri Mar 14 15:39:55 MDT 2014},
  bibsource =    {http://hpc.sagepub.com/content/27/3.toc;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://hpc.sagepub.com/content/27/3/244.full.pdf+html},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
  journal-url =  {http://hpc.sagepub.com/content/by/year},
  onlinedate =   {June 3, 2013},
}

@Article{Bland:2013:SIP,
  author =       "Wesley Bland and Peng Du and Aurelien Bouteiller and
                 Thomas Herault and George Bosilca and Jack J.
                 Dongarra",
  title =        "Extending the scope of the {Checkpoint-on-Failure}
                 protocol for forward recovery in standard {MPI}",
  journal =      j-CCPE,
  volume =       "25",
  number =       "17",
  pages =        "2381--2393",
  day =          "10",
  month =        dec,
  year =         "2013",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3100",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Dec 3 10:37:48 MST 2013",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "23 Jul 2013",
}

@article{Buyukkececi:2013:POI,
  author =       {Ferit B{\"u}y{\"u}kke{\c{c}}eci and Omar Awile and Ivo
                 F. Sbalzarini},
  title =        {A portable {OpenCL} implementation of generic
                 particle-mesh and mesh-particle interpolation in {$2$D}
                 and {$3$D}},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {39},
  number =       {2},
  pages =        {94--111},
  month =        feb,
  year =         {2013},
  coden =        {PACOEJ},
  doi =          {https://doi.org/10.1016/j.parco.2012.12.001},
  issn =         {0167-8191 (print), 1872-7336 (electronic)},
  issn-l =       {0167-8191},
  bibdate =      {Thu Feb 28 07:26:40 MST 2013},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://www.sciencedirect.com/science/article/pii/S0167819112000920},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-url =  {http://www.sciencedirect.com/science/journal/01678191},
}

@techreport{Cao:2013:CHP,
  author =       {Chongxiao Cao and Jack Dongarra and Peng Du and Mark
                 Gates and Piotr Luszczek and Stanimire Tomov},
  title =        {{clMAGMA}: High Performance Dense Linear Algebra with
                 {OpenCL}},
  type =         {LAPACK Working Note},
  number =       {275},
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        mar,
  year =         {2013},
  bibdate =      {Sun May 5 11:20:19 2013},
  bibsource =    {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://www.netlib.org/lapack/lawnspdf/lawn275.pdf},
  acknowledgement = ack-nhfb,
  utknumber =    {UT-CS-13-706},
}

@article{Chang:2013:PDS,
  author =       {Yao-Lin Chang and I-Lun Tseng},
  title =        {A parallel dual-scanline algorithm for partitioning
                 parameterized 45-degree polygons},
  journal =      j-TODAES,
  volume =       {18},
  number =       {4},
  pages =        {59:1--59:??},
  month =        oct,
  year =         {2013},
  coden =        {ATASFO},
  doi =          {https://doi.org/10.1145/2505015},
  issn =         {1084-4309 (print), 1557-7309 (electronic)},
  issn-l =       {1084-4309},
  bibdate =      {Fri Nov 8 11:45:54 MST 2013},
  bibsource =    {http://www.acm.org/pubs/contents/journals/todaes/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/todaes.bib},
  abstract =     {In order to use rectangular corner stitching data
                 structures in storing parameterized orthogonal layouts,
                 parameterized polygons in the layouts must be
                 partitioned into rectangles. Likewise, in order to use
                 trapezoidal corner stitching data structures in storing
                 parameterized 45-degree layouts, parameterized polygons
                 in the layouts have to be partitioned into trapezoids.
                 In this article, a parallel polygon partitioning
                 algorithm is proposed; the algorithm is capable of
                 partitioning parameterized orthogonal polygons into
                 parameterized rectangles as well as partitioning
                 parameterized 45-degree polygons into parameterized
                 trapezoids. Additionally, the algorithm can be used to
                 partition fixed-coordinate polygons. By adopting the
                 dual-scanline technique, which involves using two
                 scanlines to concurrently sweep an input polygon, the
                 parallel partitioning algorithm can process vertices
                 and edges of the input polygon efficiently. The
                 parallel polygon partitioning algorithm has been
                 implemented in C++ with the use of OpenMP. Compared
                 with a sequential partitioning program which uses a
                 single scanline, our parallel partitioning program can
                 achieve 20\% to 30\% speedup while partitioning large
                 parameterized polygons or partitioning parameterized
                 polygons with complex constraints.},
  acknowledgement = ack-nhfb,
  articleno =    {59},
  fjournal =     {ACM Transactions on Design Automation of Electronic
                 Systems},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J776},
}

@Article{Chen:2013:IRM,
  author =       "Zhezhe Chen and Qi Gao and Wenbin Zhang and Feng Qin",
  title =        "Improving the Reliability of {MPI} Libraries via
                 Message Flow Checking",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "24",
  number =       "3",
  pages =        "535--549",
  month =        mar,
  year =         "2013",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2012.127",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed May 1 08:02:21 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@article{Czapinski:2013:EPM,
  author =       {Michal Czapi{\'n}ski},
  title =        {An effective {Parallel Multistart Tabu Search for
                 Quadratic Assignment Problem} on {CUDA} platform},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {73},
  number =       {11},
  pages =        {1461--1468},
  month =        nov,
  year =         {2013},
  coden =        {JPDCER},
  issn =         {0743-7315 (print), 1096-0848 (electronic)},
  issn-l =       {0743-7315},
  bibdate =      {Mon Sep 23 11:46:28 MDT 2013},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315},
  url =          {http://www.sciencedirect.com/science/article/pii/S074373151200175X},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-url =  {http://www.sciencedirect.com/science/journal/07437315},
}

@article{Dang:2013:CES,
  author =       {Hoang-Vu Dang and Bertil Schmidt},
  title =        {{CUDA}-enabled Sparse Matrix-Vector Multiplication on
                 {GPUs} using atomic operations},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {39},
  number =       {11},
  pages =        {737--750},
  month =        nov,
  year =         {2013},
  coden =        {PACOEJ},
  issn =         {0167-8191 (print), 1872-7336 (electronic)},
  issn-l =       {0167-8191},
  bibdate =      {Fri Nov 29 10:01:37 MST 2013},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://www.sciencedirect.com/science/article/pii/S0167819113001178},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-url =  {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Demidov:2013:PCO,
  author =       {Denis Demidov and Karsten Ahnert and Karl Rupp and
                 Peter Gottschling},
  title =        {Programming {CUDA} and {OpenCL}: a Case Study Using
                 Modern {C++} Libraries},
  journal =      j-SIAM-J-SCI-COMP,
  volume =       {35},
  number =       {5},
  pages =        {C453--C472},
  month =        {????},
  year =         {2013},
  coden =        {SJOCE3},
  doi =          {https://doi.org/10.1137/120903683},
  issn =         {1064-8275 (print), 1095-7197 (electronic)},
  issn-l =       {1064-8275},
  bibdate =      {Fri Mar 7 10:32:43 MST 2014},
  bibsource =    {http://epubs.siam.org/toc/sjoce3/35/5;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/siamjscicomput.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {SIAM Journal on Scientific Computing},
  journal-url =  {http://epubs.siam.org/sisc},
  onlinedate =   {January 2013},
}

@Article{Deo:2013:PSA,
  author =       "Mrinal Deo and Sean Keely",
  title =        "Parallel suffix array and least common prefix for the
                 {GPU}",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "8",
  pages =        "197--206",
  month =        aug,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2517327.2442536",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 13:48:51 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/string-matching.bib",
  note =         "PPoPP '13 Conference proceedings.",
  abstract =     "Suffix Array (SA) is a data structure formed by
                 sorting the suffixes of a string into lexicographic
                 order. SAs have been used in a variety of applications,
                 most notably in pattern matching and Burrows--Wheeler
                 Transform (BWT) based lossless data compression. SAs
                 have also become the data structure of choice for many,
                 if not all, string processing problems to which suffix
                 tree methodology is applicable. Over the last two
                 decades researchers have proposed many suffix array
                 construction algorithms (SACAs). We do a systematic
                 study of the main classes of SACAs with the intent of
                 mapping them onto a data parallel architecture like the
                 GPU. We conclude that skew algorithm [12], a linear
                 time recursive algorithm, is the best candidate for
                 GPUs as all its phases can be efficiently mapped to a
                 data parallel hardware. Our OpenCL implementation of
                 skew algorithm achieves a throughput of up to 25
                 MStrings/sec and a speedup of up to 34x and 5.8x over a
                 single threaded CPU implementation using a discrete GPU
                 and APU respectively. We also compare our OpenCL
                 implementation against the fastest known CPU
                 implementation based on induced copying and achieve a
                 speedup of up to 3.7x. Using SA we construct BWT on GPU
                 and achieve a speedup of 11x over the fastest known BWT
                 on GPU. Suffix arrays are often augmented with the
                 longest common prefix (LCP) information. We design a
                 novel high-performance parallel algorithm for computing
                 LCP on the GPU. Our GPU implementation of LCP achieves
                 a speedup of up to 25x and 4.3x on discrete GPU and APU
                 respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Ellingson:2013:SNU,
  author =       "Sally R. Ellingson and Jeremy C. Smith and Jerome
                 Baudry",
  title =        "Software News and Updates: {VinaMPI}: {Facilitating}
                 multiple receptor high-throughput virtual docking on
                 high-performance computers",
  journal =      j-J-COMPUT-CHEM,
  volume =       "34",
  number =       "25",
  pages =        "2212--2221",
  day =          "30",
  month =        sep,
  year =         "2013",
  CODEN =        "JCCHDD",
  DOI =          "https://doi.org/10.1002/jcc.23367",
  ISSN =         "0192-8651 (print), 1096-987X (electronic)",
  ISSN-L =       "0192-8651",
  bibdate =      "Wed Nov 13 14:32:36 MST 2013",
  bibsource =    "http://www.interscience.wiley.com/jpages/0192-8651;
                 http://www.math.utah.edu/pub/tex/bib/jcomputchem2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Chemistry",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1096-987X",
  onlinedate =   "29 Jun 2013",
}

@Article{Friedley:2013:OPE,
  author =       "Andrew Friedley and Torsten Hoefler and Greg
                 Bronevetsky and Andrew Lumsdaine and Ching-Chen Ma",
  title =        "Ownership passing: efficient distributed memory
                 programming on multi-core systems",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "8",
  pages =        "177--186",
  month =        aug,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2517327.2442534",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 13:48:51 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '13 Conference proceedings.",
  abstract =     "The number of cores in multi- and many-core
                 high-performance processors is steadily increasing.
                 MPI, the de-facto standard for programming
                 high-performance computing systems offers a distributed
                 memory programming model. MPI's semantics force a copy
                 from one process' send buffer to another process'
                 receive buffer. This makes it difficult to achieve the
                 same performance on modern hardware as shared memory
                 programs which are arguably harder to maintain and
                 debug. We propose generalizing MPI's communication
                 model to include ownership passing, which makes it
                 possible to fully leverage the shared memory hardware
                 of multi- and many-core CPUs to stream communicated
                 data concurrently with the receiver's computations on
                 it. The benefits and simplicity of message passing are
                 retained by extending MPI with calls to send (pass)
                 ownership of memory regions, instead of their contents,
                 between processes. Ownership passing is achieved with a
                 hybrid MPI implementation that runs MPI processes as
                 threads and is mostly transparent to the user. We
                 propose an API and a static analysis technique to
                 transform legacy MPI codes automatically and
                 transparently to the programmer, demonstrating that
                 this scheme is easy to use in practice. Using the
                 ownership passing technique, we see up to 51\%
                 communication speedups over a standard message passing
                 implementation on state-of-the-art multicore systems.
                 Our analysis and interface will lay the groundwork for
                 future development of MPI-aware optimizing compilers
                 and multi-core specific optimizations, which will be
                 key for success in current and next-generation
                 computing platforms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Gao:2013:GGA,
  author =       "Mingcen Gao and Thanh-Tung Cao and Ashwin Nanjappa and
                 Tiow-Seng Tan and Zhiyong Huang",
  title =        "{gHull}: a {GPU} algorithm for {$3$D} convex hull",
  journal =      j-TOMS,
  volume =       "40",
  number =       "1",
  pages =        "3:1--3:19",
  month =        sep,
  year =         "2013",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2513109.2513112",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon Sep 30 16:05:58 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "A novel algorithm is presented to compute the convex
                 hull of a point set in R$^3$ using the graphics
                 processing unit (GPU). By exploiting the relationship
                 between the Voronoi diagram and the convex hull, the
                 algorithm derives the approximation of the convex hull
                 from the former. The other extreme vertices of the
                 convex hull are then found by using a two-round
                 checking in the digital and the continuous space
                 successively. The algorithm does not need explicit
                 locking or any other concurrency control mechanism,
                 thus it can maximize the parallelism available on the
                 modern GPU. The implementation using the CUDA
                 programming model on NVIDIA GPUs is exact and
                 efficient. The experiments show that it is up to an
                 order of magnitude faster than other sequential convex
                 hull implementations running on the CPU for inputs of
                 millions of points. The works demonstrate that the GPU
                 can be used to solve nontrivial computational geometry
                 problems with significant performance benefit.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Gardner:2013:CCE,
  author =       "Mark Gardner and Paul Sathre and Wu-chun Feng and
                 Gabriel Martinez",
  title =        "Characterizing the challenges and evaluating the
                 efficacy of a {CUDA-to-OpenCL} translator",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "39",
  number =       "12",
  pages =        "769--786",
  month =        dec,
  year =         "2013",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Tue Dec 3 18:06:48 MST 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819113001075",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Goglin:2013:KGS,
  author =       "Brice Goglin and St{\'e}phanie Moreaud",
  title =        "{KNEM}: a generic and scalable kernel-assisted
                 intra-node {MPI} communication framework",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "73",
  number =       "2",
  pages =        "176--188",
  month =        feb,
  year =         "2013",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1016/j.jpdc.2012.09.016",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Dec 13 20:22:17 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731512002316",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Grasso:2013:APS,
  author =       "Ivan Grasso and Klaus Kofler and Biagio Cosenza and
                 Thomas Fahringer",
  title =        "Automatic problem size sensitive task partitioning on
                 heterogeneous parallel systems",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "8",
  pages =        "281--282",
  month =        aug,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2517327.2442545",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 13:48:51 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '13 Conference proceedings.",
  abstract =     "In this paper we propose a novel approach which
                 automatizes task partitioning in heterogeneous systems.
                 Our framework is based on the Insieme Compiler and
                 Runtime infrastructure. The compiler translates a
                 single-device OpenCL program into a multi-device OpenCL
                 program. The runtime system then performs dynamic task
                 partitioning based on an offline-generated prediction
                 model. In order to derive the prediction model, we use
                 a machine learning approach that incorporates static
                 program features as well as dynamic, input sensitive
                 features. Our approach has been evaluated over a suite
                 of 23 programs and achieves performance improvements
                 compared to an execution of the benchmarks on a single
                 CPU and a single GPU only.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Gu:2013:PCI,
  author =       "Zheng Gu and Matthew Small and Xin Yuan and Aniruddha
                 Marathe and David K. Lowenthal",
  title =        "Protocol Customization for Improving {MPI} Performance
                 on {RDMA}-Enabled Clusters",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "41",
  number =       "5",
  pages =        "682--703",
  month =        oct,
  year =         "2013",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-013-0242-0",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Jun 22 12:29:26 MDT 2013",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=41&issue=5;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s10766-013-0242-0",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Hadi:2013:CFA,
  author =       "Mohammed F. Hadi and Seyed A. Esmaeili",
  title =        "{CUDA Fortran} acceleration for the finite-difference
                 time-domain method",
  journal =      j-COMP-PHYS-COMM,
  volume =       "184",
  number =       "5",
  pages =        "1395--1400",
  month =        may,
  year =         "2013",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Wed Mar 27 05:55:10 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465513000118",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Heimel:2013:HOP,
  author =       "Max Heimel and Michael Saecker and Holger Pirk and
                 Stefan Manegold and Volker Markl",
  title =        "Hardware-oblivious parallelism for in-memory
                 column-stores",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "9",
  pages =        "709--720",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  bibdate =      "Fri Dec 13 05:56:46 MST 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The multi-core architectures of today's computer
                 systems make parallelism a necessity for performance
                 critical applications. Writing such applications in a
                 generic, hardware-oblivious manner is a challenging
                 problem: Current database systems thus rely on
                 labor-intensive and error-prone manual tuning to
                 exploit the full potential of modern parallel hardware
                 architectures like multi-core CPUs and graphics cards.
                 We propose an alternative design for a parallel
                 database engine, based on a single set of
                 hardware-oblivious operators, which are compiled down
                 to the actual hardware at runtime. This design reduces
                 the development overhead for parallel database engines,
                 while achieving competitive performance to hand-tuned
                 systems. We provide a proof-of-concept for this design
                 by integrating operators written using the parallel
                 programming framework OpenCL into the open-source
                 database MonetDB. Following this approach, we achieve
                 efficient, yet highly portable parallel code without
                 the need for optimization by hand. We evaluated our
                 implementation against MonetDB using TPC-H derived
                 queries and observed a performance that rivals that of
                 MonetDB's query execution on the CPU and surpasses it
                 on the GPU. In addition, we show that the same set of
                 operators runs nearly unchanged on a GPU, demonstrating
                 the feasibility of our approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
}

@Article{Hilbrich:2013:MRE,
  author =       "Tobias Hilbrich and Joachim Protze and Martin Schulz
                 and Bronis R. de Supinski and Matthias S. M{\"u}ller",
  title =        "{MPI} runtime error detection with {MUST}: {Advances}
                 in deadlock detection",
  journal =      j-SCI-PROG,
  volume =       "21",
  number =       "3--4",
  pages =        "109--121",
  month =        "????",
  year =         "2013",
  CODEN =        "SCIPEV",
  DOI =          "https://doi.org/10.3233/SPR-130368",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Sat Mar 8 14:11:02 MST 2014",
  bibsource =    "http://www.iospress.nl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Hoefler:2013:MMN,
  author =       "Torsten Hoefler and James Dinan and Darius Buntinas
                 and Pavan Balaji and Brian Barrett and others",
  title =        "{MPI $+$ MPI}: a new hybrid approach to parallel
                 programming with {MPI} plus shared memory",
  journal =      j-COMPUTING,
  volume =       "95",
  number =       "12",
  pages =        "1121--1136",
  month =        dec,
  year =         "2013",
  CODEN =        "CMPTA2",
  DOI =          "https://doi.org/10.1007/s00607-013-0324-2",
  ISSN =         "0010-485X (print), 1436-5057 (electronic)",
  ISSN-L =       "0010-485X",
  bibdate =      "Wed Jan 29 10:10:11 MST 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=95&issue=12;
                 http://www.math.utah.edu/pub/tex/bib/computing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s00607-013-0324-2",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing",
  journal-URL =  "http://link.springer.com/journal/607",
}

@Article{Hogg:2013:FDT,
  author =       "J. D. Hogg",
  title =        "A Fast Dense Triangular Solve in {CUDA}",
  journal =      j-SIAM-J-SCI-COMP,
  volume =       "35",
  number =       "3",
  pages =        "C303--C322",
  month =        "????",
  year =         "2013",
  CODEN =        "SJOCE3",
  DOI =          "https://doi.org/10.1137/12088358X",
  ISSN =         "1064-8275 (print), 1095-7197 (electronic)",
  ISSN-L =       "1064-8275",
  bibdate =      "Fri Jul 19 07:43:53 MDT 2013",
  bibsource =    "http://epubs.siam.org/sam-bin/dbq/toc/SISC/35/3;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/siamjscicomput.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "SIAM Journal on Scientific Computing",
  journal-URL =  "http://epubs.siam.org/sisc",
  onlinedate =   "January 2013",
}

@Article{Huang:2013:ACM,
  author =       "Libo Huang and Zhiying Wang and Nong Xiao and Yongwen
                 Wang and Qiang Dou",
  title =        "Adaptive communication mechanism for accelerating
                 {MPI} functions in {NoC}-based multicore processors",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2512434",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Multicore designs have emerged as the dominant
                 organization for future high-performance
                 microprocessors. Communication in such designs is often
                 enabled by Networks-on-Chip (NoCs). A new trend in such
                 architectures is to fit a Message Passing Interface
                 (MPI) programming model on NoCs to achieve optimal
                 parallel application performance. A key issue in
                 designing MPI over NoCs is communication protocol,
                 which has not been explored in previous research. This
                 article advocates a hardware-supported communication
                 mechanism using a protocol-adaptive approach to adjust
                 to varying NoC configurations (e.g., number of buffers)
                 and workload behavior (e.g., number of messages). We
                 propose the ADaptive Communication Mechanism (ADCM), a
                 hybrid protocol that involves behavior similar to
                 buffered communication when sufficient buffer is
                 available in the receiver to that similar to a
                 synchronous protocol when buffers in the receiver are
                 limited. ADCM adapts dynamically by deciding
                 communication protocol on a per-request basis using a
                 local estimate of recent buffer utilization. ADCM
                 attempts to combine both the advantages of buffered and
                 synchronous communication modes to achieve enhanced
                 throughput and performance. Simulations of various
                 workloads show that the proposed communication
                 mechanism can be effectively used in future NoC
                 designs.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jimenez:2013:BCA,
  author =       "Jes{\'u}s Jim{\'e}nez and Juan {Ruiz de Miras}",
  title =        "Box-counting algorithm on {GPU} and multi-core {CPU}:
                 an {OpenCL} cross-platform study",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "65",
  number =       "3",
  pages =        "1327--1352",
  month =        sep,
  year =         "2013",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-013-0885-z",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Feb 8 11:06:43 MST 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=65&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-013-0885-z",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Jin:2013:PCU,
  author =       "Hui Jin and Xian-He Sun",
  title =        "Performance comparison under failures of {MPI} and
                 {MapReduce}: an analytical approach",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "29",
  number =       "7",
  pages =        "1808--1815",
  month =        sep,
  year =         "2013",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Mon Aug 26 16:08:23 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167739X13000290",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Jog:2013:OCT,
  author =       "Adwait Jog and Onur Kayiran and Nachiappan Chidambaram
                 Nachiappan and Asit K. Mishra and Mahmut T. Kandemir
                 and Onur Mutlu and Ravishankar Iyer and Chita R. Das",
  title =        "{OWL}: cooperative thread array aware scheduling
                 techniques for improving {GPGPU} performance",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "4",
  pages =        "395--406",
  month =        apr,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2499368.2451158",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:23 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Emerging GPGPU architectures, along with programming
                 models like CUDA and OpenCL, offer a cost-effective
                 platform for many applications by providing high thread
                 level parallelism at lower energy budgets.
                 Unfortunately, for many general-purpose applications,
                 available hardware resources of a GPGPU are not
                 efficiently utilized, leading to lost opportunity in
                 improving performance. A major cause of this is the
                 inefficiency of current warp scheduling policies in
                 tolerating long memory latencies. In this paper, we
                 identify that the scheduling decisions made by such
                 policies are agnostic to thread-block, or cooperative
                 thread array (CTA), behavior, and as a result
                 inefficient. We present a coordinated CTA-aware
                 scheduling policy that utilizes four schemes to
                 minimize the impact of long memory latencies. The first
                 two schemes, CTA-aware two-level warp scheduling and
                 locality aware warp scheduling, enhance per-core
                 performance by effectively reducing cache contention
                 and improving latency hiding capability. The third
                 scheme, bank-level parallelism aware warp scheduling,
                 improves overall GPGPU performance by enhancing DRAM
                 bank-level parallelism. The fourth scheme employs
                 opportunistic memory-side prefetching to further
                 enhance performance by taking advantage of open DRAM
                 rows. Evaluations on a 28-core GPGPU platform with
                 highly memory-intensive applications indicate that our
                 proposed mechanism can provide 33\% average performance
                 improvement compared to the commonly-employed
                 round-robin warp scheduling policy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Kegel:2013:DTU,
  author =       "Philipp Kegel and Michel Steuwer and Sergei Gorlatch",
  title =        "{dOpenCL}: Towards uniform programming of distributed
                 heterogeneous multi-\slash many-core systems",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "73",
  number =       "12",
  pages =        "1639--1648",
  month =        dec,
  year =         "2013",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Nov 29 09:55:28 MST 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731513001597",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Khanna:2013:HPN,
  author =       "Gaurav Khanna",
  title =        "High-Precision Numerical Simulations on a {CUDA GPU}:
                 {Kerr} Black Hole Tails",
  journal =      j-J-SCI-COMPUT,
  volume =       "56",
  number =       "2",
  pages =        "366--380",
  month =        aug,
  year =         "2013",
  CODEN =        "JSCOEB",
  DOI =          "https://doi.org/10.1007/s10915-012-9679-3",
  ISSN =         "0885-7474 (print), 1573-7691 (electronic)",
  ISSN-L =       "0885-7474",
  bibdate =      "Sat Mar 8 11:16:21 MST 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7474&volume=56&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/jscicomput.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s10915-012-9679-3;
                 http://link.springer.com/content/pdf/10.1007/s10915-012-9679-3.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Scientific Computing",
  journal-URL =  "http://link.springer.com/journal/10915",
}

@article{Kim:2013:MPE,
    author = {Yooseong Kim and Aviral Shrivastava},
    title = {Memory performance estimation of {CUDA} programs},
    journal = j-TECS,
    fjournal = {ACM Transactions on Embedded Computing Systems},
    volume = {13},
    number = {2},
    pages = {21:1--21:??},
    articleno = {21},
    month = sep,
    year = {2013},
    DOI = {https://doi.org/10.1145/2514641.2514648},
    CODEN = {????},
    ISSN = {1539-9087 (print), 1558-3465 (electronic)},
    ISSN-L = {1539-9087},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?&idx=J840},
    abstract = {CUDA has successfully popularized GPU computing, and
        GPGPU applications are now used in various embedded
        systems. The CUDA programming model provides a simple
        interface to program on GPUs, but tuning GPGPU
        applications for high performance is still quite
        challenging. Programmers need to consider numerous
        architectural details, and small changes in source
        code, especially on the memory access pattern, can
        affect performance significantly. This makes it very
        difficult to optimize CUDA programs. This article
        presents CuMAPz, which is a tool to analyze and compare
        the memory performance of CUDA programs. CuMAPz can
        help programmers explore different ways of using shared
        and global memories, and optimize their program for
        efficient memory behavior. CuMAPz models several
        memory-performance-related factors: data reuse, global
        memory access coalescing, global memory latency hiding,
        shared memory bank conflict, channel skew, and branch
        divergence. Experimental results show that CuMAPz can
        accurately estimate performance with correlation
        coefficient of 0.96. By using CuMAPz to explore the
        memory access design space, we could improve the
        performance of our benchmarks by 30\% more than the
        previous approach [Hong and Kim 2010].},
    bibdate = {Fri Sep 27 18:13:13 MDT 2013},
    bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
        http://www.math.utah.edu/pub/tex/bib/tecs.bib},
    acknowledgement = ack-nhfb,
}

@article{Krotkiewski:2013:ESC,
    author = {Marcin Krotkiewski and Marcin Dabrowski},
    title = {Efficient {$3$D} stencil computations using {CUDA}},
    journal = j-PARALLEL-COMPUTING,
    fjournal = {Parallel Computing},
    volume = {39},
    number = {10},
    pages = {533--548},
    month = oct,
    year = {2013},
    URL = {http://www.sciencedirect.com/science/article/pii/S016781911300094X},
    CODEN = {PACOEJ},
    ISSN = {0167-8191 (print), 1872-7336 (electronic)},
    ISSN-L = {0167-8191},
    journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
    bibdate = {Mon Sep 30 16:37:36 MDT 2013},
    bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
        http://www.math.utah.edu/pub/tex/bib/pvm.bib},
    acknowledgement = ack-nhfb,
}

@Article{Kruzel:2013:VOI,
  author =       "Filip Kru{\.z}el and Krzysztof Bana{\'s}",
  title =        "Vectorized {OpenCL} implementation of numerical
                 integration for higher order finite elements",
  journal =      j-COMPUT-MATH-APPL,
  volume =       "66",
  number =       "10",
  pages =        "2030--2044",
  month =        dec,
  year =         "2013",
  CODEN =        "CMAPDK",
  ISSN =         "0898-1221 (print), 1873-7668 (electronic)",
  ISSN-L =       "0898-1221",
  bibdate =      "Wed Mar 1 21:51:22 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computmathappl2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S089812211300521X",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers and Mathematics with Applications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/08981221",
}

@Article{Kuckuk:2013:IPD,
  author =       "Sebastian Kuckuk and Tobias Preclik and Harald
                 K{\"o}stler",
  title =        "Interactive particle dynamics using {OpenCL} and
                 {Kinect}",
  journal =      j-INT-J-PAR-EMER-DIST-SYS,
  volume =       "28",
  number =       "6",
  pages =        "519--536",
  year =         "2013",
  DOI =          "https://doi.org/10.1080/17445760.2012.745671",
  ISSN =         "1744-5760 (print), 1744-5779 (electronic)",
  ISSN-L =       "1744-5760",
  bibdate =      "Thu Mar 6 05:45:37 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/intjparemerdistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel, Emergent and
                 Distributed Systems: IJPEDS",
  journal-URL =  "http://www.tandfonline.com/loi/gpaa20",
}

@Article{Kumar:2013:GAI,
  author =       "Piyush Kumar and Anupam Agrawal",
  title =        "{GPU}-Accelerated Interactive Visualization of {$3$D}
                 Volumetric Data Using {CUDA}",
  journal =      j-INT-J-IMAGE-GRAPHICS,
  volume =       "13",
  number =       "2",
  pages =        "??--??",
  month =        apr,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1142/S0219467813400032",
  ISSN =         "0219-4678",
  ISSN-L =       "0219-4678",
  bibdate =      "Tue Aug 6 10:37:51 MDT 2013",
  bibsource =    "http://ejournals.wspc.com.sg/ijig/ijig.shtml;
                 http://www.math.utah.edu/pub/tex/bib/ijig.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.worldscientific.com/doi/abs/10.1142/S0219467813400032",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Image and Graphics (IJIG)",
  journal-URL =  "http://www.worldscientific.com/worldscinet/ijig",
}

@article{Kunaseth:2013:ASD,
    author = {Manaschai Kunaseth and David F. Richards and James N.
        Glosli},
    title = {Analysis of scalable data-privatization threading
        algorithms for hybrid {MPI\slash OpenMP}
        parallelization of molecular dynamics},
    journal = j-J-SUPERCOMPUTING,
    fjournal = {The Journal of Supercomputing},
    volume = {66},
    number = {1},
    pages = {406--430},
    month = oct,
    year = {2013},
    DOI = {https://doi.org/10.1007/s11227-013-0915-x},
    URL = {http://link.springer.com/article/10.1007/s11227-013-0915-x},
    CODEN = {JOSUED},
    ISSN = {0920-8542 (print), 1573-0484 (electronic)},
    ISSN-L = {0920-8542},
    journal-URL = {http://link.springer.com/journal/11227},
    bibdate = {Sat Feb 8 11:13:32 MST 2014},
    bibsource = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=66&issue=1;
        http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
        http://www.math.utah.edu/pub/tex/bib/pvm.bib},
    acknowledgement = ack-nhfb,
}

@article{Li:2013:COM,
    author = {Hung-Fu Li and Tyng-Yeu Liang and Jun-Yao Chiu},
    title = {A compound {OpenMP\slash MPI} program development
        toolkit for hybrid {CPU\slash GPU} clusters},
    journal = j-J-SUPERCOMPUTING,
    fjournal = {The Journal of Supercomputing},
    volume = {66},
    number = {1},
    pages = {381--405},
    month = oct,
    year = {2013},
    DOI = {https://doi.org/10.1007/s11227-013-0912-0},
    URL = {http://link.springer.com/article/10.1007/s11227-013-0912-0},
    CODEN = {JOSUED},
    ISSN = {0920-8542 (print), 1573-0484 (electronic)},
    ISSN-L = {0920-8542},
    journal-URL = {http://link.springer.com/journal/11227},
    bibdate = {Sat Feb 8 11:13:32 MST 2014},
    bibsource = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=66&issue=1;
        http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
        http://www.math.utah.edu/pub/tex/bib/pvm.bib},
    acknowledgement = ack-nhfb,
}

@article{Liu:2013:DLO,
    author = {Jun Liu and Wei Ding and Ohyoung Jang and Mahmut
        Kandemir},
    title = {Data layout optimization for {GPGPU} architectures},
    journal = j-SIGPLAN,
    fjournal = {ACM SIGPLAN Notices},
    volume = {48},
    number = {8},
    pages = {283--284},
    month = aug,
    year = {2013},
    DOI = {https://doi.org/10.1145/2517327.2442546},
    CODEN = {SINODQ},
    ISSN = {0362-1340 (print), 1523-2867 (print), 1558-1160
        (electronic)},
    ISSN-L = {0362-1340},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J706},
    note = {PPoPP '13 Conference proceedings.},
    abstract = {GPUs are being widely used in accelerating
        general-purpose applications, leading to the emergence
        of GPGPU architectures. New programming models, e.g.,
        Compute Unified Device Architecture (CUDA), have been
        proposed to facilitate programming general-purpose
        computations in GPGPUs. However, writing
        high-performance CUDA codes manually is still tedious
        and difficult. In particular, the organization of the
        data in the memory space can greatly affect the
        performance due to the unique features of a custom
        GPGPU memory hierarchy. In this work, we propose an
        automatic data layout transformation framework to solve
        the key issues associated with a GPGPU memory hierarchy
        (i.e., channel skewing, data coalescing, and bank
        conflicts). Our approach employs a widely applicable
        strategy based on a novel concept called data
        localization. Specifically, we try to optimize the
        layout of the arrays accessed in affine loop nests, for
        both the device memory and shared memory, at both
        coarse grain and fine grain parallelization levels. We
        performed an experimental evaluation of our data layout
        optimization strategy using 15 benchmarks on an NVIDIA
        CUDA GPU device. The results show that the proposed
        data transformation approach brings around 4.3X speedup
        on average.},
    bibdate = {Mon Aug 26 13:48:51 MDT 2013},
    bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
        http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
    acknowledgement = ack-nhfb,
}

@Article{Lu:2013:MLP,
  author =       "Ligang Lu and Karen Magerlein",
  title =        "Multi-level parallel computing of reverse time
                 migration for seismic imaging on {Blue Gene/Q}",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "8",
  pages =        "291--292",
  month =        aug,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2517327.2442550",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 13:48:51 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '13 Conference proceedings.",
  abstract =     "Blue Gene/Q (BG/Q) is an early representative of
                 increasing scale and thread count that will
                 characterize future HPC systems: large counts of nodes,
                 cores, and threads; and a rich programming environment
                 with many degrees of freedom in parallel computing
                 optimization. So it is both a challenge and an
                 opportunity to it to accelerate the seismic imaging
                 applications to the unprecedented levels that will
                 significantly advance the technologies for the oil and
                 gas industry. In this work we aim to address two
                 important questions: how HPC systems with high levels
                 of scale and thread count will perform in real
                 applications; and how systems with many degrees of
                 freedom in parallel programming can be calibrated to
                 achieve optimal performance. Based on BG/Q's
                 architecture features and RTM workload characteristics,
                 we developed massive domain partition, MPI, and SIMD.
                 Our detailed deep analyses in various aspects of
                 optimization also provide valuable experience and
                 insights into how can be utilized to facilitate the
                 advance of seismic imaging technologies. Our BG/Q RTM
                 solution achieved a 14.93x speedup over the BG/P
                 implementation. Our multi-level parallelism strategies
                 for Reverse Time Migration (RTM) seismic imaging
                 computing on BG/Q provides an example of how HPC
                 systems like BG/Q can accelerate applications to a new
                 level.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@article{Lu:2013:WGA,
    author = {Xiangwen Lu and Jiabin Yuan and Weiwei Zhang},
    title = {Workflow of the {Grover} algorithm simulation
        incorporating {CUDA} and {GPGPU}},
    journal = j-COMP-PHYS-COMM,
    fjournal = {Computer Physics Communications},
    volume = {184},
    number = {9},
    pages = {2035--2041},
    month = sep,
    year = {2013},
    URL = {http://www.sciencedirect.com/science/article/pii/S0010465513001148},
    CODEN = {CPHCBZ},
    ISSN = {0010-4655 (print), 1879-2944 (electronic)},
    ISSN-L = {0010-4655},
    journal-URL = {http://www.sciencedirect.com/science/journal/00104655},
    bibdate = {Mon Aug 26 14:34:22 MDT 2013},
    bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
        http://www.math.utah.edu/pub/tex/bib/pvm.bib},
    acknowledgement = ack-nhfb,
}

@article{Ma:2013:KAT,
    author = {Teng Ma and George Bosilca and Aurelien Bouteiller and
        Jack J. Dongarra},
    title = {Kernel-assisted and topology-aware {MPI} collective
        communications on multicore\slash many-core platforms},
    journal = j-J-PAR-DIST-COMP,
    fjournal = {Journal of Parallel and Distributed Computing},
    volume = {73},
    number = {7},
    pages = {1000--1010},
    month = jul,
    year = {2013},
    URL = {http://www.sciencedirect.com/science/article/pii/S0743731513000166},
    CODEN = {JPDCER},
    ISSN = {0743-7315 (print), 1096-0848 (electronic)},
    ISSN-L = {0743-7315},
    journal-URL = {http://www.sciencedirect.com/science/journal/07437315},
    bibdate = {Mon Aug 26 16:44:35 MDT 2013},
    bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
        http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
        http://www.math.utah.edu/pub/tex/bib/pvm.bib;
        http://www.sciencedirect.com/science/journal/07437315},
    acknowledgement = ack-nhfb,
}

@article{Mohamed:2013:MMM,
    author = {Hisham Mohamed and St{\'e}phane Marchand-Maillet},
    title = {{MRO-MPI}: {MapReduce} overlapping using {MPI} and an
        optimized data exchange policy},
    journal = j-PARALLEL-COMPUTING,
    fjournal = {Parallel Computing},
    volume = {39},
    number = {12},
    pages = {851--866},
    month = dec,
    year = {2013},
    URL = {http://www.sciencedirect.com/science/article/pii/S0167819113001026},
    CODEN = {PACOEJ},
    ISSN = {0167-8191 (print), 1872-7336 (electronic)},
    ISSN-L = {0167-8191},
    journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
    bibdate = {Tue Dec 3 18:06:48 MST 2013},
    bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
        http://www.math.utah.edu/pub/tex/bib/pvm.bib},
    acknowledgement = ack-nhfb,
}

@article{Nandivada:2013:TFO,
    author = {V. Krishna Nandivada and Jun Shirako and Jisheng Zhao
        and Vivek Sarkar},
    title = {A Transformation Framework for Optimizing
        Task-Parallel Programs},
    journal = j-TOPLAS,
    fjournal = {ACM Transactions on Programming Languages and
        Systems},
    volume = {35},
    number = {1},
    pages = {3:1--3:??},
    articleno = {3},
    month = apr,
    year = {2013},
    DOI = {https://doi.org/10.1145/2450136.2450138},
    CODEN = {ATPSDT},
    ISSN = {0164-0925 (print), 1558-4593 (electronic)},
    ISSN-L = {0164-0925},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J783},
    abstract = {Task parallelism has increasingly become a trend with
        programming models such as OpenMP 3.0, Cilk, Java
        Concurrency, X10, Chapel and Habanero-Java (HJ) to
        address the requirements of multicore programmers.
        While task parallelism increases productivity by
        allowing the programmer to express multiple levels of
        parallelism, it can also lead to performance
        degradation due to increased overheads. In this
        article, we introduce a transformation framework for
        optimizing task-parallel programs with a focus on task
        creation and task termination operations. These
        operations can appear explicitly in constructs such as
        async, finish in X10 and HJ, task, taskwait in OpenMP
        3.0, and spawn, sync in Cilk, or implicitly in
        composite code statements such as foreach and ateach
        loops in X10, forall and foreach loops in HJ, and
        parallel loop in OpenMP. Our framework includes a
        definition of data dependence in task-parallel
        programs, a happens-before analysis algorithm, and a
        range of program transformations for optimizing task
        parallelism. Broadly, our transformations cover three
        different but interrelated optimizations: (1)
        finish-elimination, (2) forall-coarsening, and (3)
        loop-chunking. Finish-elimination removes redundant
        task termination operations, forall-coarsening replaces
        expensive task creation and termination operations with
        more efficient synchronization operations, and
        loop-chunking extracts useful parallelism from ideal
        parallelism. All three optimizations are specified in
        an iterative transformation framework that applies a
        sequence of relevant transformations until a fixed
        point is reached. Further, we discuss the impact of
        exception semantics on the specified transformations,
        and extend them to handle task-parallel programs with
        precise exception semantics. Experimental results were
        obtained for a collection of task-parallel benchmarks
        on three multicore platforms: a dual-socket 128-thread
        (16-core) Niagara T2 system, a quad-socket 16-core
        Intel Xeon SMP, and a quad-socket 32-core Power7 SMP.
        We have observed that the proposed optimizations
        interact with each other in a synergistic way, and
        result in an overall geometric average performance
        improvement between 6.28$ \times $ and 10.30$ \times $,
        measured across all three platforms for the benchmarks
        studied.},
    bibdate = {Tue Apr 30 18:56:06 MDT 2013},
    bibsource = {http://www.acm.org/pubs/contents/journals/toplas/;
        http://www.math.utah.edu/pub/tex/bib/pvm.bib;
        http://www.math.utah.edu/pub/tex/bib/toplas.bib},
    acknowledgement = ack-nhfb,
}

@article{Pai:2013:IGC,
    author = {Sreepathi Pai and Matthew J. Thazhuthaveetil and R.
        Govindarajan},
    title = {Improving {GPGPU} concurrency with elastic kernels},
    journal = j-SIGPLAN,
    fjournal = {ACM SIGPLAN Notices},
    volume = {48},
    number = {4},
    pages = {407--418},
    month = apr,
    year = {2013},
    DOI = {https://doi.org/10.1145/2499368.2451160},
    CODEN = {SINODQ},
    ISSN = {0362-1340 (print), 1523-2867 (print), 1558-1160
        (electronic)},
    ISSN-L = {0362-1340},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J706},
    remark = {ASPLOS '13 conference proceedings.},
    abstract = {Each new generation of GPUs vastly increases the
        resources available to GPGPU programs. GPU programming
        models (like CUDA) were designed to scale to use these
        resources. However, we find that CUDA programs actually
        do not scale to utilize all available resources, with
        over 30\% of resources going unused on average for
        programs of the Parboil2 suite that we used in our
        work. Current GPUs therefore allow concurrent execution
        of kernels to improve utilization. In this work, we
        study concurrent execution of GPU kernels using
        multiprogram workloads on current NVIDIA Fermi GPUs. On
        two-program workloads from the Parboil2 benchmark suite
        we find concurrent execution is often no better than
        serialized execution. We identify that the lack of
        control over resource allocation to kernels is a major
        serialization bottleneck. We propose transformations
        that convert CUDA kernels into elastic kernels which
        permit fine-grained control over their resource usage.
        We then propose several elastic-kernel aware
        concurrency policies that offer significantly better
        performance and concurrency compared to the current
        CUDA policy. We evaluate our proposals on real hardware
        using multiprogrammed workloads constructed from
        benchmarks in the Parboil 2 suite. On average, our
        proposals increase system throughput (STP) by 1.21x and
        improve the average normalized turnaround time (ANTT)
        by 3.73x for two-program workloads when compared to the
        current CUDA concurrency implementation.},
    bibdate = {Mon Jul 1 17:15:23 MDT 2013},
    bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
        http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
    acknowledgement = ack-nhfb,
}

@article{Papakonstantinou:2013:ECC,
    author = {Alexandros Papakonstantinou and Karthik Gururaj and
        John A. Stratton and Deming Chen and Jason Cong and
        Wen-Mei W. Hwu},
    title = {Efficient compilation of {CUDA} kernels for
        high-performance computing on {FPGAs}},
    journal = j-TECS,
    fjournal = {ACM Transactions on Embedded Computing Systems},
    volume = {13},
    number = {2},
    pages = {25:1--25:??},
    articleno = {25},
    month = sep,
    year = {2013},
    DOI = {https://doi.org/10.1145/2514641.2514652},
    CODEN = {????},
    ISSN = {1539-9087 (print), 1558-3465 (electronic)},
    ISSN-L = {1539-9087},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?&idx=J840},
    abstract = {The rise of multicore architectures across all
        computing domains has opened the door to heterogeneous
        multiprocessors, where processors of different compute
        characteristics can be combined to effectively boost
        the performance per watt of different application
        kernels. GPUs, in particular, are becoming very popular
        for speeding up compute-intensive kernels of
        scientific, imaging, and simulation applications. New
        programming models that facilitate parallel processing
        on heterogeneous systems containing GPUs are spreading
        rapidly in the computing community. By leveraging these
        investments, the developers of other accelerators have
        an opportunity to significantly reduce the programming
        effort by supporting those accelerator models already
        gaining popularity. In this work, we adapt one such
        language, the CUDA programming model, into a new FPGA
        design flow called FCUDA, which efficiently maps the
        coarse- and fine-grained parallelism exposed in CUDA
        onto the reconfigurable fabric. Our CUDA-to-FPGA flow
        employs AutoPilot, an advanced high-level synthesis
        tool (available from Xilinx) which enables
        high-abstraction FPGA programming. FCUDA is based on a
        source-to-source compilation that transforms the SIMT
        (Single Instruction, Multiple Thread) CUDA code into
        task-level parallel C code for AutoPilot. We describe
        the details of our CUDA-to-FPGA flow and demonstrate
        the highly competitive performance of the resulting
        customized FPGA multicore accelerators. To the best of
        our knowledge, this is the first CUDA-to-FPGA flow to
        demonstrate the applicability and potential advantage
        of using the CUDA programming model for
        high-performance computing in FPGAs.},
    bibdate = {Fri Sep 27 18:13:13 MDT 2013},
    bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
        http://www.math.utah.edu/pub/tex/bib/tecs.bib},
    acknowledgement = ack-nhfb,
}

@article{Pennycook:2013:IPP,
    author = {S. J. Pennycook and S. D. Hammond and S. A. Wright and
        J. A. Herdman and I. Miller and S. A. Jarvis},
    title = {An investigation of the performance portability of
        {OpenCL}},
    journal = j-J-PAR-DIST-COMP,
    fjournal = {Journal of Parallel and Distributed Computing},
    volume = {73},
    number = {11},
    pages = {1439--1450},
    month = nov,
    year = {2013},
    URL = {http://www.sciencedirect.com/science/article/pii/S0743731512001669},
    CODEN = {JPDCER},
    ISSN = {0743-7315 (print), 1096-0848 (electronic)},
    ISSN-L = {0743-7315},
    journal-URL = {http://www.sciencedirect.com/science/journal/07437315},
    bibdate = {Mon Sep 23 11:46:28 MDT 2013},
    bibsource = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
        http://www.math.utah.edu/pub/tex/bib/pvm.bib;
        http://www.sciencedirect.com/science/journal/07437315},
    acknowledgement = ack-nhfb,
}

@article{Poulson:2013:ENF,
    author = {Jack Poulson and Bryan Marker and Robert A. van de
        Geijn and Jeff R. Hammond and Nichols A. Romero},
    title = {{Elemental}: a New Framework for Distributed Memory
        Dense Matrix Computations},
    journal = j-TOMS,
    fjournal = {ACM Transactions on Mathematical Software (TOMS)},
    volume = {39},
    number = {2},
    pages = {13:1--13:24},
    articleno = {13},
    month = feb,
    year = {2013},
    DOI = {https://doi.org/10.1145/2427023.2427030},
    CODEN = {ACMSCU},
    ISSN = {0098-3500 (print), 1557-7295 (electronic)},
    ISSN-L = {0098-3500},
    journal-URL = {http://dl.acm.org/pub.cfm?id=J782},
    abstract = {Parallelizing dense matrix computations to distributed
        memory architectures is a well-studied subject and
        generally considered to be among the best understood
        domains of parallel computing. Two packages, developed
        in the mid 1990s, still enjoy regular use: ScaLAPACK
        and PLAPACK. With the advent of many-core
        architectures, which may very well take the shape of
        distributed memory architectures within a single
        processor, these packages must be revisited since the
        traditional MPI-based approaches will likely need to be
        extended. Thus, this is a good time to review lessons
        learned since the introduction of these two packages
        and to propose a simple yet effective alternative.
        Preliminary performance results show the new solution
        achieves competitive, if not superior, performance on
        large clusters.},
    bibdate = {Wed Feb 20 16:46:13 MST 2013},
    bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
        http://www.math.utah.edu/pub/tex/bib/toms.bib},
    acknowledgement = ack-nhfb,
}

@article{Ragan-Kelley:2013:HLC,
    author = {Jonathan Ragan-Kelley and Connelly Barnes and Andrew
        Adams and Sylvain Paris and Fr{\'e}do Durand and Saman
        Amarasinghe},
    title = {{Halide}: a language and compiler for optimizing
        parallelism, locality, and recomputation in image
        processing pipelines},
    journal = j-SIGPLAN,
    fjournal = {ACM SIGPLAN Notices},
    volume = {48},
    number = {6},
    pages = {519--530},
    month = jun,
    year = {2013},
    DOI = {https://doi.org/10.1145/2499370.2462176},
    CODEN = {SINODQ},
    ISSN = {0362-1340 (print), 1523-2867 (print), 1558-1160
        (electronic)},
    ISSN-L = {0362-1340},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J706},
    remark = {PLDI '13 conference proceedings.},
    abstract = {Image processing pipelines combine the challenges of
        stencil computations and stream programs. They are
        composed of large graphs of different stencil stages,
        as well as complex reductions, and stages with global
        or data-dependent access patterns. Because of their
        complex structure, the performance difference between a
        naive implementation of a pipeline and an optimized one
        is often an order of magnitude. Efficient
        implementations require optimization of both
        parallelism and locality, but due to the nature of
        stencils, there is a fundamental tension between
        parallelism, locality, and introducing redundant
        recomputation of shared values. We present a systematic
        model of the tradeoff space fundamental to stencil
        pipelines, a schedule representation which describes
        concrete points in this space for each stage in an
        image processing pipeline, and an optimizing compiler
        for the Halide image processing language that
        synthesizes high performance implementations from a
        Halide algorithm and a schedule. Combining this
        compiler with stochastic search over the space of
        schedules enables terse, composable programs to achieve
        state-of-the-art performance on a wide range of real
        image processing pipelines, and across different
        hardware architectures, including multicores with SIMD,
        and heterogeneous CPU+GPU execution. From simple Halide
        programs written in a few hours, we demonstrate
        performance up to 5x faster than hand-tuned C,
        intrinsics, and CUDA implementations optimized by
        experts over weeks or months, for image processing
        applications beyond the reach of past automatic
        compilers.},
    bibdate = {Mon Jul 1 17:15:38 MDT 2013},
    bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
        http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
    acknowledgement = ack-nhfb,
}

%%% Journal article evaluating OpenACC implementations (The Journal of
%%% Supercomputing 65(3), September 2013).  The DOI and the Springer URL
%%% carry the same article identifier.
@Article{Reyes:2013:PEO,
  author =       "Ruym{\'a}n Reyes and Iv{\'a}n L{\'o}pez and Juan J.
                 Fumero and Francisco de Sande",
  title =        "A preliminary evaluation of {OpenACC}
                 implementations",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "65",
  number =       "3",
  pages =        "1063--1075",
  month =        sep,
  year =         "2013",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-012-0853-z",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Feb 8 10:21:44 MST 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=65&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-012-0853-z",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

%%% Model-driven (MDE) code generation from UML/MARTE to OpenCL.  The month
%%% field concatenates the jan and feb macros with "\slash " for the
%%% combined January/February issue.
@Article{Rodrigues:2013:MAA,
  author =       "A. Wendell O. Rodrigues and Fr{\'e}d{\'e}ric
                 Guyomarc'h and Jean-Luc Dekeyser",
  title =        "An {MDE} Approach for Automatic Code Generation from
                 {UML\slash MARTE} to {OpenCL}",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "15",
  number =       "1",
  pages =        "46--55",
  month =        jan # "\slash " # feb,
  year =         "2013",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2012.35",
  ISSN =         "1521-9615",
  ISSN-L =       "1521-9615",
  bibdate =      "Fri Jun 21 08:34:49 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

%%% MPI semantics in a virtualized processor environment (Science of
%%% Computer Programming 78(4)).  The remark records the two special
%%% sections that make up this journal issue.
@Article{Rodrigues:2013:POM,
  author =       "Eduardo R. Rodrigues and Philippe O. A. Navaux and
                 Jairo Panetta and Celso L. Mendes",
  title =        "Preserving the original {MPI} semantics in a
                 virtualized processor environment",
  journal =      j-SCI-COMPUT-PROGRAM,
  volume =       "78",
  number =       "4",
  pages =        "412--421",
  day =          "1",
  month =        apr,
  year =         "2013",
  CODEN =        "SCPGD4",
  DOI =          "https://doi.org/10.1016/j.scico.2012.07.005",
  ISSN =         "0167-6423 (print), 1872-7964 (electronic)",
  ISSN-L =       "0167-6423",
  bibdate =      "Mon Feb 4 10:59:59 MST 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/scicomputprogram.bib;
                 http://www.sciencedirect.com/science/journal/01676423",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167642312001335",
  acknowledgement = ack-nhfb,
  fjournal =     "Science of Computer Programming",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01676423",
  remark =       "Special section on Mutation Testing and Analysis
                 (Mutation 2010) \& Special section on the Programming
                 Languages track at the 25th ACM Symposium on Applied
                 Computing.",
}

%%% Visual analysis of shared/global memory behavior of CUDA kernels.
%%% fjournal is the spelled-out form of the j-CGF journal (cgf.bib, Wiley
%%% ISSN 1467-8659), i.e. Computer Graphics Forum.
@Article{Rosen:2013:PVA,
  author =       "Paul Rosen",
  title =        "Performance: A Visual Approach to Investigating Shared
                 and Global Memory Behavior of {CUDA} Kernels",
  journal =      j-CGF,
  volume =       "32",
  number =       "3pt2",
  pages =        "161--170",
  month =        jun,
  year =         "2013",
  CODEN =        "CGFODY",
  DOI =          "https://doi.org/10.1111/cgf.12103",
  ISSN =         "0167-7055 (print), 1467-8659 (electronic)",
  ISSN-L =       "0167-7055",
  bibdate =      "Sat Feb 8 15:27:43 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/cgf.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Graphics Forum",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1467-8659/",
  onlinedate =   "1 Jul 2013",
}

%%% Static divergence analysis for SIMD code, evaluated on CUDA kernels.
%%% The final page in "pages" is still the "??" placeholder; the article
%%% number (13) is recorded in articleno.
@Article{Sampaio:2013:DA,
  author =       "Diogo Sampaio and Rafael Martins de Souza and Sylvain
                 Collange and Fernando Magno Quint{\~a}o Pereira",
  title =        "Divergence analysis",
  journal =      j-TOPLAS,
  volume =       "35",
  number =       "4",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2013",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2523815",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Dec 31 14:22:03 MST 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "Growing interest in graphics processing units has
                 brought renewed attention to the Single Instruction
                 Multiple Data (SIMD) execution model. SIMD machines
                 give application developers tremendous computational
                 power; however, programming them is still challenging.
                 In particular, developers must deal with memory and
                 control-flow divergences. These phenomena stem from a
                 condition that we call data divergence, which occurs
                 whenever two processing elements (PEs) see the same
                 variable name holding different values. This article
                 introduces divergence analysis, a static analysis that
                 discovers data divergences. This analysis, currently
                 deployed in an industrial quality compiler, is useful
                 in several ways: it improves the translation of SIMD
                 code to non-SIMD CPUs, it helps developers to manually
                 improve their SIMD applications, and it also guides the
                 automatic optimization of SIMD programs. We demonstrate
                 this last point by introducing the notion of a
                 divergence-aware register spiller. This spiller uses
                 information from our analysis to either rematerialize
                 or share common data between PEs. As a testimony of its
                 effectiveness, we have tested it on a suite of 395 CUDA
                 kernels from well-known benchmarks. The
                 divergence-aware spiller produces GPU code that is
                 26.21\% faster than the code produced by the register
                 allocator used in the baseline compiler.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

%%% Application-centric evaluation of OpenCL on multi-core CPUs.
%%% NOTE(review): this entry has no DOI field, only the ScienceDirect URL;
%%% add the DOI once it has been verified against the publisher record.
@Article{Shen:2013:ACE,
  author =       "Jie Shen and Jianbin Fang and Henk Sips and Ana Lucia
                 Varbanescu",
  title =        "An application-centric evaluation of {OpenCL} on
                 multi-core {CPUs}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "39",
  number =       "12",
  pages =        "834--850",
  month =        dec,
  year =         "2013",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Tue Dec 3 18:06:48 MST 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819113001014",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

%%% One-page book review of "CUDA Programming" (Shane Cook) in Network
%%% Security.  The author field is the bare byline "SM-D" (presumably the
%%% reviewer's initials -- confirm before expanding it).
@Article{SM-D:2013:BRC,
  author =       "SM-D",
  title =        "Book Review: {{\booktitle{CUDA Programming}}, Shane
                 Cook. Morgan Kaufmann. ISBN 978-0-12-415933-4}",
  journal =      j-NETWORK-SECURITY,
  volume =       "2013",
  number =       "1",
  pages =        "4--4",
  month =        jan,
  year =         "2013",
  CODEN =        "NTSCF5",
  DOI =          "https://doi.org/10.1016/S1353-4858(13)70015-1",
  ISSN =         "1353-4858 (print), 1872-9371 (electronic)",
  ISSN-L =       "1353-4858",
  bibdate =      "Mon Dec 4 17:00:50 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/network-security.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S1353485813700151",
  acknowledgement = ack-nhfb,
  fjournal =     "Network Security",
  journal-URL =  "https://www.sciencedirect.com/journal/network-security",
}

%%% OpenCL-based object detection on the Ivy Bridge heterogeneous on-chip
%%% architecture.  CODEN ("????") and the final page number ("??") are
%%% still placeholders; the article number (45) is in articleno.
@Article{Totoni:2013:EFE,
  author =       "Ehsan Totoni and Mert Dikmen and Mar{\'\i}a Jes{\'u}s
                 Garzar{\'a}n",
  title =        "Easy, fast, and energy-efficient object detection on
                 heterogeneous on-chip architectures",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "45:1--45:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541228.2555302",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We optimize a visual object detection application
                 (that uses Vision Video Library kernels) and show that
                 OpenCL is a unified programming paradigm that can
                 provide high performance when running on the Ivy Bridge
                 heterogeneous on-chip architecture. We evaluate
                 different mapping techniques and show that running each
                 kernel where it fits the best and using software
                 pipelining can provide 1.91 times higher performance
                 and 42\% better energy efficiency. We also show how to
                 trade accuracy for energy at runtime. Overall, our
                 application can perform accurate object detection at 40
                 frames per second (fps) in an energy-efficient
                 manner.",
  acknowledgement = ack-nhfb,
  articleno =    "45",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Intra-warp compaction (BCC/SCC) for SIMD divergence on GPGPUs, evaluated
%%% with OpenCL and OpenGL workloads.  The note ties the article to the
%%% ISCA '13 proceedings issue of SIGARCH Computer Architecture News.
@Article{Vaidya:2013:SDO,
  author =       "Aniruddha S. Vaidya and Anahita Shayesteh and Dong
                 Hyuk Woo and Roy Saharoy and Mani Azimi",
  title =        "{SIMD} divergence optimization through intra-warp
                 compaction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "368--379",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485954",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "SIMD execution units in GPUs are increasingly used for
                 high performance and energy efficient acceleration of
                 general purpose applications. However, SIMD control
                 flow divergence effects can result in reduced execution
                 efficiency in a class of GPGPU applications, classified
                 as divergent applications. Improving SIMD efficiency,
                 therefore, has the potential to bring significant
                 performance and energy benefits to a wide range of such
                 data parallel applications. Recently, the SIMD
                 divergence problem has received increased attention,
                 and several micro-architectural techniques have been
                 proposed to address various aspects of this problem.
                 However, these techniques are often quite complex and,
                 therefore, unlikely candidates for practical
                 implementation. In this paper, we propose two
                 micro-architectural optimizations for GPGPU
                 architectures, which utilize relatively simple
                 execution cycle compression techniques when certain
                 groups of turned-off lanes exist in the instruction
                 stream. We refer to these optimizations as basic cycle
                 compression (BCC) and swizzled-cycle compression (SCC),
                 respectively. In this paper, we will outline the
                 additional requirements for implementing these
                 optimizations in the context of the studied GPGPU
                 architecture. Our evaluations with divergent SIMD
                 workloads from OpenCL (GPGPU) and OpenGL (graphics)
                 applications show that BCC and SCC reduce execution
                 cycles in divergent applications by as much as 42\%
                 (20\% on average). For a subset of divergent workloads,
                 the execution time is reduced by an average of 7\% for
                 today's GPUs or by 18\% for future GPUs with a better
                 provisioned memory subsystem. The key contribution of
                 our work is in simplifying the micro-architecture for
                 delivering divergence optimizations while providing the
                 bulk of the benefits of more complex approaches.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% Book review in Computing in Science and Engineering 15(5).  The title
%%% wraps the reviewed book's name in \booktitle inside protective braces
%%% (the macro is presumably defined elsewhere in this file -- confirm).
@Article{Vogel:2013:BWC,
  author =       "Thomas Vogel",
  title =        "{{\booktitle{All the Way to CUDA}}} [Book review]",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "15",
  number =       "5",
  pages =        "6--8",
  month =        sep # "\slash " # oct,
  year =         "2013",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2013.101",
  ISSN =         "1521-9615",
  ISSN-L =       "1521-9615",
  bibdate =      "Sat Apr 19 10:17:39 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

%%% OpenMP runtime for multicore embedded systems built on the Multicore
%%% Association (MCA) APIs.  The remark ties the article to the LCTES '13
%%% proceedings issue of SIGPLAN Notices (volume 48, number 5).
@Article{Wang:2013:PMO,
  author =       "Cheng Wang and Sunita Chandrasekaran and Peng Sun and
                 Barbara Chapman and Jim Holt",
  title =        "Portable mapping of {openMP} to multicore embedded
                 systems using {MCA APIs}",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "5",
  pages =        "153--162",
  month =        may,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2499369.2465569",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:32 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Multicore embedded systems are being widely used in
                 telecommunication systems, robotics, medical
                 applications and more. While they offer a
                 high-performance with low-power solution, programming
                 in an efficient way is still a challenge. In order to
                 exploit the capabilities that the hardware offers,
                 software developers are expected to handle many of the
                 low-level details of programming including utilizing
                 DMA, ensuring cache coherency, and inserting
                 synchronization primitives explicitly. The
                 state-of-the-art involves solutions where the software
                 toolchain is too vendor-specific thus tying the
                 software to a particular hardware leaving no room-for
                 portability. In this paper we present a runtime system
                 to explore mapping a high-level programming model,
                 OpenMP, on to multicore embedded systems. A key feature
                 of our scheme is that unlike the existing approaches
                 that largely rely on POSIX threads, our approach
                 leverages the Multicore Association (MCA) APIs as an
                 OpenMP translation layer. The MCA APIs is a set of
                 low-level APIs handling resource management,
                 inter-process communications and task scheduling for
                 multicore embedded systems. By deploying the MCA APIs,
                 our runtime is able to effectively capture the
                 characteristics of multicore embedded systems compared
                 with the POSIX threads. Furthermore, the MCA layer
                 enables our runtime implementation to be portable
                 across various architectures. Thus programmers only
                 need to maintain a single OpenMP code base which is
                 compatible by various compilers, while on the other
                 hand, the code is portable across different possible
                 types of platforms. We have evaluated our runtime
                 system using several embedded benchmarks. The
                 experiments demonstrate promising and competitive
                 performance compared to the native approach for the
                 platform.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "LCTES '13 conference proceedings.",
}

%%% Performance modeling of hybrid MPI/OpenMP scientific applications on
%%% large-scale multicore supercomputers (Journal of Computer and System
%%% Sciences 79(8)).
@Article{Wu:2013:PMH,
  author =       "Xingfu Wu and Valerie Taylor",
  title =        "Performance modeling of hybrid {MPI\slash OpenMP}
                 scientific applications on large-scale multicore
                 supercomputers",
  journal =      j-J-COMP-SYS-SCI,
  volume =       "79",
  number =       "8",
  pages =        "1256--1268",
  month =        dec,
  year =         "2013",
  CODEN =        "JCSSBM",
  DOI =          "https://doi.org/10.1016/j.jcss.2013.02.005",
  ISSN =         "0022-0000 (print), 1090-2724 (electronic)",
  ISSN-L =       "0022-0000",
  bibdate =      "Tue Jan 29 15:27:23 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcompsyssci.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0022000013000639",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computer and System Sciences",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00220000",
}

%%% Performance modeling and optimization of sparse matrix-vector
%%% multiplication on NVIDIA CUDA.  The URL field holds two
%%% semicolon-separated links: the article page and a content/pdf link.
@Article{Xu:2013:PMO,
  author =       "Shiming Xu and Wei Xue and Hai Xiang Lin",
  title =        "Performance modeling and optimization of sparse
                 matrix-vector multiplication on {NVIDIA CUDA}
                 platform",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "63",
  number =       "3",
  pages =        "710--721",
  month =        mar,
  year =         "2013",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-011-0626-0",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Mon Apr 1 14:50:47 MDT 2013",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=63&issue=3;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-011-0626-0;
                 http://link.springer.com/content/pdf/10.1007/s11227-011-0626-0",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

%%% StreamScan: single-phase GPU scan (prefix sum) without global barrier
%%% synchronization, implemented with OpenCL.  The note ties the article
%%% to the PPoPP '13 proceedings issue of SIGPLAN Notices.
@Article{Yan:2013:SFS,
  author =       "Shengen Yan and Guoping Long and Yunquan Zhang",
  title =        "{StreamScan}: fast scan algorithms for {GPUs} without
                 global barrier synchronization",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "8",
  pages =        "229--238",
  month =        aug,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2517327.2442539",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 13:48:51 MDT 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '13 Conference proceedings.",
  abstract =     "Scan (also known as prefix sum) is a very useful
                 primitive for various important parallel algorithms,
                 such as sort, BFS, SpMV, compaction and so on. Current
                 state of the art of GPU based scan implementation
                 consists of three consecutive Reduce-Scan-Scan phases.
                 This approach requires at least two global barriers and
                 3N (N is the problem size) global memory accesses. In
                 this paper we propose StreamScan, a novel approach to
                 implement scan on GPUs with only one computation phase.
                 The main idea is to restrict synchronization to only
                 adjacent workgroups, and thereby eliminating global
                 barrier synchronization completely. The new approach
                 requires only 2N global memory accesses and just one
                 kernel invocation. On top of this we propose two
                 important optimizations to further boost performance
                 speedups, namely thread grouping to eliminate
                 unnecessary local barriers, and register optimization
                 to expand the on chip problem size. We designed an
                 auto-tuning framework to search the parameter space
                 automatically to generate highly optimized codes for
                 both AMD and Nvidia GPUs. We implemented our technique
                 with OpenCL. Compared with previous fast scan
                 implementations, experimental results not only show
                 promising performance speedups, but also reveal
                 dramatic different optimization tradeoffs between
                 Nvidia and AMD GPU platforms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

%%% Two-page SIGMETRICS article on speeding up GPGPU architecture
%%% simulation with synthesized miniature kernels; the abstract names CUDA
%%% as the programming model.  CODEN is still the "????" placeholder.
@Article{Yu:2013:AGA,
  author =       "Zhibin Yu and Lieven Eeckhout and Nilanjan Goswami and
                 Tao Li and Lizy John and Hai Jin and Chengzhong Xu",
  title =        "Accelerating {GPGPU} architecture simulation",
  journal =      j-SIGMETRICS,
  volume =       "41",
  number =       "1",
  pages =        "331--332",
  month =        jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2494232.2465540",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Feb 28 06:09:59 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigmetrics.bib",
  abstract =     "Recently, graphics processing units (GPUs) have opened
                 up new opportunities for speeding up general-purpose
                 parallel applications due to their massive
                 computational power and up to hundreds of thousands of
                 threads enabled by programming models such as CUDA.
                 However, due to the serial nature of existing
                 micro-architecture simulators, these massively parallel
                 architectures and workloads need to be simulated
                 sequentially. As a result, simulating GPGPU
                 architectures with typical benchmarks and input data
                 sets is extremely time-consuming. This paper addresses
                 the GPGPU architecture simulation challenge by
                 generating miniature, yet representative GPGPU kernels.
                 We first summarize the static characteristics of an
                 existing GPGPU kernel in a profile, and analyze its
                 dynamic behavior using the novel concept of the
                 divergence flow statistics graph (DFSG). We
                 subsequently use a GPGPU kernel synthesizing framework
                 to generate a miniature proxy of the original kernel,
                 which can reduce simulation time significantly. The key
                 idea is to reduce the number of simulated instructions
                 by decreasing per-thread iteration counts of loops. Our
                 experimental results show that our approach can
                 accelerate GPGPU architecture simulation by a factor of
                 88X on average and up to 589X with an average IPC
                 relative error of 5.6\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

%%% Hybrid message-passing (MPI) plus multithreading scheme for parallel
%%% molecular docking.  onlinedate (23 Jan 2013) records online publication
%%% ahead of the 30 April 2013 issue date given by day/month/year.
@Article{Zhang:2013:MPI,
  author =       "Xiaohua Zhang and Sergio E. Wong and Felice C.
                 Lightstone",
  title =        "Message passing interface and multithreading hybrid
                 for parallel molecular docking of large databases on
                 petascale high performance computing machines",
  journal =      j-J-COMPUT-CHEM,
  volume =       "34",
  number =       "11",
  pages =        "915--927",
  day =          "30",
  month =        apr,
  year =         "2013",
  CODEN =        "JCCHDD",
  DOI =          "https://doi.org/10.1002/jcc.23214",
  ISSN =         "0192-8651 (print), 1096-987X (electronic)",
  ISSN-L =       "0192-8651",
  bibdate =      "Mon Apr 1 14:26:54 MDT 2013",
  bibsource =    "http://www.interscience.wiley.com/jpages/0192-8651;
                 http://www.math.utah.edu/pub/tex/bib/jcomputchem2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Chemistry",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1096-987X",
  onlinedate =   "23 Jan 2013",
}

%%% OpenMP-parallel CFD-DEM simulations (Journal of Computational Physics,
%%% volume 256).
%%% NOTE(review): the issue number is still the "??" placeholder and the
%%% entry has no DOI field; fill both in once verified.
@Article{Amritkar:2014:EPC,
  author =       "Amit Amritkar and Surya Deb and Danesh Tafti",
  title =        "Efficient parallel {CFD-DEM} simulations using
                 {OpenMP}",
  journal =      j-J-COMPUT-PHYS,
  volume =       "256",
  number =       "??",
  pages =        "501--519",
  day =          "1",
  month =        jan,
  year =         "2014",
  CODEN =        "JCTPAH",
  ISSN =         "0021-9991 (print), 1090-2716 (electronic)",
  ISSN-L =       "0021-9991",
  bibdate =      "Wed Nov 13 14:21:07 MST 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcomputphys2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0021999113006128",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Physics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00219991",
}

%%% ACM Algorithm 944 (Talbot Suite): OpenMP, MPI, and combined MPI/OpenMP
%%% implementations of Talbot's method for numerical inversion of Laplace
%%% transforms.  Page range and articleno both carry article number 29.
@Article{Antonelli:2014:ATS,
  author =       "Laura Antonelli and Stefania Corsaro and Zelda Marino
                 and Mariarosaria Rizzardi",
  title =        "Algorithm 944: {Talbot} Suite: Parallel
                 Implementations of {Talbot}'s Method for the Numerical
                 Inversion of {Laplace} Transforms",
  journal =      j-TOMS,
  volume =       "40",
  number =       "4",
  pages =        "29:1--29:18",
  month =        jun,
  year =         "2014",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2616909",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Wed Jul 2 18:28:58 MDT 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "We present Talbot Suite, a C parallel software
                 collection for the numerical inversion of Laplace
                 Transforms, based on Talbot's method. It is designed to
                 fit both single and multiple Laplace inversion
                 problems, which arise in several application and
                 research fields. In our software, we achieve high
                 accuracy and efficiency, making full use of modern
                 architectures and introducing two different levels of
                 parallelism: coarse and fine grained parallelism. They
                 offer a reasonable tradeoff between accuracy, the main
                 aspect for a few inversions, and efficiency, the main
                 aspect for multiple inversions. To take into account
                 modern high-performance computing architectures, Talbot
                 Suite provides different software versions: an
                 OpenMP-based version for shared memory machines and a
                 MPI-based version for distributed memory machines.
                 Moreover, oriented to hybrid architectures, a combined
                 MPI/OpenMP-based implementation is provided too. We
                 describe our parallel algorithms and the software
                 organization. We also report some performance results.
                 Our software includes sample programs to call the
                 Talbot Suite functions from C and from MATLAB.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

%%% forthreads: a Fortran 2003 wrapper for POSIX threads, including
%%% forthreads/MPI hybrid implementations and a comparison with OpenMP.
%%% Also listed in the fortran3, multithreading, and toms bibliographies.
@Article{Awile:2014:PWF,
  author =       "Omar Awile and Ivo F. Sbalzarini",
  title =        "A {Pthreads} Wrapper for {Fortran 2003}",
  journal =      j-TOMS,
  volume =       "40",
  number =       "3",
  pages =        "19:1--19:15",
  month =        apr,
  year =         "2014",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2558889",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon Apr 21 17:42:14 MDT 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "With the advent of multicore processors, numerical and
                 mathematical software relies on parallelism in order to
                 benefit from hardware performance increases. We present
                 the design and use of a Fortran 2003 wrapper for POSIX
                 threads, called forthreads. Forthreads is complete in
                 the sense that it provides native Fortran 2003
                 interfaces to all pthreads routines where possible. We
                 demonstrate the use and efficiency of forthreads for
                 SIMD parallelism and task parallelism. We present
                 forthreads/MPI implementations that enable hybrid
                 shared-/distributed-memory parallelism in Fortran 2003.
                 Our benchmarks show that forthreads offers performance
                 comparable to that of OpenMP, but better thread control
                 and more freedom. We demonstrate the latter by
                 presenting a multithreaded Fortran 2003 library for
                 POSIX Internet sockets, enabling interactive numerical
                 simulations with runtime control.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Barrett:2014:EMM,
  author =       "Brian W. Barrett and Ron Brightwell and Ryan Grant and
                 Simon D. Hammond and K. Scott Hemmert",
  title =        "An evaluation of {MPI} message rate on hybrid-core
                 processors",
  journal =      j-IJHPCA,
  volume =       "28",
  number =       "4",
  pages =        "415--424",
  month =        nov,
  year =         "2014",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342014552085",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Feb 13 09:17:23 MST 2015",
  bibsource =    "http://hpc.sagepub.com/content/28/4.toc;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/28/4/415",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Beaugnon:2014:VVO,
  author =       "Ulysse Beaugnon and Alexey Kravets and Sven van
                 Haastregt and Riyadh Baghdadi and David Tweed and Javed
                 Absar and Anton Lokhmotov",
  title =        "{VOBLA}: a vehicle for optimized basic linear
                 algebra",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "5",
  pages =        "115--124",
  month =        may,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2666357.2597818",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Sep 26 07:37:30 MDT 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We present VOBLA, a domain-specific language designed
                 for programming linear algebra libraries. VOBLA is
                 compiled to PENCIL, a domain independent intermediate
                 language designed for efficient mapping to accelerator
                 architectures such as GPGPUs. PENCIL is compiled to
                 efficient, platform-specific OpenCL code using
                 techniques based on the polyhedral model. This approach
                 addresses both the programmer productivity and
                 performance portability concerns associated with
                 accelerator programming. We demonstrate our approach by
                 using VOBLA to implement a BLAS library. We have
                 evaluated the performance of OpenCL code generated
                 using our compilation flow on ARM Mali, AMD Radeon, and
                 AMD Opteron platforms. The generated code is currently
                 on average 1.9x slower than highly hand-optimized
                 OpenCL code, but on average 8.1x faster than
                 straightforward OpenCL code. Given that the VOBLA
                 coding takes significantly less effort compared to
                 hand-optimizing OpenCL code, we believe our approach
                 leads to improved productivity and performance
                 portability.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "LCTES '14 conference proceedings.",
}

@Article{Bethune:2014:PAA,
  author =       "Iain Bethune and J. Mark Bull and Nicholas J. Dingle
                 and Nicholas J. Higham",
  title =        "Performance analysis of asynchronous {Jacobi}'s method
                 implemented in {MPI}, {SHMEM} and {OpenMP}",
  journal =      j-IJHPCA,
  volume =       "28",
  number =       "1",
  pages =        "97--111",
  month =        feb,
  year =         "2014",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342013493123",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Mar 14 15:39:59 MDT 2014",
  bibsource =    "http://hpc.sagepub.com/content/28/1.toc;
                 http://www.math.utah.edu/pub/bibnet/authors/h/higham-nicholas-john.bib;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/28/1/97.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
  onlinedate =   "July 11, 2013",
}

@Article{Blas:2014:RAM,
  author =       "Javier Garcia Blas and Jesus Carretero",
  title =        "Recent advances in the {Message Passing Interface}",
  journal =      j-IJHPCA,
  volume =       "28",
  number =       "4",
  pages =        "387--389",
  month =        nov,
  year =         "2014",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342014549273",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Feb 13 09:17:23 MST 2015",
  bibsource =    "http://hpc.sagepub.com/content/28/4.toc;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/28/4/387",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Coole:2014:FFH,
  author =       "James Coole and Greg Stitt",
  title =        "Fast, Flexible High-Level Synthesis from {OpenCL}
                 using Reconfiguration Contexts",
  journal =      j-IEEE-MICRO,
  volume =       "34",
  number =       "1",
  pages =        "42--53",
  month =        jan # "\slash " # feb,
  year =         "2014",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2013.108",
  ISSN =         "0272-1732",
  ISSN-L =       "0272-1732",
  bibdate =      "Thu Aug 21 08:02:34 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Cores:2014:FAM,
  author =       "Iv{\'a}n Cores and Gabriel Rodr{\'\i}guez and Patricia
                 Gonz{\'a}lez and Mar{\'\i}a J. Mart{\'\i}n",
  title =        "Failure Avoidance in {MPI} Applications Using an
                 Application-Level Approach",
  journal =      j-COMP-J,
  volume =       "57",
  number =       "1",
  pages =        "100--114",
  month =        jan,
  year =         "2014",
  CODEN =        "CMPJA6",
  DOI =          "https://doi.org/10.1093/comjnl/bxs158",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Mon Feb 3 17:02:40 MST 2014",
  bibsource =    "http://comjnl.oxfordjournals.org/content/57/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/compj2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://comjnl.oxfordjournals.org/content/57/1/100.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
  onlinedate =   "December 18, 2012",
}

@Article{Cores:2014:MAL,
  author =       "Iv{\'a}n Cores and Gabriel Rodr{\'\i}guez and
                 Mar{\'\i}a J. Mart{\'\i}n",
  title =        "In-memory application-level checkpoint-based migration
                 for {MPI} programs",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "70",
  number =       "2",
  pages =        "660--670",
  month =        nov,
  year =         "2014",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-014-1120-2",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Fri Feb 13 12:32:19 MST 2015",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=70&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-014-1120-2",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Cunningham:2014:RXE,
  author =       "David Cunningham and David Grove and Benjamin Herta
                 and Arun Iyengar and Kiyokuni Kawachiya and Hiroki
                 Murata and Vijay Saraswat and Mikio Takeuchi and
                 Olivier Tardieu",
  title =        "Resilient {X10}: efficient failure-aware programming",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "67--80",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555248",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Scale-out programs run on multiple processes in a
                 cluster. In scale-out systems, processes can fail.
                 Computations using traditional libraries such as MPI
                 fail when any component process fails. The advent of
                 Map Reduce, Resilient Data Sets and MillWheel has shown
                 dramatic improvements in productivity are possible when
                 a high-level programming framework handles scale-out
                 and resilience automatically. We are concerned with the
                 development of general-purpose languages that support
                 resilient programming. In this paper we show how the
                 X10 language and implementation can be extended to
                 support resilience. In Resilient X10, places may fail
                 asynchronously, causing loss of the data and tasks at
                 the failed place. Failure is exposed through
                 exceptions. We identify a {\em Happens Before
                 Invariance Principle} and require the runtime to
                 automatically repair the global control structure of
                 the program to maintain this principle. We show this
                 reduces much of the burden of resilient programming.
                 The programmer is only responsible for continuing
                 execution with fewer computational resources and the
                 loss of part of the heap, and can do so while taking
                 advantage of domain knowledge. We build a complete
                 implementation of the language, capable of executing
                 benchmark applications on hundreds of nodes. We
                 describe the algorithms required to make the language
                 runtime resilient. We then give three applications,
                 each with a different approach to fault tolerance
                 (replay, decimation, and domain-level checkpointing).
                 These can be executed at scale and survive node
                 failure. We show that for these programs the overhead
                 of resilience is a small fraction of overall runtime by
                 comparing to equivalent non-resilient X10 programs. On
                 one program we show end-to-end performance of Resilient
                 X10 is $ \sim $100x faster than Hadoop.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{DAgostino:2014:CAM,
  author =       "Daniele D'Agostino and Andrea Clematis and Sergio
                 Decherchi and Walter Rocchia and Luciano Milanesi and
                 Ivan Merelli",
  title =        "{CUDA} accelerated molecular surface generation",
  journal =      j-CCPE,
  volume =       "26",
  number =       "10",
  pages =        "1819--1831",
  month =        jul,
  year =         "2014",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3120",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Sep 9 16:46:30 MDT 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "15 Aug 2013",
}

@Article{Didelot:2014:IMC,
  author =       "Sylvain Didelot and Patrick Carribault and Marc
                 P{\'e}rache and William Jalby",
  title =        "Improving {MPI} communication overlap with
                 collaborative polling",
  journal =      j-COMPUTING,
  volume =       "96",
  number =       "4",
  pages =        "263--278",
  month =        apr,
  year =         "2014",
  CODEN =        "CMPTA2",
  DOI =          "https://doi.org/10.1007/s00607-013-0327-z",
  ISSN =         "0010-485X (print), 1436-5057 (electronic)",
  ISSN-L =       "0010-485X",
  bibdate =      "Fri Jun 6 10:07:21 MDT 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=96&issue=4;
                 http://www.math.utah.edu/pub/tex/bib/computing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s00607-013-0327-z",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing",
  journal-URL =  "http://link.springer.com/journal/607",
}

@Article{Dinan:2014:ECC,
  author =       "James Dinan and Ryan E. Grant and Pavan Balaji and
                 David Goodell and Douglas Miller and Marc Snir and
                 Rajeev Thakur",
  title =        "Enabling communication concurrency through flexible
                 {MPI} endpoints",
  journal =      j-IJHPCA,
  volume =       "28",
  number =       "4",
  pages =        "390--405",
  month =        nov,
  year =         "2014",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342014548772",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Feb 13 09:17:23 MST 2015",
  bibsource =    "http://hpc.sagepub.com/content/28/4.toc;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/28/4/390",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
  onlinedate =   "September 23, 2014",
}

@Article{DiPierro:2014:PPP,
  author =       "Massimo {Di Pierro}",
  title =        "Portable Parallel Programs with {Python} and
                 {OpenCL}",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "16",
  number =       "1",
  pages =        "34--40",
  month =        jan # "\slash " # feb,
  year =         "2014",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2013.99",
  ISSN =         "1521-9615",
  ISSN-L =       "1521-9615",
  bibdate =      "Sat Apr 19 10:17:39 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/python.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

@Article{Fang:2014:API,
  author =       "Jianbin Fang and Henk Sips and Ana Lucia Varbanescu",
  title =        "{Aristotle}: A performance impact indicator for the
                 {OpenCL} kernels using local memory",
  journal =      j-SCI-PROG,
  volume =       "22",
  number =       "3",
  pages =        "239--257",
  month =        "????",
  year =         "2014",
  CODEN =        "SCIPEV",
  DOI =          "https://doi.org/10.3233/SPR-140390",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Tue Sep 9 18:01:15 MDT 2014",
  bibsource =    "http://www.iospress.nl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprog.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@InProceedings{Feng:2014:MSP,
  author =       "Chunsheng Feng and Shi Shu and Jinchao Xu and
                 Chen-Song Zhang",
  title =        "A Multi-Stage Preconditioner for the Black Oil Model
                 and Its {OpenMP} Implementation",
  crossref =     "Erhel:2014:DDM",
  volume =       "98",
  pages =        "141--153",
  year =         "2014",
  DOI =          "https://doi.org/10.1007/978-3-319-05789-7_11",
  bibdate =      "Sat Dec 12 10:22:13 MST 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-319-05789-7_11/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-319-05789-7",
  book-URL =     "http://www.springerlink.com/content/978-3-319-05789-7",
}

@Article{Feng:2014:SBS,
  author =       "Xiaowen Feng and Hai Jin and Ran Zheng and Zhiyuan
                 Shao and Lei Zhu",
  title =        "A segment-based sparse matrix--vector multiplication
                 on {CUDA}",
  journal =      j-CCPE,
  volume =       "26",
  number =       "1",
  pages =        "271--286",
  month =        jan,
  year =         "2014",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.2978",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Feb 8 15:45:08 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "7 Dec 2012",
}

@Article{Gerstenberger:2014:EHS,
  author =       "Robert Gerstenberger and Maciej Besta and Torsten
                 Hoefler",
  title =        "Enabling highly-scalable remote memory access
                 programming with {MPI-3 One Sided}",
  journal =      j-SCI-PROG,
  volume =       "22",
  number =       "2",
  pages =        "75--91",
  month =        "????",
  year =         "2014",
  CODEN =        "SCIPEV",
  DOI =          "https://doi.org/10.3233/SPR-140383",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Tue Sep 9 18:01:01 MDT 2014",
  bibsource =    "http://www.iospress.nl/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprog.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Gonina:2014:SMC,
  author =       "Ekaterina Gonina and Gerald Friedland and Eric
                 Battenberg and Penporn Koanantakool and Michael
                 Driscoll and Evangelos Georganas and Kurt Keutzer",
  title =        "Scalable multimedia content analysis on parallel
                 platforms using {Python}",
  journal =      j-TOMCCAP,
  volume =       "10",
  number =       "2",
  pages =        "18:1--18:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2517151",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:57 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/python.bib;
                 http://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this new era dominated by consumer-produced media
                 there is a high demand for web-scalable solutions to
                 multimedia content analysis. A compelling approach to
                 making applications scalable is to explicitly map their
                 computation onto parallel platforms. However,
                 developing efficient parallel implementations and fully
                 utilizing the available resources remains a challenge
                 due to the increased code complexity, limited
                 portability and required low-level knowledge of the
                 underlying hardware. In this article, we present
                 PyCASP, a Python-based framework that automatically
                 maps computation onto parallel platforms from Python
                 application code to a variety of parallel platforms.
                 PyCASP is designed using a systematic, pattern-oriented
                 approach to offer a single software development
                 environment for multimedia content analysis
                 applications. Using PyCASP, applications can be
                 prototyped in a couple hundred lines of Python code and
                 automatically scale to modern parallel processors.
                 Applications written with PyCASP are portable to a
                 variety of parallel platforms and efficiently scale
                 from a single desktop Graphics Processing Unit (GPU) to
                 an entire cluster with a small change to application
                 code. To illustrate our approach, we present three
                 multimedia content analysis applications that use our
                 framework: a state-of-the-art speaker diarization
                 application, a content-based music recommendation
                 system based on the Million Song Dataset, and a video
                 event detection system for consumer-produced videos. We
                 show that across this wide range of applications, our
                 approach achieves the goal of automatic portability and
                 scalability while at the same time allowing easy
                 prototyping in a high-level language and efficient
                 performance of low-level optimized code.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Guerrero:2014:PCM,
  author =       "Gin{\'e}s D. Guerrero and Richard M. Wallace and
                 Jos{\'e} L. V{\'a}zquez-Poletti and Jos{\'e} M. Cecilia
                 and Jos{\'e} M. Garc{\'\i}a and Daniel Mozos and
                 Horacio P{\'e}rez-S{\'a}nchez",
  title =        "A performance\slash cost model for a {CUDA} drug
                 discovery application on physical and public cloud
                 infrastructures",
  journal =      j-CCPE,
  volume =       "26",
  number =       "10",
  pages =        "1787--1798",
  month =        jul,
  year =         "2014",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3117",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Sep 9 16:46:30 MDT 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "14 Aug 2013",
}

@Article{Hall:2014:MMC,
  author =       "Clifford Hall and Weixiao Ji and Estela
                 Blaisten-Barojas",
  title =        "The {Metropolis Monte Carlo} method with {CUDA}
                 enabled {Graphic Processing Units}",
  journal =      j-J-COMPUT-PHYS,
  volume =       "258",
  number =       "??",
  pages =        "871--879",
  day =          "1",
  month =        feb,
  year =         "2014",
  CODEN =        "JCTPAH",
  ISSN =         "0021-9991 (print), 1090-2716 (electronic)",
  ISSN-L =       "0021-9991",
  bibdate =      "Mon Dec 23 10:39:12 MST 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcomputphys2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0021999113007626",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Physics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00219991/",
}

@Book{Hanson:2014:NCM,
  author =       "Richard J. Hanson and Tim Hopkins",
  title =        "Numerical computing with modern {Fortran}",
  publisher =    pub-SIAM,
  address =      pub-SIAM:adr,
  pages =        "xv + 244",
  year =         "2014",
  ISBN =         "1-61197-311-2 (paperback), 1-61197-312-0 (e-book)",
  ISBN-13 =      "978-1-61197-311-2 (paperback), 978-1-61197-312-9
                 (e-book)",
  LCCN =         "QA76.73.F25 H367 2013",
  bibdate =      "Wed Mar 12 11:09:16 MDT 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/numana2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  series =       "Applied mathematics",
  abstract =     "The Fortran language standard has undergone
                 significant upgrades in recent years (1990, 1995, 2003,
                 and 2008). \booktitle{Numerical Computing with Modern
                 Fortran} illustrates many of these improvements through
                 practical solutions to a number of scientific and
                 engineering problems. Readers will discover: techniques
                 for modernizing algorithms written in Fortran; examples
                 of Fortran interoperating with C or C++ programs, plus
                 using the IEEE floating-point standard for efficiency;
                 illustrations of parallel Fortran programming using
                 coarrays, MPI, and OpenMP; and a supplementary website
                 with downloadable source codes discussed in the book.",
  acknowledgement = ack-nhfb,
  subject =      "FORTRAN (Computer program language); Numerical
                 analysis; Computer programs; Science; Mathematics",
  tableofcontents = "Introduction \\
                 The modern Fortran source \\
                 Modules for subprogram libraries \\
                 Generic subprograms \\
                 Sparse matrices, defined operations, overloaded
                 assignment \\
                 Object-oriented programming for numerical applications
                 \\
                 Recursion in Fortran \\
                 Case study: toward a modern QUADPACK routine \\
                 Case study: quadrature routine qag2003 \\
                 IEEE arithmetic features and exception handling \\
                 Interoperability with C \\
                 Defined operations for sparse matrix solutions \\
                 Case study: two sparse least-squares system examples
                 \\
                 Message passing with MPI in standard Fortran \\
                 Coarrays in standard Fortran \\
                 OpenMP in Fortran \\
                 Modifying source to remove obsolescent or deleted
                 features \\
                 Software testing \\
                 Compilers \\
                 Software tools \\
                 Fortran book code on SIAM web site \\
                 Bibliography \\
                 Index",
}

@InProceedings{Haynes:2014:MOA,
  author =       "Ronald D. Haynes and Benjamin W. Ong",
  title =        "{MPI--OpenMP} Algorithms for the Parallel Space-Time
                 Solution of Time Dependent {PDEs}",
  crossref =     "Erhel:2014:DDM",
  volume =       "98",
  pages =        "179--187",
  year =         "2014",
  DOI =          "https://doi.org/10.1007/978-3-319-05789-7_14",
  bibdate =      "Sat Dec 12 10:22:13 MST 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-319-05789-7_14/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-319-05789-7",
  book-URL =     "http://www.springerlink.com/content/978-3-319-05789-7",
}

@Article{Holmen:2014:ASI,
  author =       "John K. Holmen and David L. Foster",
  title =        "Accelerating Single Iteration Performance of
                 {CUDA}-Based {$3$D} Reaction--Diffusion Simulations",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "42",
  number =       "2",
  pages =        "343--363",
  month =        apr,
  year =         "2014",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-013-0251-z",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Thu Mar 13 19:25:13 MDT 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=42&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "See erratum \cite{Holmen:2014:EAS}.",
  URL =          "http://link.springer.com/article/10.1007/s10766-013-0251-z",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Holmen:2014:EAS,
  author =       "John K. Holmen and David L. Foster",
  title =        "Erratum to: Accelerating Single Iteration Performance
                 of {CUDA}-Based {$3$D} Reaction--Diffusion
                 Simulations",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "42",
  number =       "2",
  pages =        "364--364",
  month =        apr,
  year =         "2014",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-014-0305-x",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Thu Mar 13 19:25:13 MDT 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=42&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "See \cite{Holmen:2014:ASI}.",
  URL =          "http://link.springer.com/content/pdf/10.1007/s10766-014-0305-x.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Jenkins:2014:PMD,
  author =       "John Jenkins and James Dinan and Pavan Balaji and Tom
                 Peterka and Nagiza F. Samatova and Rajeev Thakur",
  title =        "Processing {MPI} Derived Datatypes on Noncontiguous
                 {GPU}-Resident Data",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "25",
  number =       "10",
  pages =        "2627--2637",
  month =        oct,
  year =         "2014",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Feb 12 13:58:32 MST 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.computer.org/csdl/trans/td/2014/10/06600679-abs.html",
  abstract-URL = "http://www.computer.org/csdl/trans/td/2014/10/06600679-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Jie:2014:ASP,
  author =       {Liang Jie and KenLi Li and Lin Shi and RangSu Liu and
                 Jing Mei},
  title =        {Accelerating solidification process simulation for
                 large-sized system of liquid metal atoms using {GPU}
                 with {CUDA}},
  journal =      j-J-COMPUT-PHYS,
  volume =       {257},
  number =       {??},
  pages =        {521--535},
  day =          {15},
  month =        jan,
  year =         {2014},
  CODEN =        {JCTPAH},
  ISSN =         {0021-9991 (print), 1090-2716 (electronic)},
  ISSN-L =       {0021-9991},
  bibdate =      {Sat Nov 30 14:26:13 MST 2013},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jcomputphys2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0021999113006803},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Computational Physics},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00219991},
}

@Article{Joldes:2014:SSH,
  author =       {Mioara Joldes and Valentina Popescu and Warwick
                 Tucker},
  title =        {Searching for Sinks for the {H{\'e}non} Map using a
                 Multiple-precision {GPU} Arithmetic Library},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {42},
  number =       {4},
  pages =        {63--68},
  month =        sep,
  year =         {2014},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2693714.2693726},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Wed Jun 3 11:27:35 MDT 2015},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {Today, GPUs represent an important hardware
                 development platform for many problems in dynamical
                 systems, where massive parallel computations are
                 needed. Beside that, many numerical studies of chaotic
                 dynamical systems require a computing precision higher
                 than common floating point (FP) formats. One such
                 application is locating invariant sets for chaotic
                 dynamical systems. In particular, we focus on
                 rigorously proving the existence of stable periodic
                 orbits for the H{\'e}non map for parameter values close
                 to the classical ones. For that, we present a
                 multiple-precision floating-point arithmetic library in
                 CUDA programming language for the NVIDIA GPU platform.
                 Our library extends the precision using so-called FP
                 expansions, where a number is represented as the
                 unevaluated sum of standard machine precision FP
                 numbers. This format offers the advantage of using
                 directly available and highly optimized hardware FP
                 operations. We generalize algorithms used by
                 multiple-precisions libraries such as Bailey's QD, or
                 the analogue GPU version, GQD.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J89},
  remark =       {HEART '14 conference proceedings.},
}

@Article{Jung:2014:MCM,
  author =       {Jaewoon Jung and Takaharu Mori and Yuji Sugita},
  title =        {Midpoint cell method for hybrid {(MPI + OpenMP)}
                 parallelization of molecular dynamics simulations},
  journal =      j-J-COMPUT-CHEM,
  volume =       {35},
  number =       {14},
  pages =        {1064--1072},
  day =          {30},
  month =        may,
  year =         {2014},
  CODEN =        {JCCHDD},
  DOI =          {https://doi.org/10.1002/jcc.23591},
  ISSN =         {0192-8651 (print), 1096-987X (electronic)},
  ISSN-L =       {0192-8651},
  bibdate =      {Wed Aug 27 06:34:07 MDT 2014},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jcomputchem2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Computational Chemistry},
  journal-URL =  {http://www.interscience.wiley.com/jpages/0192-8651},
  onlinedate =   {23 Mar 2014},
}

@Article{Kamal:2014:IFG,
  author =       {Humaira Kamal and Alan Wagner},
  title =        {An integrated fine-grain runtime system for {MPI}},
  journal =      j-COMPUTING,
  volume =       {96},
  number =       {4},
  pages =        {293--309},
  month =        apr,
  year =         {2014},
  CODEN =        {CMPTA2},
  DOI =          {https://doi.org/10.1007/s00607-013-0329-x},
  ISSN =         {0010-485X (print), 1436-5057 (electronic)},
  ISSN-L =       {0010-485X},
  bibdate =      {Fri Jun 6 10:07:21 MDT 2014},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=96&issue=4;
                 http://www.math.utah.edu/pub/tex/bib/computing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer.com/article/10.1007/s00607-013-0329-x},
  acknowledgement = ack-nhfb,
  fjournal =     {Computing},
  journal-URL =  {http://link.springer.com/journal/607},
}

@Article{Kim:2014:VVF,
  author =       {Young-Joo Kim and Sejun Song and Yong-Kee Jun},
  title =        {{VORD}: A Versatile On-the-fly Race Detection Tool in
                 {OpenMP} Programs},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {42},
  number =       {6},
  pages =        {900--930},
  month =        dec,
  year =         {2014},
  CODEN =        {IJPPE5},
  DOI =          {https://doi.org/10.1007/s10766-013-0257-6},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Wed Sep 10 07:13:09 MDT 2014},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=42&issue=6;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer.com/article/10.1007/s10766-013-0257-6},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Komura:2014:CPG,
  author =       {Yukihiro Komura and Yutaka Okabe},
  title =        {{CUDA} programs for the {GPU} computing of the
                 {Swendsen--Wang} multi-cluster spin flip algorithm:
                 {$2$D} and {$3$D} {Ising}, {Potts}, and {$ X Y $}
                 models},
  journal =      j-COMP-PHYS-COMM,
  volume =       {185},
  number =       {3},
  pages =        {1038--1043},
  month =        mar,
  year =         {2014},
  CODEN =        {CPHCBZ},
  ISSN =         {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L =       {0010-4655},
  bibdate =      {Tue Feb 4 19:25:59 MST 2014},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0010465513003743},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00104655/},
}

@Article{Kumar:2014:OMC,
  author =       {Sameer Kumar and Amith Mamidala and Philip
                 Heidelberger and Dong Chen and Daniel Faraj},
  title =        {Optimization of {MPI} collective operations on the
                 {IBM Blue Gene/Q} supercomputer},
  journal =      j-IJHPCA,
  volume =       {28},
  number =       {4},
  pages =        {450--464},
  month =        nov,
  year =         {2014},
  CODEN =        {IHPCFL},
  DOI =          {https://doi.org/10.1177/1094342014552086},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Fri Feb 13 09:17:23 MST 2015},
  bibsource =    {http://hpc.sagepub.com/content/28/4.toc;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://hpc.sagepub.com/content/28/4/450},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
  journal-URL =  {http://hpc.sagepub.com/content/by/year},
}

@Article{Langr:2014:APP,
  author =       {Daniel Langr and Pavel Tvrd{\'\i}k and Tom{\'a}s
                 Dytrych and Jerry P. Draayer},
  title =        {{Algorithm 947}: {Paraperm} --- Parallel Generation of
                 Random Permutations with {MPI}},
  journal =      j-TOMS,
  volume =       {41},
  number =       {1},
  pages =        {5:1--5:26},
  month =        oct,
  year =         {2014},
  CODEN =        {ACMSCU},
  DOI =          {https://doi.org/10.1145/2669372},
  ISSN =         {0098-3500 (print), 1557-7295 (electronic)},
  ISSN-L =       {0098-3500},
  bibdate =      {Mon Oct 27 16:37:25 MDT 2014},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib},
  abstract =     {An algorithm for parallel generation of a random
                 permutation of a large set of distinct integers is
                 presented. This algorithm is designed for massively
                 parallel systems with distributed memory architectures
                 and the MPI-based runtime environments. Scalability of
                 the algorithm is analyzed according to the memory and
                 communication requirements. An implementation of the
                 algorithm in a form of a software library based on the
                 C++ programming language and the MPI application
                 programming interface is further provided. Finally,
                 performed experiments are described and their results
                 discussed. The biggest of these experiments resulted in
                 a generation of a random permutation of $ 2^{41} $
                 integers in slightly more than four minutes using
                 131072 CPU cores.},
  acknowledgement = ack-nhfb,
  articleno =    {5},
  fjournal =     {ACM Transactions on Mathematical Software (TOMS)},
  journal-URL =  {http://dl.acm.org/pub.cfm?id=J782},
}

@Article{LaSalle:2014:MBD,
  author =       {Dominique LaSalle and George Karypis},
  title =        {{MPI} for Big Data: New tricks for an old dog},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {40},
  number =       {10},
  pages =        {754--767},
  month =        dec,
  year =         {2014},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Mon Nov 24 12:48:48 MST 2014},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167819114000830},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191/},
}

@Article{Lee:2014:BCA,
  author =       {Changmin Lee and Won Woo Ro and Jean-Luc Gaudiot},
  title =        {Boosting {CUDA} Applications with {CPU--GPU} Hybrid
                 Computing},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {42},
  number =       {2},
  pages =        {384--404},
  month =        apr,
  year =         {2014},
  CODEN =        {IJPPE5},
  DOI =          {https://doi.org/10.1007/s10766-013-0252-y},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Thu Mar 13 19:25:13 MDT 2014},
  bibsource =    {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=42&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer.com/article/10.1007/s10766-013-0252-y},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Losada:2014:EAL,
  author =       {N. Losada and M. J. Mart{\'\i}n and G. Rodr{\'\i}guez
                 and P. Gonz{\'a}lez},
  title =        {Extending an Application-Level Checkpointing Tool to
                 Provide Fault Tolerance Support to {OpenMP}
                 Applications},
  journal =      j-J-UCS,
  volume =       {20},
  number =       {9},
  pages =        {1351--??},
  month =        {????},
  year =         {2014},
  CODEN =        {????},
  ISSN =         {0948-695X (print), 0948-6968 (electronic)},
  ISSN-L =       {0948-6968},
  bibdate =      {Fri Feb 13 11:25:50 MST 2015},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jucs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.jucs.org/jucs_20_9/extending_an_application_level},
  acknowledgement = ack-nhfb,
  fjournal =     {J.UCS: Journal of Universal Computer Science},
  journal-URL =  {http://www.jucs.org/jucs},
}

@Article{Luo:2014:ISM,
  author =       {Miao Luo and Xiaoyi Lu and Khaled Hamidouche and
                 Krishna Kandalla and Dhabaleswar K. Panda},
  title =        {Initial study of multi-endpoint runtime for {MPI +
                 OpenMP} hybrid programming model on multi-core
                 systems},
  journal =      j-SIGPLAN,
  volume =       {49},
  number =       {8},
  pages =        {395--396},
  month =        aug,
  year =         {2014},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/2692916.2555287},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Wed Nov 26 16:26:30 MST 2014},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {State-of-the-art MPI libraries rely on locks to
                 guarantee thread-safety. This discourages application
                 developers from using multiple threads to perform MPI
                 operations. In this paper, we propose a high
                 performance, lock-free multi-endpoint MPI runtime,
                 which can achieve up to 40\% improvement for
                 point-to-point operation and one representative
                 collective operation with minimum or no modifications
                 to the existing applications.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {PPoPP '14 conference proceedings.},
}

@Article{Mitra:2014:AAP,
  author =       "Subrata Mitra and Ignacio Laguna and Dong H. Ahn and
                 Saurabh Bagchi and Martin Schulz and Todd Gamblin",
  title =        "Accurate application progress analysis for large-scale
                 parallel debugging",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "6",
  pages =        "193--203",
  month =        jun,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2666356.2594336",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Sep 26 07:38:28 MDT 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Debugging large-scale parallel applications is
                 challenging. In most HPC applications, parallel tasks
                 progress in a coordinated fashion, and thus a fault in
                 one task can quickly propagate to other tasks, making
                 it difficult to debug. Finding the least-progressed
                 tasks can significantly reduce the effort to identify
                 the task where the fault originated. However, existing
                 approaches for detecting them suffer low accuracy and
                 large overheads; either they use imprecise static
                 analysis or are unable to infer progress dependence
                 inside loops. We present a loop-aware
                 progress-dependence analysis tool, Prodometer, which
                 determines relative progress among parallel tasks via
                 dynamic analysis. Our fault-injection experiments
                 suggest that its accuracy and precision are over 90\%
                 for most cases and that it scales well up to 16,384 MPI
                 tasks. Further, our case study shows that it
                 significantly helped diagnosing a perplexing error in
                 MPI, which only manifested at large scale.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '14 conference proceedings.",
}

@Article{Molero-Armenta:2014:OOI,
  author =       {M. Molero-Armenta and Ursula Iturrar{\'a}n-Viveros and
                 S. Aparicio and M. G. Hern{\'a}ndez},
  title =        {Optimized {OpenCL} implementation of the
                 {Elastodynamic Finite Integration Technique} for
                 viscoelastic media},
  journal =      j-COMP-PHYS-COMM,
  volume =       {185},
  number =       {10},
  pages =        {2683--2696},
  month =        oct,
  year =         {2014},
  CODEN =        {CPHCBZ},
  ISSN =         {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L =       {0010-4655},
  bibdate =      {Sat Aug 16 08:37:41 MDT 2014},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0010465514001702},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00104655/},
}

@Article{Morishima:2014:PEG,
  author =       "Shin Morishima and Hiroki Matsutani",
  title =        "Performance Evaluations of Graph Database using {CUDA}
                 and {OpenMP} Compatible Libraries",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "75--80",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693728",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Graph databases use graph structures to store data
                 sets as nodes, edges, and properties. They are used to
                 store and search the relationships between a large
                 number of nodes, such as social networking services and
                 recommendation engines that use customer social graphs.
                 Since computation cost for graph search queries
                 increases as the graph becomes large, in this paper we
                 accelerate the graph search functions (Dijkstra and A*
                 algorithms) of a graph database Neo4j using two ways:
                 multithreaded library and CUDA library for graphics
                 processing units (GPUs). We use 100,000-node graphs
                 generated based on a degree distribution of Facebook
                 social graph for evaluations. Our multi-threaded and
                 GPU-based implementations require an auxiliary
                 adjacency matrix for a target graph. The results show
                 that, when we do not take into account additional
                 overhead to generate the auxiliary adjacency matrix,
                 multi-threaded version improves the Dijkstra and A*
                 search performance by 16.2x and 13.8x compared to the
                 original implementation. The GPU-based implementation
                 improves the Dijkstra and A* search performance by
                 26.2x and 32.8x. When we take into account the
                 overhead, although the speed-ups by our implementations
                 are reduced, by reusing the auxiliary adjacency matrix
                 for multiple graph search queries we can significantly
                 improve the graph search performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Nomura:2014:PAM,
  author =       {Shimpei Nomura and Takuji Mitsuishi and Jun Suzuki and
                 Yuki Hayashi and Masaki Kan and Hideharu Amano},
  title =        {Performance Analysis of the Multi-{GPU} System with
                 {ExpEther}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {42},
  number =       {4},
  pages =        {9--14},
  month =        sep,
  year =         {2014},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2693714.2693717},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Wed Jun 3 11:27:35 MDT 2015},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  abstract =     {A GPU cluster in which each node provides a few GPUs
                 connected with PCIe (PCI Express) is commonly used for
                 acceleration of a large application program requiring
                 the performance beyond a single GPU. However, in such a
                 system, programmers are required to describe two
                 parallel programming between nodes in MPIs or other
                 message passing library as well as the fine grained
                 parallel programming for intra-GPUs. As a cost
                 effective alternative of such clusters, we propose a
                 novel multi-GPU system with ExpEther, a virtualization
                 technique which extends PCIe of a host CPU to Ethernet.
                 All devices connected by ExpEther can be treated as if
                 they were directly connected to the host. Evaluation
                 with two application programs with and without GPU-GPU
                 communication revealed that the proposed system with
                 four GPUs achieved 3.88 and 3.29 times performance
                 improvement respectively compared with a single GPU
                 system. Compared with GPU cluster system in which each
                 node provides a GPU, the proposed system achieved about
                 7\% and 30\% performance improvement, respectively.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J89},
  remark =       {HEART '14 conference proceedings.},
}

@Article{Olukotun:2014:BPP,
  author =       "Kunle Olukotun",
  title =        "Beyond parallel programming with domain specific
                 languages",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "179--180",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2557966",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Today, almost all computer architectures are parallel
                 and heterogeneous; a combination of multiple CPUs, GPUs
                 and specialized processors. This creates a challenging
                 problem for application developers who want to develop
                 high performance programs without the effort required
                 to use low-level, architecture specific parallel
                 programming models (e.g., OpenMP for CMPs, CUDA for
                 GPUs, MPI for clusters). Domain-specific languages
                 (DSLs) are a promising solution to this problem because
                 they can provide an avenue for high-level
                 application-specific abstractions with implicit
                 parallelism to be mapped directly to low level
                 architecture-specific programming models; providing
                 both high programmer productivity and high execution
                 performance. In this talk I will describe an approach
                 to building high performance DSLs, which is based on
                 DSL embedding in a general purpose programming
                 language, metaprogramming and a DSL infrastructure
                 called Delite. I will describe how we transform DSL
                 programs into efficient first-order low-level code
                 using domain specific optimization, parallelism and
                 locality optimization with parallel patterns, and
                 architecture-specific code generation. All
                 optimizations and transformations are implemented in
                 Delite: an extensible DSL compiler infrastructure that
                 significantly reduces the effort required to develop
                 new DSLs. Delite DSLs for machine learning, data
                 querying, graph analysis, and scientific computing all
                 achieve performance competitive with manually
                 parallelized C++ code.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{Pal:2014:PMH,
  author =       {Anirban Pal and Abhishek Agarwala and Soumyendu Raha
                 and Baidurya Bhattacharya},
  title =        {Performance metrics in a hybrid {MPI--OpenMP} based
                 molecular dynamics simulation with short-range
                 interactions},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {74},
  number =       {3},
  pages =        {2203--2214},
  month =        mar,
  year =         {2014},
  CODEN =        {JPDCER},
  ISSN =         {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L =       {0743-7315},
  bibdate =      {Tue Jan 28 12:39:53 MST 2014},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0743731513002505},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/07437315/},
}

@Article{Panda:2014:GAM,
  author =       "Dhabaleswar K. Panda",
  title =        "{GPU}-Aware {MPI} on {RDMA}-Enabled Clusters: Design,
                 Implementation and Evaluation",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "25",
  number =       "10",
  pages =        "2595--2605",
  month =        oct,
  year =         "2014",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Feb 12 13:58:32 MST 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.computer.org/csdl/trans/td/2014/10/06587715-abs.html",
  abstract-URL = "http://www.computer.org/csdl/trans/td/2014/10/06587715-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Pawliczek:2014:VED,
  author =       {Piotr Pawliczek and Witold Dzwinel and David A. Yuen},
  title =        {Visual exploration of data by using multidimensional
                 scaling on multicore {CPU}, {GPU}, and {MPI} cluster},
  journal =      j-CCPE,
  volume =       {26},
  number =       {3},
  pages =        {662--682},
  day =          {10},
  month =        mar,
  year =         {2014},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.3027},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Thu Feb 27 14:51:21 MST 2014},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {30 Apr 2013},
}

@Article{Pena:2014:CEC,
  author =       {Antonio J. Pe{\~n}a and Carlos Rea{\~n}o and Federico
                 Silla and Rafael Mayo and Enrique S. Quintana-Ort{\'\i}
                 and Jos{\'e} Duato},
  title =        {A complete and efficient {CUDA}-sharing solution for
                 {HPC} clusters},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {40},
  number =       {10},
  pages =        {574--588},
  month =        dec,
  year =         {2014},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Mon Nov 24 12:48:48 MST 2014},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167819114001227},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191/},
}

@Article{Peng:2014:BAH,
  author          = {Yuanxi Peng and Manuel Salda{\~n}a and Christopher A.
                     Madill and Xiaofeng Zou and Paul Chow},
  title           = {Benefits of Adding Hardware Support for Broadcast and
                     Reduce Operations in {MPSoC} Applications},
  journal         = j-TRETS,
  volume          = {7},
  number          = {3},
  pages           = {17:1--17:??},
  month           = aug,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2629470},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Sep 1 10:42:23 MDT 2014},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {MPI has been used as a parallel programming model for
                     supercomputers and clusters and recently in
                     MultiProcessor Systems-on-Chip (MPSoC). One component
                     of MPI is collective communication and its performance
                     is key for certain parallel applications to achieve
                     good speedups. Previous work showed that, with
                     synthetic communication-only benchmarks, communication
                     improvements of up to 11.4-fold and 22-fold for
                     broadcast and reduce operations, respectively, can be
                     achieved by providing hardware support at the network
                     level in a Network-on-Chip (NoC). However, these
                     numbers do not provide a good estimation of the
                     advantage for actual applications, as there are other
                     factors that affect performance besides communications,
                     such as computation. To this end, we extend our
                     previous work by evaluating the impact of hardware
                     support over a set of five parallel application kernels
                     of varying computation-to-communication ratios. By
                     introducing some useful computation to the performance
                     evaluation, we obtain more representative results of
                     the benefits of adding hardware support for broadcast
                     and reduce operations. The experiments show that
                     applications with lower computation-to-communication
                     ratios benefit the most from hardware support as they
                     highly depend on efficient collective communications to
                     achieve better scalability. We also extend our work by
                     doing more analysis on clock frequency, resource usage,
                     power, and energy. The results show reasonable
                     scalability for resource utilization and power in the
                     network interfaces as the number of channels increases
                     and that, even though more power is dissipated in the
                     network interfaces due to the added hardware, the total
                     energy used can still be less if the actual speedup is
                     sufficient. The application kernels are executed in a
                     24-embedded-processor system distributed across four
                     FPGAs.},
  acknowledgement = ack-nhfb,
  articleno       = {17},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

%%% The ISSN field lists both the print and the electronic number, in
%%% the same qualified form used by the other journal entries.
@Article{Peng:2014:IDI,
  author =       "Yi Peng and Li Chen and Jun-Hai Yong",
  title =        "Importance-Driven Isosurface Decimation for
                 Visualization of Large Simulation Data Based on
                 {OpenCL}",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "16",
  number =       "1",
  pages =        "24--32",
  month =        jan # "\slash " # feb,
  year =         "2014",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2013.45",
  ISSN =         "1521-9615 (print), 1558-366X (electronic)",
  ISSN-L =       "1521-9615",
  bibdate =      "Sat Apr 19 10:17:39 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

@Article{Priimak:2014:FDN,
  author          = {Dmitri Priimak},
  title           = {Finite difference numerical method for the
                     superlattice {Boltzmann} transport equation and case
                     comparison of {CPU(C)} and {GPU(CUDA)}
                     implementations},
  journal         = j-J-COMPUT-PHYS,
  volume          = {278},
  number          = {??},
  pages           = {182--192},
  day             = {1},
  month           = dec,
  year            = {2014},
  CODEN           = {JCTPAH},
  DOI             = {https://doi.org/10.1016/j.jcp.2014.08.028},
  ISSN            = {0021-9991 (print), 1090-2716 (electronic)},
  ISSN-L          = {0021-9991},
  bibdate         = {Tue Sep 23 17:27:17 MDT 2014},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jcomputphys2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0021999114005828},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Computational Physics},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00219991/},
}

@Article{Rodrigues:2014:TPS,
  author          = {Christopher Rodrigues and Thomas Jablin and Abdul
                     Dakkak and Wen-Mei Hwu},
  title           = {{Triolet}: a programming system that unifies
                     algorithmic skeleton interfaces for high-performance
                     cluster computing},
  journal         = j-SIGPLAN,
  volume          = {49},
  number          = {8},
  pages           = {247--258},
  month           = aug,
  year            = {2014},
  CODEN           = {SINODQ},
  DOI             = {https://doi.org/10.1145/2692916.2555268},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  bibdate         = {Wed Nov 26 16:26:30 MST 2014},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {Functional algorithmic skeletons promise a high-level
                     programming interface for distributed-memory clusters
                     that free developers from concerns of task
                     decomposition, scheduling, and communication.
                     Unfortunately, prior distributed functional skeleton
                     frameworks do not deliver performance comparable to
                     that achievable in a low-level distributed programming
                     model such as C with MPI and OpenMP, even when used in
                     concert with high-performance array libraries. There
                     are several causes: they do not take advantage of
                     shared memory on each cluster node; they impose a fixed
                     partitioning strategy on input data; and they have
                     limited ability to fuse loops involving skeletons that
                     produce a variable number of outputs per input. We
                     address these shortcomings in the Triolet programming
                     language through a modular library design that
                     separates concerns of parallelism, loop nesting, and
                     data partitioning. We show how Triolet substantially
                     improves the parallel performance of algorithms
                     involving array traversals and nested, variable-size
                     loops over what is achievable in Eden, a distributed
                     variant of Haskell. We further demonstrate how Triolet
                     can substantially simplify parallel programming
                     relative to C with MPI and OpenMP while achieving
                     23--100\% of its performance on a 128-core cluster.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {PPoPP '14 conference proceedings.},
}

@Article{Saillard:2014:PCS,
  author          = {Emmanuelle Saillard and Patrick Carribault and Denis
                     Barthou},
  title           = {{PARCOACH}: Combining static and dynamic validation of
                     {MPI} collective communications},
  journal         = j-IJHPCA,
  volume          = {28},
  number          = {4},
  pages           = {425--434},
  month           = nov,
  year            = {2014},
  CODEN           = {IHPCFL},
  DOI             = {https://doi.org/10.1177/1094342014552204},
  ISSN            = {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L          = {1094-3420},
  bibdate         = {Fri Feb 13 09:17:23 MST 2015},
  bibsource       = {http://hpc.sagepub.com/content/28/4.toc;
                     http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://hpc.sagepub.com/content/28/4/425},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of High Performance Computing
                     Applications},
  journal-URL     = {http://hpc.sagepub.com/content/by/year},
  onlinedate      = {September 26, 2014},
}

@Article{Samadi:2014:LGU,
  author          = {Mehrzad Samadi and Amir Hormati and Janghaeng Lee and
                     Scott Mahlke},
  title           = {Leveraging {GPUs} using cooperative loop speculation},
  journal         = j-TACO,
  volume          = {11},
  number          = {1},
  pages           = {3:1--3:??},
  month           = feb,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2579617},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Mar 14 17:30:52 MDT 2014},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Graphics processing units, or GPUs, provide TFLOPs of
                     additional performance potential in commodity computer
                     systems that frequently go unused by most applications.
                     Even with the emergence of languages such as CUDA and
                     OpenCL, programming GPUs remains a difficult challenge
                     for a variety of reasons, including the inherent
                     algorithmic characteristics and data structure choices
                     used by applications as well as the tedious performance
                     optimization cycle that is necessary to achieve high
                     performance. The goal of this work is to increase the
                     applicability of GPUs beyond CUDA/OpenCL to implicitly
                     data-parallel applications written in C/C++ using
                     speculative parallelization. To achieve this goal, we
                     propose Paragon: a static/dynamic compiler platform to
                     speculatively run possibly data-parallel portions of
                     sequential applications on the GPU while cooperating
                     with the system CPU. For such loops, Paragon utilizes
                     the GPU in an opportunistic way while orchestrating a
                     cooperative relation between the CPU and GPU to reduce
                     the overhead of miss-speculations. Paragon monitors the
                     dependencies for the loops running speculatively on the
                     GPU and nonspeculatively on the CPU using a lightweight
                     distributed conflict detection designed specifically
                     for GPUs, and transfers the execution to the CPU in
                     case a conflict is detected. Paragon resumes the
                     execution on the GPU after the CPU resolves the
                     dependency. Our experiments show that Paragon achieves
                     4x on average and up to 30x speedup compared to unsafe
                     CPU execution with four threads and 7x on average and
                     up to 64x speedup versus sequential execution across a
                     set of sequential but implicitly data-parallel
                     applications.},
  acknowledgement = ack-nhfb,
  articleno       = {3},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Samadi:2014:PPB,
  author          = {Mehrzad Samadi and Davoud Anoushe Jamshidi and
                     Janghaeng Lee and Scott Mahlke},
  title           = {{Paraprox}: pattern-based approximation for data
                     parallel applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {42},
  number          = {1},
  pages           = {35--50},
  month           = mar,
  year            = {2014},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2654822.2541948},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Mon Aug 18 17:12:47 MDT 2014},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Approximate computing is an approach where reduced
                     accuracy of results is traded off for increased speed,
                     throughput, or both. Loss of accuracy is not
                     permissible in all computing domains, but there are a
                     growing number of data-intensive domains where the
                     output of programs need not be perfectly correct to
                     provide useful results or even noticeable differences
                     to the end user. These soft domains include multimedia
                     processing, machine learning, and data mining/analysis.
                     An important challenge with approximate computing is
                     transparency to insulate both software and hardware
                     developers from the time, cost, and difficulty of using
                     approximation. This paper proposes a software-only
                     system, Paraprox, for realizing transparent
                     approximation of data-parallel programs that operates
                     on commodity hardware systems. Paraprox starts with a
                     data-parallel kernel implemented using OpenCL or CUDA
                     and creates a parameterized approximate kernel that is
                     tuned at runtime to maximize performance subject to a
                     target output quality (TOQ) that is supplied by the
                     user. Approximate kernels are created by recognizing
                     common computation idioms found in data-parallel
                     programs (e.g., Map, Scatter/Gather, Reduction, Scan,
                     Stencil, and Partition) and substituting approximate
                     implementations in their place. Across a set of 13 soft
                     data-parallel applications with at most 10\% quality
                     degradation, Paraprox yields an average performance
                     gain of 2.7x on a NVIDIA GTX 560 GPU and 2.5x on an
                     Intel Core i7 quad-core processor compared to accurate
                     execution on each platform.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J89},
  remark          = {ASPLOS '14 conference proceedings.},
}

%%% The third author's given name is spelled out so that it matches the
%%% form used for the same person in entry Samadi:2014:PPB.
@Article{Samadi:2014:SPS,
  author =       "Mehrzad Samadi and Janghaeng Lee and Davoud Anoushe
                 Jamshidi and Scott Mahlke and Amir Hormati",
  title =        "Scaling Performance via Self-Tuning Approximation for
                 Graphics Engines",
  journal =      j-TOCS,
  volume =       "32",
  number =       "3",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2014",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2631913",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 21 07:18:28 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Approximate computing, where computation accuracy is
                 traded off for better performance or higher data
                 throughput, is one solution that can help data
                 processing keep pace with the current and growing
                 abundance of information. For particular domains, such
                 as multimedia and learning algorithms, approximation is
                 commonly used today. We consider automation to be
                 essential to provide transparent approximation, and we
                 show that larger benefits can be achieved by
                 constructing the approximation techniques to fit the
                 underlying hardware. Our target platform is the GPU
                 because of its high performance capabilities and
                 difficult programming challenges that can be alleviated
                 with proper automation. Our approach --- SAGE ---
                 combines a static compiler that automatically generates
                 a set of CUDA kernels with varying levels of
                 approximation with a runtime system that iteratively
                 selects among the available kernels to achieve speedup
                 while adhering to a target output quality set by the
                 user. The SAGE compiler employs three optimization
                 techniques to generate approximate kernels that exploit
                 the GPU microarchitecture: selective discarding of
                 atomic operations, data packing, and thread fusion.
                 Across a set of machine learning and image processing
                 kernels, SAGE's approximation yields an average of 2.5$
                 \times $ speedup with less than 10\% quality loss
                 compared to the accurate execution on a NVIDIA GTX 560
                 GPU.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Sani:2014:PDF,
  author          = {Ardalan Amiri Sani and Kevin Boos and Shaopu Qin and
                     Lin Zhong},
  title           = {{I/O} paravirtualization at the device file boundary},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {42},
  number          = {1},
  pages           = {319--332},
  month           = mar,
  year            = {2014},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2654822.2541943},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Mon Aug 18 17:12:47 MDT 2014},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Paravirtualization is an important I/O virtualization
                     technology since it uniquely provides all of the
                     following benefits: the ability to share the device
                     between multiple VMs, support for legacy devices
                     without virtualization hardware, and high performance.
                     However, existing paravirtualization solutions have one
                     main limitation: they only support one I/O device
                     class, and would require significant engineering effort
                     to support new device classes and features. In this
                     paper, we present Paradice, a solution that vastly
                     simplifies I/O paravirtualization by using a common
                     paravirtualization boundary for various I/O device
                     classes: Unix device files. Using this boundary, the
                     paravirtual drivers simply act as a class-agnostic
                     indirection layer between the application and the
                     actual device driver. We address two fundamental
                     challenges: supporting cross-VM driver memory
                     operations without changes to applications or device
                     drivers and providing fault and device data isolation
                     between guest VMs despite device driver bugs. We
                     implement Paradice for x86, the Xen hypervisor, and the
                     Linux and FreeBSD OSes. Our implementation
                     paravirtualizes various GPUs, input devices, cameras,
                     an audio device, and an Ethernet card for the netmap
                     framework with $ \approx 7700 $ LoC, of which only $
                     \approx 900 $ are device class-specific. Our
                     measurements show that Paradice achieves performance
                     close to native for different devices and applications
                     including netmap, 3D HD games, and OpenCL
                     applications.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J89},
  remark          = {ASPLOS '14 conference proceedings.},
}

@Article{Song:2014:DAT,
  author          = {Sukhyun Song and Jeffrey K. Hollingsworth},
  title           = {Designing and auto-tuning parallel {$3$-D FFT} for
                     computation-communication overlap},
  journal         = j-SIGPLAN,
  volume          = {49},
  number          = {8},
  pages           = {181--192},
  month           = aug,
  year            = {2014},
  CODEN           = {SINODQ},
  DOI             = {https://doi.org/10.1145/2692916.2555249},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  bibdate         = {Wed Nov 26 16:26:30 MST 2014},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {This paper presents a method to design and auto-tune a
                     new parallel 3-D FFT code using the non-blocking MPI
                     all-to-all operation. We achieve high performance by
                     optimizing computation-communication overlap. Our code
                     performs fully asynchronous communication without any
                     support from special hardware. We also improve cache
                     performance through loop tiling. To cope with the
                     complex trade-off regarding our optimization
                     techniques, we parameterize our code and auto-tune the
                     parameters efficiently in a large parameter space.
                     Experimental results from two systems confirm that our
                     code achieves a speedup of up to 1.76x over the FFTW
                     library.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {PPoPP '14 conference proceedings.},
}

@Article{Steinberger:2014:WTB,
  author          = {Markus Steinberger and Michael Kenzel and Pedro
                     Boechat and Bernhard Kerbl and Mark Dokter and Dieter
                     Schmalstieg},
  title           = {{Whippletree}: task-based scheduling of dynamic
                     workloads on the {GPU}},
  journal         = j-TOG,
  volume          = {33},
  number          = {6},
  pages           = {228:1--228:??},
  month           = nov,
  year            = {2014},
  CODEN           = {ATGRDF},
  DOI             = {https://doi.org/10.1145/2661229.2661250},
  ISSN            = {0730-0301 (print), 1557-7368 (electronic)},
  ISSN-L          = {0730-0301},
  bibdate         = {Fri Nov 14 19:16:26 MST 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tog/;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/tog.bib},
  abstract        = {In this paper, we present Whippletree, a novel
                     approach to scheduling dynamic, irregular workloads on
                     the GPU. We introduce a new programming model which
                     offers the simplicity and expressiveness of task-based
                     parallelism while retaining all aspects of the
                     multi-level execution hierarchy essential to unlocking
                     the full potential of a modern GPU. At the same time,
                     our programming model lends itself to efficient
                     implementation on the SIMD-based architecture typical
                     of a current GPU. We demonstrate the practical utility
                     of our model by providing a reference implementation on
                     top of current CUDA hardware. Furthermore, we show that
                     our model compares favorably to traditional approaches
                     in terms of both performance as well as the range of
                     applications that can be covered. We demonstrate the
                     benefits of our model for recursive Reyes rendering,
                     procedural geometry generation and volume rendering
                     with concurrent irradiance caching.},
  acknowledgement = ack-nhfb,
  articleno       = {228},
  fjournal        = {ACM Transactions on Graphics},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J778},
}

@Article{Steuwer:2014:SHL,
  author          = {Michel Steuwer and Sergei Gorlatch},
  title           = {{SkelCL}: a high-level extension of {OpenCL} for
                     {multi-GPU} systems},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {69},
  number          = {1},
  pages           = {25--33},
  month           = jul,
  year            = {2014},
  CODEN           = {JOSUED},
  DOI             = {https://doi.org/10.1007/s11227-014-1213-y},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  bibdate         = {Wed Sep 10 06:45:05 MDT 2014},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=69&issue=1;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/article/10.1007/s11227-014-1213-y},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
}

%%% The month is recorded as a live field: the sibling entry
%%% Saillard:2014:PCS from the same issue, 28(4), carries month = nov.
@Article{Symeonidou:2014:DRB,
  author =       "Christi Symeonidou and Polyvios Pratikakis and
                 Dimitrios S. Nikolopoulos and Angelos Bilas",
  title =        "Distributed region-based memory allocation and
                 synchronization",
  journal =      j-IJHPCA,
  volume =       "28",
  number =       "4",
  pages =        "406--414",
  month =        nov,
  year =         "2014",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342014552863",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Feb 13 09:17:23 MST 2015",
  bibsource =    "http://hpc.sagepub.com/content/28/4.toc;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/28/4/406",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "https://journals.sagepub.com/home/hpc",
}

@Article{Teixido:2014:MBI,
  author          = {Ivan Teixid{\'o} and Francesc Seb{\'e} and Josep Conde
                     and Francesc Solsona},
  title           = {{MPI}-based implementation of an enhanced algorithm to
                     solve the {LPN} problem in a memory-constrained
                     environment},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {40},
  number          = {5--6},
  pages           = {100--112},
  month           = may,
  year            = {2014},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Fri May 30 18:33:51 MDT 2014},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0167819114000453},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191/},
}

@Article{Thompson:2014:CIC,
  author          = {Elizabeth A. Thompson and Timothy R. Anderson},
  title           = {A {CUDA} implementation of the {Continuous Space
                     Language Model}},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {68},
  number          = {1},
  pages           = {65--86},
  month           = apr,
  year            = {2014},
  CODEN           = {JOSUED},
  DOI             = {https://doi.org/10.1007/s11227-013-1023-7},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  bibdate         = {Wed Sep 10 06:44:53 MDT 2014},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=68&issue=1;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/article/10.1007/s11227-013-1023-7},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
}

%%% The name "Kernel-based Virtual Machine" is brace-protected in the
%%% title so that sentence-casing bibliography styles keep its capitals.
@Article{Tien:2014:EOS,
  author =       "Tsan-Rong Tien and Yi-Ping You",
  title =        "Enabling {OpenCL} support for {GPGPU} in {Kernel-based
                 Virtual Machine}",
  journal =      j-SPE,
  volume =       "44",
  number =       "5",
  pages =        "483--510",
  month =        may,
  year =         "2014",
  CODEN =        "SPEXBL",
  DOI =          "https://doi.org/10.1002/spe.2166",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Wed Sep 10 05:57:32 MDT 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/spe.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Software --- Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
  onlinedate =   "22 Nov 2012",
}

@Article{Traff:2014:SPE,
  author          = {Jesper Larsson Tr{\"a}ff and Siegfried Benkner},
  title           = {Selected Papers from {EuroMPI 2012}},
  journal         = j-COMPUTING,
  volume          = {96},
  number          = {4},
  pages           = {259--261},
  month           = apr,
  year            = {2014},
  CODEN           = {CMPTA2},
  DOI             = {https://doi.org/10.1007/s00607-013-0335-z},
  ISSN            = {0010-485X (print), 1436-5057 (electronic)},
  ISSN-L          = {0010-485X},
  bibdate         = {Fri Jun 6 10:07:21 MDT 2014},
  bibsource       = {http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=96&issue=4;
                     http://www.math.utah.edu/pub/tex/bib/computing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/article/10.1007/s00607-013-0335-z},
  acknowledgement = ack-nhfb,
  fjournal        = {Computing},
  journal-URL     = {http://link.springer.com/journal/607},
}

@Article{Vikas:2014:MGA,
  author          = {Vikas and Nasser Giacaman and Oliver Sinnen},
  title           = {Multiprocessing with {GUI}-awareness using
                     {OpenMP}-like directives in {Java}},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {40},
  number          = {2},
  pages           = {69--89},
  month           = feb,
  year            = {2014},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Fri Feb 28 06:47:16 MST 2014},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/java2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0167819113001439},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191/},
}

@Article{Wang:2014:IPD,
  author =       "Zheng Wang and Georgios Tournavitis and Bj{\"o}rn
                 Franke and Michael F. P. O'Boyle",
  title =        "Integrating profile-driven parallelism detection and
                 machine-learning-based mapping",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579561",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compiler-based auto-parallelization is a much-studied
                 area but has yet to find widespread application. This
                 is largely due to the poor identification and
                 exploitation of application parallelism, resulting in
                 disappointing performance far below that which a
                 skilled expert programmer could achieve. We have
                 identified two weaknesses in traditional parallelizing
                 compilers and propose a novel, integrated approach
                 resulting in significant performance improvements of
                 the generated parallel code. Using profile-driven
                 parallelism detection, we overcome the limitations of
                 static analysis, enabling the identification of more
                 application parallelism, and only rely on the user for
                 final approval. We then replace the traditional
                 target-specific and inflexible mapping heuristics with
                 a machine-learning-based prediction mechanism,
                 resulting in better mapping decisions while automating
                 adaptation to different target architectures. We have
                 evaluated our parallelization strategy on the NAS and
                 SPEC CPU2000 benchmarks and two different multicore
                 platforms (dual quad-core Intel Xeon SMP and
                 dual-socket QS20 Cell blade). We demonstrate that our
                 approach not only yields significant improvements when
                 compared with state-of-the-art parallelizing compilers
                 but also comes close to and sometimes exceeds the
                 performance of manually parallelized codes. On average,
                 our methodology achieves 96\% of the performance of the
                 hand-tuned OpenMP NAS and SPEC parallel benchmarks on
                 the Intel Xeon platform and gains a significant speedup
                 for the IBM Cell platform, demonstrating the potential
                 of profile-guided and machine-learning-based
                 parallelization for complex multicore platforms.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wu:2014:MAG,
  author =       "Xing Wu and Frank Mueller and Scott Pakin",
  title =        "A methodology for automatic generation of executable
                 communication specifications from parallel {MPI}
                 applications",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2660249",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Portable parallel benchmarks are widely used for
                 performance evaluation of HPC systems. However, because
                 these are manually produced, they generally represent a
                 greatly simplified view of application behavior,
                 missing the subtle but important-to-performance nuances
                 that may exist in a complete application. This work
                 contributes novel methods to automatically generate
                 highly portable and customizable communication
                 benchmarks from HPC applications. We utilize
                 ScalaTrace, a lossless yet scalable
                 parallel-application tracing framework to collect
                 selected aspects of the run-time behavior of HPC
                 applications, including communication operations and
                 computation time, while abstracting away the details of
                 the computation proper. We subsequently generate
                 benchmarks with nearly identical run-time behavior to
                 the original applications. Results demonstrate that the
                 generated benchmarks are in fact able to preserve the
                 run-time behavior (including both the communication
                 pattern and the execution time) of the original
                 applications. Such automated benchmark generation is
                 without precedent and particularly valuable for
                 proprietary, export-controlled, or classified
                 application codes.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Wu:2014:OFB,
  author =       "Jing Wu and Joseph JaJa and Elias Balaras",
  title =        "An Optimized {FFT}-Based Direct {Poisson} Solver on
                 {CUDA GPUs}",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "25",
  number =       "3",
  pages =        "550--559",
  month =        mar,
  year =         "2014",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2013.53",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Mon Aug 25 07:12:16 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Yan:2014:OMB,
  author =       "Xin Yan and Xiaohua Shi and Lina Wang and Haiyan
                 Yang",
  title =        "An {OpenCL} micro-benchmark suite for {GPUs} and
                 {CPUs}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "69",
  number =       "2",
  pages =        "693--713",
  month =        aug,
  year =         "2014",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-014-1112-2",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Sep 10 06:45:09 MDT 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=69&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-014-1112-2",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Yang:2014:CNR,
  author =       "Yi Yang and Huiyang Zhou",
  title =        "{CUDA-NP}: realizing nested thread-level parallelism
                 in {GPGPU} applications",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "93--106",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555254",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Parallel programs consist of series of code sections
                 with different thread-level parallelism (TLP). As a
                 result, it is rather common that a thread in a parallel
                 program, such as a GPU kernel in CUDA programs, still
                 contains both sequential code and parallel loops. In
                 order to leverage such parallel loops, the latest
                 Nvidia Kepler architecture introduces dynamic
                 parallelism, which allows a GPU thread to start another
                 GPU kernel, thereby reducing the overhead of launching
                 kernels from a CPU. However, with dynamic parallelism,
                 a parent thread can only communicate with its child
                 threads through global memory and the overhead of
                 launching GPU kernels is non-trivial even within GPUs.
                 In this paper, we first study a set of GPGPU benchmarks
                 that contain parallel loops, and highlight that these
                 benchmarks do not have a very high loop count or high
                 degrees of TLP. Consequently, the benefits of
                 leveraging such parallel loops using dynamic
                 parallelism are too limited to offset its overhead. We
                 then present our proposed solution to exploit nested
                 parallelism in CUDA, referred to as CUDA-NP. With
                 CUDA-NP, we initially enable a high number of threads
                 when a GPU program starts, and use control flow to
                 activate different numbers of threads for different
                 code sections. We implemented our proposed CUDA-NP
                 framework using a directive-based compiler approach.
                 For a GPU kernel, an application developer only needs
                 to add OpenMP-like pragmas for parallelizable code
                 sections. Then, our CUDA-NP compiler automatically
                 generates the optimized GPU kernels. It supports both
                 the reduction and the scan primitives, explores
                 different ways to distribute parallel loop iterations
                 into threads, and efficiently manages on-chip resource.
                 Our experiments show that for a set of GPGPU
                 benchmarks, which have already been optimized and
                 contain nested parallelism, our proposed CUDA-NP
                 framework further improves the performance by up to
                 6.69 times and 2.18 times on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{Yang:2014:HPD,
  author =       "Luobin Yang and Steve C. Chiu and Wei-Keng Liao",
  title =        "High performance data clustering: a comparative
                 analysis of performance for {GPU}, {RASC}, {MPI}, and
                 {OpenMP} implementations",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "70",
  number =       "1",
  pages =        "284--300",
  month =        oct,
  year =         "2014",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-013-0906-y",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Fri Feb 13 12:32:14 MST 2015",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=70&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-013-0906-y",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Yang:2014:IMP,
  author =       "Xu Yang and Deyuan Guo and Hu He and Haijing Tang and
                 Yanjun Zhang",
  title =        "An Implementation of {Message-Passing Interface} over
                 {VxWorks} for Real-Time Embedded Multi-Core Systems",
  journal =      j-COMP-J,
  volume =       "57",
  number =       "11",
  pages =        "1756--1764",
  month =        nov,
  year =         "2014",
  CODEN =        "CMPJA6",
  DOI =          "https://doi.org/10.1093/comjnl/bxt152",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Mon Oct 27 08:54:43 MDT 2014",
  bibsource =    "http://comjnl.oxfordjournals.org/content/57/11.toc;
                 http://www.math.utah.edu/pub/tex/bib/compj2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://comjnl.oxfordjournals.org/content/57/11/1756",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
  onlinedate =   "January 3, 2014",
}

@Article{Yang:2014:PMI,
  author =       "Chaoran Yang and Wesley Bland and John Mellor-Crummey
                 and Pavan Balaji",
  title =        "Portable, {MPI}-interoperable {Coarray Fortran}",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "81--92",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555270",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "The past decade has seen the advent of a number of
                 parallel programming models such as Coarray Fortran
                 (CAF), Unified Parallel C, X10, and Chapel. Despite the
                 productivity gains promised by these models, most
                 parallel scientific applications still rely on MPI as
                 their data movement model. One reason for this trend is
                 that it is hard for users to incrementally adopt these
                 new programming models in existing MPI applications.
                 Because each model uses its own runtime system, they
                 duplicate resources and are potentially error-prone.
                 Such independent runtime systems were deemed necessary
                 because MPI was considered insufficient in the past to
                 play this role for these languages. The recently
                 released MPI-3, however, adds several new capabilities
                 that now provide all of the functionality needed to act
                 as a runtime, including a much more comprehensive
                 one-sided communication framework. In this paper, we
                 investigate how MPI-3 can form a runtime system for one
                 example programming model, CAF, with a broader goal of
                 enabling a single application to use both MPI and CAF
                 with the highest level of interoperability.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{Zheng:2014:IMS,
  author =       "Liang Zheng and Huai Zhang and Taras Gerya and Matthew
                 Knepley and David A. Yuen and Yaolin Shi",
  title =        "Implementation of a multigrid solver on a {GPU} for
                 {Stokes} equations with strongly variable viscosity
                 based on {Matlab} and {CUDA}",
  journal =      j-IJHPCA,
  volume =       "28",
  number =       "1",
  pages =        "50--60",
  month =        feb,
  year =         "2014",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342013478640",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Mar 14 15:39:59 MDT 2014",
  bibsource =    "http://hpc.sagepub.com/content/28/1.toc;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/matlab.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/28/1/50.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
  onlinedate =   "March 5, 2013",
}

@Article{Zounmevo:2014:ESC,
  author =       "Judicael A. Zounmevo and Dries Kimpe and Robert Ross
                 and Ahmad Afsahi",
  title =        "Extreme-scale computing services over {MPI}:
                 Experiences, observations and features proposal for
                 next-generation message passing interface",
  journal =      j-IJHPCA,
  volume =       "28",
  number =       "4",
  pages =        "435--449",
  month =        nov,
  year =         "2014",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342014548864",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Feb 13 09:17:23 MST 2015",
  bibsource =    "http://hpc.sagepub.com/content/28/4.toc;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/28/4/435",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
  onlinedate =   "September 10, 2014",
}

@Article{Zounmevo:2014:FRC,
  author =       "Judicael A. Zounmevo and Ahmad Afsahi",
  title =        "A fast and resource-conscious {MPI} message queue
                 mechanism for large-scale jobs",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "30",
  number =       "??",
  pages =        "265--290",
  month =        jan,
  year =         "2014",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Mon Dec 2 16:57:46 MST 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.sciencedirect.com/science/journal/0167739X",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167739X13001489",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Agosta:2015:OPP,
  author =       "Giovanni Agosta and Alessandro Barenghi and Alessandro
                 {Di Federico} and Gerardo Pelosi",
  title =        "{OpenCL} performance portability for general-purpose
                 computation on graphics processor units: an exploration
                 on cryptographic primitives",
  journal =      j-CCPE,
  volume =       "27",
  number =       "14",
  pages =        "3633--3660",
  day =          "25",
  month =        sep,
  year =         "2015",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3358",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Sep 28 09:32:54 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "29 Aug 2014",
}

@Article{Al-Mouhamed:2015:EAO,
  author =       "Mayez Al-Mouhamed and Ayaz ul Hassan Khan",
  title =        "Exploration of automatic optimisation for {CUDA}
                 programming",
  journal =      j-INT-J-PAR-EMER-DIST-SYS,
  volume =       "30",
  number =       "4",
  pages =        "309--324",
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1080/17445760.2014.953158",
  ISSN =         "1744-5760 (print), 1744-5779 (electronic)",
  ISSN-L =       "1744-5760",
  bibdate =      "Tue Sep 15 07:34:54 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/intjparemerdistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.tandfonline.com/toc/gpaa20/30/4",
  URL =          "http://www.tandfonline.com/doi/abs/10.1080/17445760.2014.953158",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel, Emergent and
                 Distributed Systems: IJPEDS",
  journal-URL =  "http://www.tandfonline.com/loi/gpaa20",
}

@Article{Amer:2015:MRC,
  author =       "Abdelhalim Amer and Huiwei Lu and Yanjie Wei and Pavan
                 Balaji and Satoshi Matsuoka",
  title =        "{MPI+Threads}: runtime contention and remedies",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "8",
  pages =        "239--248",
  month =        aug,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2858788.2688522",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:42 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Hybrid MPI+Threads programming has emerged as an
                 alternative model to the ``MPI everywhere'' model to
                 better handle the increasing core density in cluster
                 nodes. While the MPI standard allows multithreaded
                 concurrent communication, such flexibility comes with
                 the cost of maintaining thread safety within the MPI
                 implementation, typically implemented using critical
                 sections. In contrast to previous works that studied
                 the importance of critical-section granularity in MPI
                 implementations, in this paper we investigate the
                 implication of critical-section arbitration on
                 communication performance. We first analyze the MPI
                 runtime when multithreaded concurrent communication
                 takes place on hierarchical memory systems. Our results
                 indicate that the mutex-based approach that most MPI
                 implementations use today can incur performance
                 penalties due to unfair arbitration. We then present
                 methods to mitigate these penalties with a first-come,
                 first-served arbitration and a priority locking scheme
                 that favors threads doing useful work. Through
                 evaluations using several benchmarks and applications,
                 we demonstrate up to 5-fold improvement in
                 performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '15 conference proceedings.",
}

@Article{Balasubramanian:2015:EGL,
  author =       "Raghuraman Balasubramanian and Vinay Gangadhar and
                 Ziliang Guo and Chen-Han Ho and Cherin Joseph and
                 Jaikrishnan Menon and Mario Paulo Drumond and Robin
                 Paul and Sharath Prasad and Pradip Valathol and
                 Karthikeyan Sankaralingam",
  title =        "Enabling {GPGPU} Low-Level Hardware Explorations with
                 {MIAOW}: an Open-Source {RTL} Implementation of a
                 {GPGPU}",
  journal =      j-TACO,
  volume =       "12",
  number =       "2",
  pages =        "21:1--21:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2764908",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 7 09:46:00 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Graphic processing unit (GPU)-based general-purpose
                 computing is developing as a viable alternative to
                 CPU-based computing in many domains. Today's tools for
                 GPU analysis include simulators like GPGPU-Sim,
                 Multi2Sim, and Barra. While useful for modeling
                 first-order effects, these tools do not provide a
                 detailed view of GPU microarchitecture and physical
                 design. Further, as GPGPU research evolves, design
                 ideas and modifications demand detailed estimates of
                 impact on overall area and power. Fueled by this need,
                 we introduce MIAOW (Many-core Integrated Accelerator Of
                 Wisconsin), an open-source RTL implementation of the
                 AMD Southern Islands GPGPU ISA, capable of running
                 unmodified OpenCL-based applications. We present our
                 design motivated by our goals to create a realistic,
                 flexible, OpenCL-compatible GPGPU, capable of emulating
                 a full system. We first explore if MIAOW is realistic
                 and then use four case studies to show that MIAOW
                 enables the following: physical design perspective to
                 ``traditional'' microarchitecture, new types of
                 research exploration, and validation/calibration of
                 simulator-based characterization of hardware. The
                 findings and ideas are contributions in their own
                 right, in addition to MIAOW's utility as a tool for
                 others' research.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Betts:2015:DIV,
  author =       "Adam Betts and Nathan Chong and Alastair F. Donaldson
                 and Jeroen Ketema and Shaz Qadeer and Paul Thomson and
                 John Wickerson",
  title =        "The Design and Implementation of a Verification
                 Technique for {GPU} Kernels",
  journal =      j-TOPLAS,
  volume =       "37",
  number =       "3",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2015",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2743017",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Jun 19 05:36:55 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "We present a technique for the formal verification of
                 GPU kernels, addressing two classes of correctness
                 properties: data races and barrier divergence. Our
                 approach is founded on a novel formal operational
                 semantics for GPU kernels termed {\em synchronous,
                 delayed visibility (SDV)\/} semantics, which captures
                 the execution of a GPU kernel by multiple groups of
                 threads. The SDV semantics provides operational
                 definitions for barrier divergence and for both inter-
                 and intra-group data races. We build on the semantics
                 to develop a method for reducing the task of verifying
                 a massively parallel GPU kernel to that of verifying a
                 sequential program. This completely avoids the need to
                 reason about thread interleavings, and allows existing
                 techniques for sequential program verification to be
                 leveraged. We describe an efficient encoding of data
                 race detection and propose a method for automatically
                 inferring the loop invariants that are required for
                 verification. We have implemented these techniques as a
                 practical verification tool, GPUVerify, that can be
                 applied directly to OpenCL and CUDA source code. We
                 evaluate GPUVerify with respect to a set of 162 kernels
                 drawn from public and commercial sources. Our
                 evaluation demonstrates that GPUVerify is capable of
                 efficient, automatic verification of a large number of
                 real-world kernels.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Bukata:2015:SRC,
  author =       "Libor Bukata and Premysl Sucha and Zdenek
                 Hanz{\'a}lek",
  title =        "Solving the Resource Constrained Project Scheduling
                 Problem using the parallel Tabu Search designed for the
                 {CUDA} platform",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "77",
  number =       "??",
  pages =        "58--68",
  month =        mar,
  year =         "2015",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Mon Mar 2 12:05:20 MST 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731514002226",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315/",
}

@Article{Busa:2015:CCO,
  author =       "J{\'a}n {Busa, Jr.} and J{\'a}n Busa and Shura Hayryan
                 and Chin-Kun Hu and Ming-Chya Wu",
  title =        "{CAVE-CL}: an {OpenCL} version of the package for
                 detection and quantitative analysis of internal
                 cavities in a system of overlapping balls: Application
                 to proteins",
  journal =      j-COMP-PHYS-COMM,
  volume =       "190",
  number =       "??",
  pages =        "224--227",
  month =        may,
  year =         "2015",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Wed Mar 4 08:31:43 MST 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465514004378",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655/",
}

@Article{Carretero:2015:AMM,
  author =       "Jesus Carretero and Javier Garcia-Blas and David E.
                 Singh and Florin Isaila and Alexey Lastovetsky and
                 Thomas Fahringer and Radu Prodan and Peter Zangerl and
                 Christi Symeonidou and Afshin Fassihi and Horacio
                 P{\'e}rez-S{\'a}nchez",
  title =        "Acceleration of {MPI} mechanisms for sustainable {HPC}
                 applications",
  journal =      j-SUPERFRI,
  volume =       "2",
  number =       "2",
  pages =        "28--45",
  month =        "????",
  year =         "2015",
  CODEN =        "????",
  ISSN =         "2409-6008 (print), 2313-8734 (electronic)",
  bibdate =      "Sat Nov 11 07:15:27 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/superfri.bib",
  URL =          "http://superfri.org/superfri/article/view/35",
  acknowledgement = ack-nhfb,
  fjournal =     "Supercomputing Frontiers and Innovations",
  journal-URL =  "http://superfri.org/superfri/issue/archive",
}

@Article{Casanova:2015:SMA,
  author =       "Henri Casanova and Fr{\'e}d{\'e}ric Desprez and George
                 S. Markomanolis and Fr{\'e}d{\'e}ric Suter",
  title =        "Simulation of {MPI} applications with time-independent
                 traces",
  journal =      j-CCPE,
  volume =       "27",
  number =       "5",
  pages =        "1145--1168",
  day =          "10",
  month =        apr,
  year =         "2015",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3278",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Jul 25 19:54:07 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "11 Apr 2014",
}

@Article{Casanova:2015:TMS,
  author =       "Henri Casanova and Anshul Gupta and Fr{\'e}d{\'e}ric
                 Suter",
  title =        "Toward More Scalable Off-Line Simulations of {MPI}
                 Applications",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "25",
  number =       "3",
  pages =        "1541002",
  month =        sep,
  year =         "2015",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626415410029",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Tue May 29 09:05:25 MDT 2018",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Cercos-Pita:2015:ANF,
  author =       "J. L. Cercos-Pita",
  title =        "{AQUAgpusph}, a new free {$3$D} {SPH} solver
                 accelerated with {OpenCL}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "192",
  number =       "??",
  pages =        "295--312",
  month =        jul,
  year =         "2015",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Tue Apr 21 11:56:04 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465515000909",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655/",
}

@Article{Chabbi:2015:BEP,
  author =       "Milind Chabbi and Wim Lavrijsen and Wibe de Jong and
                 Koushik Sen and John Mellor-Crummey and Costin Iancu",
  title =        "Barrier elision for production parallel programs",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "8",
  pages =        "109--119",
  month =        aug,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2858788.2688502",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:42 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Large scientific code bases are often composed of
                 several layers of runtime libraries, implemented in
                 multiple programming languages. In such situation,
                 programmers often choose conservative synchronization
                 patterns leading to suboptimal performance. In this
                 paper, we present context-sensitive dynamic
                 optimizations that elide barriers redundant during the
                 program execution. In our technique, we perform data
                 race detection alongside the program to identify
                 redundant barriers in their calling contexts; after an
                 initial learning, we start eliding all future instances
                 of barriers occurring in the same calling context. We
                 present an automatic on-the-fly optimization and a
                 multi-pass guided optimization. We apply our techniques
                 to NWChem---a 6 million line computational chemistry
                 code written in C/C++/Fortran that uses several runtime
                 libraries such as Global Arrays, ComEx, DMAPP, and MPI.
                 Our technique elides a surprisingly high fraction of
                 barriers (as many as 63\%) in production runs. This
                 redundancy elimination translates to application
                 speedups as high as 14\% on 2048 cores. Our techniques
                 also provided valuable insight about the application
                 behavior, later used by NWChem developers. Overall, we
                 demonstrate the value of holistic context-sensitive
                 analyses that consider the domain science in
                 conjunction with the associated runtime software
                 stack.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '15 conference proceedings.",
}

@Article{Cho:2015:OAO,
  author =       "Myeongjin Cho and Youngsun Han and Minseong Kim and
                 Seon Wook Kim",
  title =        "{O2WebCL}: an automatic {OpenCL-to-WebCL} translator
                 for high performance web computing",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "71",
  number =       "6",
  pages =        "2050--2065",
  month =        jun,
  year =         "2015",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-014-1260-4",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Aug 8 12:23:11 MDT 2015",
  bibsource =    "http://link.springer.com/journal/11227/71/6;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-014-1260-4",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Couder-Castaneda:2015:PCM,
  author =       "C. Couder-Casta{\~n}eda and H. Barrios-Pi{\~n}a and I.
                 Gitler and M. Arroyo",
  title =        "Performance of a Code Migration for the Simulation of
                 Supersonic Ejector Flow to {SMP}, {MIC}, and {GPU}
                 Using {OpenMP}, {OpenMP+LEO}, and {OpenACC}
                 Directives",
  journal =      j-SCI-PROG,
  volume =       "2015",
  number =       "??",
  pages =        "739107:1--739107:20",
  month =        "????",
  year =         "2015",
  CODEN =        "SCIPEV",
  DOI =          "https://doi.org/10.1155/2015/739107",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Tue Sep 20 07:53:44 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib",
  URL =          "https://www.hindawi.com/journals/sp/2015/739107/",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "https://www.hindawi.com/journals/sp/",
  journalabr =   "Sci. Prog",
}

@Article{Ebrahimirad:2015:EAS,
  author =       "Vahid Ebrahimirad and Maziar Goudarzi and Aboozar
                 Rajabi",
  title =        "Energy-Aware Scheduling for Precedence-Constrained
                 Parallel Virtual Machines in Virtualized Data Centers",
  journal =      j-J-GRID-COMP,
  volume =       "13",
  number =       "2",
  pages =        "233--253",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1007/s10723-015-9327-x",
  ISSN =         "1570-7873 (print), 1572-9184 (electronic)",
  ISSN-L =       "1570-7873",
  bibdate =      "Sat Aug 8 12:08:29 MDT 2015",
  bibsource =    "http://link.springer.com/journal/10723/13/2;
                 http://www.math.utah.edu/pub/tex/bib/jgridcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "http://link.springer.com/article/10.1007/s10723-015-9327-x",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Grid Computing",
  journal-URL =  "http://link.springer.com/journal/10723",
}

@Article{Emani:2015:CDM,
  author =       "Murali Krishna Emani and Michael O'Boyle",
  title =        "Celebrating diversity: a mixture of experts approach
                 for runtime mapping in dynamic environments",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "6",
  pages =        "499--508",
  month =        jun,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2813885.2737999",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:41 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Matching program parallelism to platform parallelism
                 using thread selection is difficult when the
                 environment and available resources dynamically change.
                 Existing compiler or runtime approaches are typically
                 based on a one-size fits all policy. There is little
                 ability to either evaluate or adapt the policy when
                 encountering new external workloads or hardware
                 resources. This paper focuses on selecting the best
                 number of threads for a parallel application in dynamic
                 environments. It develops a new scheme based on a
                 mixture of experts approach. It learns online which, of
                 a number of existing policies, or experts, is best
                 suited for a particular environment without having to
                 try out each policy. It does this by using a novel
                 environment predictor as a proxy for the quality of an
                 expert thread selection policy. Additional expert
                 policies can easily be added and are selected only when
                 appropriate. We evaluate our scheme in environments
                 with varying external workloads and hardware
                 resources. We then consider the case when workloads use
                 affinity scheduling or are themselves adaptive and show
                 that our approach, in all cases, outperforms existing
                 schemes and surprisingly improves workload performance.
                 On average, we improve 1.66x over OpenMP default, 1.34x
                 over an online scheme, 1.25x over an offline policy and
                 1.2x over a state-of-art analytic model. Determining
                 the right number and type of experts is an open problem
                 and our initial analysis shows that adding more experts
                 improves accuracy and performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '15 conference proceedings.",
}

@Article{Fabeiro:2015:AGO,
  author =       "Jorge F. Fabeiro and Diego Andrade and Basilio B.
                 Fraguela and Ram{\'o}n Doallo",
  title =        "Automatic Generation of Optimized {OpenCL} Codes Using
                 {OCLoptimizer}",
  journal =      j-COMP-J,
  volume =       "58",
  number =       "11",
  pages =        "3057--3073",
  month =        nov,
  year =         "2015",
  CODEN =        "CMPJA6",
  DOI =          "https://doi.org/10.1093/comjnl/bxv038",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Tue Nov 17 08:06:33 MST 2015",
  bibsource =    "http://comjnl.oxfordjournals.org/content/58/11.toc;
                 http://www.math.utah.edu/pub/tex/bib/compj2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
  onlinedate =   "June 2, 2015",
}

@Article{Fang:2015:EVD,
  author =       "Jianbin Fang and Ana Lucia Varbanescu and Xiangke Liao
                 and Henk Sips",
  title =        "Evaluating vector data type usage in {OpenCL}
                 kernels",
  journal =      j-CCPE,
  volume =       "27",
  number =       "17",
  pages =        "4586--4602",
  day =          "10",
  month =        dec,
  year =         "2015",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3424",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Feb 9 06:13:20 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "23 Oct 2014",
}

@Article{Ferretti:2015:MCH,
  author =       "Marco Ferretti and Mirto Musci and Luigi Santangelo",
  title =        "{MPI--CMS}: a hybrid parallel approach to geometrical
                 motif search in proteins",
  journal =      j-CCPE,
  volume =       "27",
  number =       "18",
  pages =        "5500--5516",
  day =          "25",
  month =        dec,
  year =         "2015",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3588",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Feb 9 06:13:20 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "12 Aug 2015",
}

@Article{Filipovic:2015:OCC,
  author =       "Ji{\v{r}}{\'\i} Filipovi{\v{c}} and Mat{\'u}{\v{s}} Madzin
                 and Jan Fousek and Lud{\v{e}}k Matyska",
  title =        "Optimizing {CUDA} code by kernel fusion: application
                 on {BLAS}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "71",
  number =       "10",
  pages =        "3934--3957",
  month =        oct,
  year =         "2015",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-015-1483-z",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Tue Sep 29 10:07:24 MDT 2015",
  bibsource =    "http://link.springer.com/journal/11227/71/10;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-015-1483-z",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Galizia:2015:MCL,
  author =       "Antonella Galizia and Daniele D'Agostino and Andrea
                 Clematis",
  title =        "An {MPI--CUDA} library for image processing on {HPC}
                 architectures",
  journal =      j-J-COMPUT-APPL-MATH,
  volume =       "273",
  number =       "??",
  pages =        "414--427",
  day =          "1",
  month =        jan,
  year =         "2015",
  CODEN =        "JCAMDI",
  ISSN =         "0377-0427 (print), 1879-1778 (electronic)",
  ISSN-L =       "0377-0427",
  bibdate =      "Sat Feb 25 13:34:46 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcomputapplmath2015.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0377042714002374",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational and Applied Mathematics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03770427",
}

@Article{Garain:2015:CCF,
  author =       "Sudip Garain and Dinshaw S. Balsara and John Reid",
  title =        "Comparing {Coarray Fortran (CAF)} with {MPI} for
                 several structured mesh {PDE} applications",
  journal =      j-J-COMPUT-PHYS,
  volume =       "297",
  number =       "??",
  pages =        "237--253",
  day =          "15",
  month =        sep,
  year =         "2015",
  CODEN =        "JCTPAH",
  ISSN =         "0021-9991 (print), 1090-2716 (electronic)",
  ISSN-L =       "0021-9991",
  bibdate =      "Sat Jul 25 09:25:55 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/jcomputphys2015.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S002199911500354X",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Physics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00219991/",
}

@Article{Gidra:2015:NGC,
  author =       "Lokesh Gidra and Ga{\"e}l Thomas and Julien Sopena and
                 Marc Shapiro and Nhan Nguyen",
  title =        "{NumaGiC}: a Garbage Collector for Big Data on Big
                 {NUMA} Machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "661--673",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694361",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "On contemporary cache-coherent Non-Uniform Memory
                 Access (ccNUMA) architectures, applications with a
                 large memory footprint suffer from the cost of the
                 garbage collector (GC), because, as the GC scans the
                 reference graph, it makes many remote memory accesses,
                 saturating the interconnect between memory nodes. We
                 address this problem with NumaGiC, a GC with a
                 mostly-distributed design. In order to maximise memory
                 access locality during collection, a GC thread avoids
                 accessing a different memory node, instead notifying a
                 remote GC thread with a message; nonetheless, NumaGiC
                 avoids the drawbacks of a pure distributed design,
                 which tends to decrease parallelism. We compare NumaGiC
                 with Parallel Scavenge and NAPS on two different ccNUMA
                 architectures running on the Hotspot Java Virtual
                 Machine of OpenJDK 7. On Spark and Neo4j, two
                 industry-strength analytics applications, with heap
                 sizes ranging from 160GB to 350GB, and on SPECjbb2013
                 and SPECjbb2005, NumaGiC improves overall performance by
                 up to 45\% over NAPS (up to 94\% over Parallel
                 Scavenge), and increases the performance of the
                 collector itself by up to 3.6x over NAPS (up to 5.4x
                 over Parallel Scavenge).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Havran:2015:EBT,
  author =       "Vlastimil Havran and Petr Egert",
  title =        "Extensions to bidirectional texture function
                 compression with multi-level vector quantization in
                 {OpenCL}",
  journal =      j-COMPUTERS-AND-GRAPHICS,
  volume =       "48",
  number =       "??",
  pages =        "1--10",
  month =        may,
  year =         "2015",
  CODEN =        "COGRD2",
  ISSN =         "0097-8493 (print), 1873-7684 (electronic)",
  ISSN-L =       "0097-8493",
  bibdate =      "Fri Apr 24 17:46:30 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compgraph.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0097849315000060",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers \& Graphics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00978493/",
}

@Article{Hoefler:2015:RMA,
  author =       "Torsten Hoefler and James Dinan and Rajeev Thakur and
                 Brian Barrett and Pavan Balaji and William Gropp and
                 Keith Underwood",
  title =        "Remote Memory Access Programming in {MPI-3}",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2780584",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The Message Passing Interface (MPI) 3.0 standard,
                 introduced in September 2012, includes a significant
                 update to the one-sided communication interface, also
                 known as remote memory access (RMA). In particular, the
                 interface has been extended to better support popular
                 one-sided and global-address-space parallel programming
                 models to provide better access to hardware performance
                 features and enable new data-access modes. We present
                 the new RMA interface and specify formal axiomatic
                 models for data consistency and access semantics. Such
                 models can help users reason about details of the
                 semantics that are hard to extract from the English
                 prose in the standard. It also fosters the development
                 of tools and compilers, enabling them to automatically
                 analyze, optimize, and debug RMA programs.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Jaaskelainen:2015:PPP,
  author =       "Pekka J{\"a}{\"a}skel{\"a}inen and Carlos {S{\'a}nchez
                 de La Lama} and Erik Schnetter and Kalle Raiskila and
                 Jarmo Takala and Heikki Berg",
  title =        "{pocl}: A Performance-Portable {OpenCL}
                 Implementation",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "43",
  number =       "5",
  pages =        "752--785",
  month =        oct,
  year =         "2015",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-014-0320-y",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Aug 8 12:34:17 MDT 2015",
  bibsource =    "http://link.springer.com/journal/10766/43/5;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s10766-014-0320-y",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Jaeger:2015:FGD,
  author =       "Julien Jaeger and Patrick Carribault and Marc
                 P{\'e}rache",
  title =        "Fine-grain data management directory for {OpenMP 4.0}
                 and {OpenACC}",
  journal =      j-CCPE,
  volume =       "27",
  number =       "6",
  pages =        "1528--1539",
  day =          "25",
  month =        apr,
  year =         "2015",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3352",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Jul 25 19:54:07 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "13 Aug 2014",
}

@Article{Jo:2015:ALM,
  author =       "Gangwon Jo and Jeongho Nah and Jun Lee and Jungwon Kim
                 and Jaejin Lee",
  title =        "Accelerating {LINPACK} with {MPI-OpenCL} on Clusters
                 of Multi-{GPU} Nodes",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "26",
  number =       "7",
  pages =        "1814--1825",
  month =        jul,
  year =         "2015",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2014.2321742",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Mon Aug 3 11:58:51 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.computer.org/csdl/trans/td/2015/07/06846313-abs.html",
  abstract-URL = "http://www.computer.org/csdl/trans/td/2015/07/06846313-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Kaliman:2015:SNU,
  author =       "Ilya A. Kaliman and Lyudmila V. Slipchenko",
  title =        "Software News and Updates: Hybrid {MPI\slash OpenMP}
                 parallelization of the effective fragment potential
                 method in the {{\tt libefp}} software library",
  journal =      j-J-COMPUT-CHEM,
  volume =       "36",
  number =       "2",
  pages =        "129--135",
  day =          "15",
  month =        jan,
  year =         "2015",
  CODEN =        "JCCHDD",
  DOI =          "https://doi.org/10.1002/jcc.23772",
  ISSN =         "0192-8651 (print), 1096-987X (electronic)",
  ISSN-L =       "0192-8651",
  bibdate =      "Fri Mar 6 15:50:38 MST 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcomputchem2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Chemistry",
  journal-URL =  "http://www.interscience.wiley.com/jpages/0192-8651",
  onlinedate =   "13 Nov 2014",
}

@Article{Karami:2015:SPA,
  author =       "Ali Karami and Farshad Khunjush and Seyyed Ali
                 Mirsoleimani",
  title =        "A statistical performance analyzer framework for
                 {OpenCL} kernels on {Nvidia GPUs}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "71",
  number =       "8",
  pages =        "2900--2921",
  month =        aug,
  year =         "2015",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-014-1338-z",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Aug 8 12:23:12 MDT 2015",
  bibsource =    "http://link.springer.com/journal/11227/71/8;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-014-1338-z",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Kim:2015:OBU,
  author =       "Jungwon Kim and Seyong Lee and Jeffrey S. Vetter",
  title =        "An {OpenACC}-based unified programming model for
                 multi-accelerator systems",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "8",
  pages =        "257--258",
  month =        aug,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2858788.2688531",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:42 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "This paper proposes a novel SPMD programming model of
                 OpenACC. Our model integrates the different
                 granularities of parallelism from vector-level
                 parallelism to node-level parallelism into a single,
                 unified model based on OpenACC. It allows programmers
                 to write programs for multiple accelerators using a
                 uniform programming model whether they are in shared or
                 distributed memory systems. We implement a prototype of
                 our model and evaluate its performance with a GPU-based
                 supercomputer using three benchmark applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '15 conference proceedings.",
}

@InProceedings{Klawonn:2015:HMO,
  author =       "Axel Klawonn and Martin Lanser and Oliver Rheinbach
                 and Holger Stengel and Gerhard Wellein",
  title =        "Hybrid {MPI\slash OpenMP} Parallelization in
                 {FETI--DP} Methods",
  crossref =     "Mehl:2015:RTC",
  volume =       "105",
  pages =        "67--84",
  year =         "2015",
  DOI =          "https://doi.org/10.1007/978-3-319-22997-3_4",
  bibdate =      "Sat Dec 12 10:22:10 MST 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/chapter/10.1007/978-3-319-22997-3_4/",
  acknowledgement = ack-nhfb,
  book-DOI =     "https://doi.org/10.1007/978-3-319-22997-3",
  book-URL =     "http://www.springerlink.com/content/978-3-319-22997-3",
}

@article{Komura:2015:OPS,
  author =       {Yukihiro Komura},
  title =        {{OpenACC} programs of the {Swendsen--Wang}
                 multi-cluster spin flip algorithm},
  journal =      j-COMP-PHYS-COMM,
  volume =       {197},
  number =       {??},
  pages =        {298--303},
  month =        dec,
  year =         {2015},
  coden =        {CPHCBZ},
  issn =         {0010-4655 (print), 1879-2944 (electronic)},
  issn-l =       {0010-4655},
  bibdate =      {Wed Nov 11 06:05:22 MST 2015},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://www.sciencedirect.com/science/article/pii/S0010465515003197},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-url =  {http://www.sciencedirect.com/science/journal/00104655/},
}

@Article{Kouzinopoulos:2015:MSM,
  author =       "Charalampos S. Kouzinopoulos and Panagiotis D.
                 Michailidis and Konstantinos G. Margaritis",
  title =        "Multiple String Matching on a {GPU} using {CUDA}",
  journal =      j-SCPE,
  volume =       "16",
  number =       "2",
  pages =        "121--138",
  month =        "????",
  year =         "2015",
  CODEN =        "????",
  ISSN =         "1895-1767",
  ISSN-L =       "1895-1767",
  bibdate =      "Mon Jan 7 06:46:46 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/scpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/string-matching.bib",
  URL =          "https://www.scpe.org/index.php/scpe/article/view/1085",
  acknowledgement = ack-nhfb,
  fjournal =     "Scalable Computing: Practice and Experience",
  journal-URL =  "http://www.scpe.org/",
}

@article{Kovanen:2015:TAC,
  author =       {Janne Kovanen and Tapani Sarjakoski},
  title =        {Tilewise Accumulated Cost Surface Computation with
                 Graphics Processing Units},
  journal =      j-TSAS,
  volume =       {1},
  number =       {2},
  pages =        {8:1--8:27},
  month =        nov,
  year =         {2015},
  coden =        {????},
  doi =          {https://doi.org/10.1145/2803172},
  issn =         {2374-0353 (print), 2374-0361 (electronic)},
  issn-l =       {2374-0353},
  bibdate =      {Thu Jun 15 14:51:01 MDT 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tsas.bib},
  url =          {http://dl.acm.org/citation.cfm?id=2803172},
  abstract =     {Accumulated cost surfaces are used in a variety of
                 fields that employ spatial analysis. Several algorithms
                 have been suggested in the past for solving them
                 efficiently or with minimal errors. Meanwhile, a new
                 wave on the technological frontier has brought about
                 general-purpose computing on GPUs. In this article, we
                 describe how accumulated cost surfaces can be solved
                 with CUDA. To verify the performance of our solution,
                 we performed an experimental comparison against
                 implementations run on a CPU. Our results with
                 realistic cost models indicate that the move to GPUs
                 can engender a speed-up of an order of magnitude.},
  acknowledgement = ack-nhfb,
  articleno =    {8},
  fjournal =     {ACM Transactions on Spatial Algorithms and Systems
                 (TSAS)},
  journal-url =  {http://dl.acm.org/pub.cfm?id=J1514},
}

@Article{Kramer:2015:SET,
  author =       "Stephan C. Kramer and Johannes Hagemann",
  title =        "{SciPAL}: Expression Templates and Composition Closure
                 Objects for High Performance Computational Physics with
                 {CUDA} and {OpenMP}",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "15:1--15:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686886",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present SciPAL (scientific parallel algorithms
                 library), a C++-based, hardware-independent
                 open-source library. Its core is a domain-specific
                 embedded language for numerical linear algebra. The
                 main fields of application are finite element
                 simulations, coherent optics and the solution of
                 inverse problems. Using SciPAL, algorithms can be stated
                 in a mathematically intuitive way in terms of matrix
                 and vector operations. Existing algorithms can easily
                 be adapted to GPU-based computing by proper template
                 specialization. Our library is compatible with the
                 finite element library deal.II and provides a port of
                 deal.II's most frequently used linear algebra classes
                 to CUDA (NVidia's extension of the programming
                 languages C and C++ for programming their GPUs).
                 SciPAL's operator-based API for BLAS operations
                 particularly aims at simplifying the usage of NVidia's
                 CUBLAS. For non-BLAS array arithmetic SciPAL's
                 expression templates are able to generate CUDA kernels
                 at compile time. We demonstrate the benefits of SciPAL
                 using the iterative principal component analysis as
                 example which is the core algorithm for the
                 spike-sorting problem in neuroscience.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Laguna:2015:DPF,
  author =       "Ignacio Laguna and Dong H. Ahn and Bronis R. de
                 Supinski and Saurabh Bagchi and Todd Gamblin",
  title =        "Diagnosis of Performance Faults in {LargeScale} {MPI}
                 Applications via Probabilistic Progress-Dependence
                 Inference",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "26",
  number =       "5",
  pages =        "1280--1289",
  month =        may,
  year =         "2015",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2014.2314100",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Jun 4 19:34:11 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://csdl.computer.org/csdl/trans/td/2015/05/06803050-abs.html",
  abstract-URL = "http://csdl.computer.org/csdl/trans/td/2015/05/06803050-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@article{Lashgar:2015:CSR,
  author =       {Ahmad Lashgar and Ebad Salehi and Amirali Baniasadi},
  title =        {A Case Study in Reverse Engineering {GPGPUs}:
                 Outstanding Memory Handling Resources},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {43},
  number =       {4},
  pages =        {15--21},
  month =        sep,
  year =         {2015},
  coden =        {CANED2},
  doi =          {https://doi.org/10.1145/2927964.2927968},
  issn =         {0163-5964 (print), 1943-5851 (electronic)},
  issn-l =       {0163-5964},
  bibdate =      {Fri Apr 22 17:03:53 MDT 2016},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {During recent years, GPU micro-architectures have
                 changed dramatically, evolving into powerful many-core
                 deep-multithreaded platforms for parallel workloads.
                 While important micro-architectural modifications
                 continue to appear in every new generation of these
                 processors, unfortunately, little is known about the
                 details of these innovative designs. One of the key
                 questions in understanding GPUs is how they deal with
                 outstanding memory misses. Our goal in this study is to
                 find answers to this question. To this end, we develop
                 a set of micro-benchmarks in CUDA to understand the
                 outstanding memory requests handling resources.
                 Particularly, we study two NVIDIA GPGPUs (Fermi and
                 Kepler) and estimate their capability in handling
                 outstanding memory requests. We show that Kepler can
                 issue nearly 32X higher number of outstanding memory
                 requests, compared to Fermi. We explain this
                 enhancement by Kepler's architectural modifications in
                 outstanding memory request handling resources.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J89},
  remark =       {HEART '15 conference proceedings.},
}

@Article{Lee:2015:GCE,
  author =       "Jaekyu Lee and Dong Hyuk Woo and Hyesoon Kim and Mani
                 Azimi",
  title =        "{GREEN} Cache: Exploiting the Disciplined Memory Model
                 of {OpenCL} on {GPUs}",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "64",
  number =       "11",
  pages =        "3167--3180",
  month =        nov,
  year =         "2015",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2015.2395435",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Tue Oct 13 06:51:52 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@article{Lee:2015:OPE,
  author =       {Joo Hwan Lee and Nimit Nigania and Hyesoon Kim and
                 Kaushik Patel and Hyojong Kim},
  title =        {{OpenCL} Performance Evaluation on Modern Multicore
                 {CPUs}},
  journal =      j-SCI-PROG,
  volume =       {2015},
  number =       {??},
  pages =        {859491:1--859491:20},
  month =        {????},
  year =         {2015},
  coden =        {SCIPEV},
  doi =          {https://doi.org/10.1155/2015/859491},
  issn =         {1058-9244 (print), 1875-919X (electronic)},
  issn-l =       {1058-9244},
  bibdate =      {Tue Sep 20 07:53:44 MDT 2016},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  url =          {https://www.hindawi.com/journals/sp/2015/859491/},
  acknowledgement = ack-nhfb,
  fjournal =     {Scientific Programming},
  journal-url =  {https://www.hindawi.com/journals/sp/},
  journalabr =   {Sci. Prog},
}

@article{Li:2015:AMR,
  author =       {Jiansen Li and Jianqi Sun and Ying Song and Jun Zhao},
  title =        {Accelerating {MRI} reconstruction via
                 three-dimensional dual-dictionary learning using
                 {CUDA}},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {71},
  number =       {7},
  pages =        {2381--2396},
  month =        jul,
  year =         {2015},
  coden =        {JOSUED},
  doi =          {https://doi.org/10.1007/s11227-015-1386-z},
  issn =         {0920-8542 (print), 1573-0484 (electronic)},
  issn-l =       {0920-8542},
  bibdate =      {Sat Aug 8 12:23:11 MDT 2015},
  bibsource =    {http://link.springer.com/journal/11227/71/7;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://link.springer.com/article/10.1007/s11227-015-1386-z},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-url =  {http://link.springer.com/journal/11227},
}

@article{Lidbury:2015:MCC,
  author =       {Christopher Lidbury and Andrei Lascu and Nathan Chong
                 and Alastair F. Donaldson},
  title =        {Many-core compiler fuzzing},
  journal =      j-SIGPLAN,
  volume =       {50},
  number =       {6},
  pages =        {65--76},
  month =        jun,
  year =         {2015},
  coden =        {SINODQ},
  doi =          {https://doi.org/10.1145/2813885.2737986},
  issn =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  issn-l =       {0362-1340},
  bibdate =      {Tue Feb 16 12:01:41 MST 2016},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {We address the compiler correctness problem for
                 many-core systems through novel applications of fuzz
                 testing to OpenCL compilers. Focusing on two methods
                 from prior work, random differential testing and
                 testing via equivalence modulo inputs (EMI), we present
                 several strategies for random generation of
                 deterministic, communicating OpenCL kernels, and an
                 injection mechanism that allows EMI testing to be
                 applied to kernels that otherwise exhibit little or no
                 dynamically-dead code. We use these methods to conduct
                 a large, controlled testing campaign with respect to 21
                 OpenCL (device, compiler) configurations, covering a
                 range of CPU, GPU, accelerator, FPGA and emulator
                 implementations. Our study provides independent
                 validation of claims in prior work related to the
                 effectiveness of random differential testing and EMI
                 testing, proposes novel methods for lifting these
                 techniques to the many-core setting and reveals a
                 significant number of OpenCL compiler bugs in
                 commercial implementations.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {PLDI '15 conference proceedings.},
}

@article{Lopez:2015:PBV,
  author =       {Hugo A. L{\'o}pez and Eduardo R. B. Marques and
                 Francisco Martins and Nicholas Ng and C{\'e}sar Santos
                 and Vasco Thudichum Vasconcelos and Nobuko Yoshida},
  title =        {Protocol-based verification of message-passing
                 parallel programs},
  journal =      j-SIGPLAN,
  volume =       {50},
  number =       {10},
  pages =        {280--298},
  month =        oct,
  year =         {2015},
  coden =        {SINODQ},
  doi =          {https://doi.org/10.1145/2858965.2814302},
  issn =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  issn-l =       {0362-1340},
  bibdate =      {Tue Feb 16 12:01:43 MST 2016},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {We present ParTypes, a type-based methodology for the
                 verification of Message Passing Interface (MPI)
                 programs written in the C programming language. The aim
                 is to statically verify programs against protocol
                 specifications, enforcing properties such as fidelity
                 and absence of deadlocks. We develop a protocol
                 language based on a dependent type system for
                 message-passing parallel programs, which includes
                 various communication operators, such as point-to-point
                 messages, broadcast, reduce, array scatter and gather.
                 For the verification of a program against a given
                 protocol, the protocol is first translated into a
                 representation read by VCC, a software verifier for C.
                 We successfully verified several MPI programs in a
                 running time that is independent of the number of
                 processes or other input parameters. This contrasts
                 with alternative techniques, notably model checking and
                 runtime verification, that suffer from the
                 state-explosion problem or that otherwise depend on
                 parameters to the program itself. We experimentally
                 evaluated our approach against state-of-the-art tools
                 for MPI to conclude that our approach offers a scalable
                 solution.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {OOPSLA '15 conference proceedings.},
}

@Article{Lorentz:2015:AMS,
  author =       "Istvan Lorentz and Razvan Andonie and Levente
                 Fabry-Asztalos",
  title =        "Accelerating Molecular Structure Determination Based
                 on Inter-Atomic Distances Using {OpenCL}",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "26",
  number =       "12",
  pages =        "3250--3263",
  month =        dec,
  year =         "2015",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2014.2385712",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Tue Nov 17 06:28:07 MST 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://csdl.computer.org/csdl/trans/td/2015/12/06995963-abs.html",
  abstract-URL = "http://csdl.computer.org/csdl/trans/td/2015/12/06995963-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@article{Lotfi:2015:AAC,
  author =       {Atieh Lotfi and Abbas Rahimi and Luca Benini and
                 Rajesh K. Gupta},
  title =        {Aging-Aware Compilation for {GP-GPUs}},
  journal =      j-TACO,
  volume =       {12},
  number =       {2},
  pages =        {24:1--24:??},
  month =        jul,
  year =         {2015},
  coden =        {????},
  doi =          {https://doi.org/10.1145/2778984},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Fri Aug 7 09:46:00 MDT 2015},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {General-purpose graphic processing units (GP-GPUs)
                 offer high computational throughput using thousands of
                 integrated processing elements (PEs). These PEs are
                 stressed during workload execution, and negative bias
                 temperature instability (NBTI) adversely affects their
                 reliability by introducing new delay-induced faults.
                 However, the effect of these delay variations is not
                 uniformly spread across the PEs: some are affected more
                 --- hence less reliable --- than others. This variation
                 causes significant reduction in the lifetime of GP-GPU
                 parts. In this article, we address the problem of
                 ``wear leveling'' across processing units to mitigate
                 lifetime uncertainty in GP-GPUs. We propose innovations
                 in the static compiled code that can improve healing in
                 PEs and stream cores (SCs) based on their degradation
                 status. PE healing is a fine-grained very long
                 instruction word (VLIW) slot assignment scheme that
                 balances the stress of instructions across the PEs
                 within an SC. SC healing is a coarse-grained workload
                 allocation scheme that distributes workload across SCs
                 in GP-GPUs. Both schemes share a common property: they
                 adaptively shift workload from less reliable units to
                 more reliable units, either spatially or temporally.
                 These software schemes are based on online calibration
                 with NBTI monitoring that equalizes the expected
                 lifetime of PEs and SCs by regenerating adaptive
                 compiled codes to respond to the specific health state
                 of the GP-GPUs. We evaluate the effectiveness of the
                 proposed schemes for various OpenCL kernels from the
                 AMD APP SDK on Evergreen and Southern Island GPU
                 architectures. The aging-aware healthy kernels
                 generated by the PE (or SC) healing scheme reduce
                 NBTI-induced voltage threshold shift by 30\% (77\% in
                 the case of SCs), with no (moderate) performance
                 penalty compared to the naive kernels.},
  acknowledgement = ack-nhfb,
  articleno =    {24},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Markidis:2015:OAN,
  author =       {Stefano Markidis and Jing Gong and Michael Schliephake
                 and Erwin Laure and Alistair Hart and David Henty and
                 Katherine Heisey and Paul Fischer},
  title =        {{OpenACC} acceleration of the {Nek5000} spectral
                 element code},
  journal =      j-IJHPCA,
  volume =       {29},
  number =       {3},
  pages =        {311--319},
  month =        aug,
  year =         {2015},
  coden =        {IHPCFL},
  issn =         {1094-3420 (print), 1741-2846 (electronic)},
  issn-l =       {1094-3420},
  bibdate =      {Tue Apr 4 14:51:30 MDT 2017},
  bibsource =    {http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
  journal-url =  {http://hpc.sagepub.com/content/by/year},
}

@article{Martin:2015:EPM,
  author =       {Gonzalo Mart{\'\i}n and David E. Singh and
                 Maria-Cristina Marinescu and Jes{\'u}s Carretero},
  title =        {Enhancing the performance of malleable {MPI}
                 applications by using performance-aware dynamic
                 reconfiguration},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {46},
  number =       {??},
  pages =        {60--77},
  month =        jul,
  year =         {2015},
  coden =        {PACOEJ},
  issn =         {0167-8191 (print), 1872-7336 (electronic)},
  issn-l =       {0167-8191},
  bibdate =      {Wed Jun 17 11:37:27 MDT 2015},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://www.sciencedirect.com/science/article/pii/S0167819115000642},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-url =  {http://www.sciencedirect.com/science/journal/01678191/},
}

@article{Mehta:2015:MTP,
  author =       {Kshitij Mehta and Edgar Gabriel},
  title =        {Multi-Threaded Parallel {I/O} for {OpenMP}
                 Applications},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {43},
  number =       {2},
  pages =        {286--309},
  month =        apr,
  year =         {2015},
  coden =        {IJPPE5},
  doi =          {https://doi.org/10.1007/s10766-014-0306-9},
  issn =         {0885-7458 (print), 1573-7640 (electronic)},
  issn-l =       {0885-7458},
  bibdate =      {Sat Aug 8 12:34:16 MDT 2015},
  bibsource =    {http://link.springer.com/journal/10766/43/2;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url =          {http://link.springer.com/article/10.1007/s10766-014-0306-9},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-url =  {http://link.springer.com/journal/10766},
}

@Article{Momeni:2015:EEO,
  author =       "Amir Momeni and Hamed Tabkhi and Yash Ukidave and
                 Gunar Schirner and David Kaeli",
  title =        "Exploring the Efficiency of the {OpenCL} Pipe Semantic
                 on an {FPGA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "52--57",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927974",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper evaluates the potential benefits of
                 leveraging the OpenCL Pipe semantic to accelerate
                 FPGA-based applications. Our work focuses on streaming
                 applications in the embedded vision processing domain.
                 These applications are well-suited for concurrent
                 kernel execution support and inter-kernel communication
                 enabled by using OpenCL pipes. We analyze the impact of
                 multiple design factors and application optimizations
                 to improve the performance offered by OpenCL Pipes. The
                 design tradeoffs considered include: the execution
                 granularity across kernels, the rate and volume of data
                 transfers, and the Pipe size. For our case study
                 application of vision flow, we observe a 2.8X increase
                 in throughput for tuned pipelined kernels, as compared
                 to non-pipelined execution. In addition, we propose a
                 novel mechanism to efficiently capture the behavior for
                 2-dimensional (2D) vision algorithms to benefit
                 Pipe-based execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "HEART '15 conference proceedings.",
}

@article{Muddukrishna:2015:LAT,
  author =       {Ananya Muddukrishna and Peter A. Jonsson and Mats
                 Brorsson},
  title =        {Locality-Aware Task Scheduling and Data Distribution
                 for {OpenMP} Programs on {NUMA} Systems and Manycore
                 Processors},
  journal =      j-SCI-PROG,
  volume =       {2015},
  number =       {??},
  pages =        {981759:1--981759:16},
  month =        {????},
  year =         {2015},
  coden =        {SCIPEV},
  doi =          {https://doi.org/10.1155/2015/981759},
  issn =         {1058-9244 (print), 1875-919X (electronic)},
  issn-l =       {1058-9244},
  bibdate =      {Tue Sep 20 07:53:44 MDT 2016},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  url =          {https://www.hindawi.com/journals/sp/2015/981759/},
  acknowledgement = ack-nhfb,
  fjournal =     {Scientific Programming},
  journal-url =  {https://www.hindawi.com/journals/sp/},
  journalabr =   {Sci. Prog},
}

@article{Muralidharan:2015:COP,
  author =       {Saurav Muralidharan and Michael Garland and Bryan
                 Catanzaro and Albert Sidelnik and Mary Hall},
  title =        {A collection-oriented programming model for
                 performance portability},
  journal =      j-SIGPLAN,
  volume =       {50},
  number =       {8},
  pages =        {263--264},
  month =        aug,
  year =         {2015},
  coden =        {SINODQ},
  doi =          {https://doi.org/10.1145/2858788.2688537},
  issn =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  issn-l =       {0362-1340},
  bibdate =      {Tue Feb 16 12:01:42 MST 2016},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {This paper describes Surge, a collection-oriented
                 programming model that enables programmers to compose
                 parallel computations using nested high-level data
                 collections and operators. Surge exposes a code
                 generation interface, decoupled from the core
                 computation, that enables programmers and autotuners to
                 easily generate multiple implementations of the same
                 computation on various parallel architectures such as
                 multi-core CPUs and GPUs. By decoupling computations
                 from architecture-specific implementation, programmers
                 can target multiple architectures more easily, and
                 generate a search space that facilitates optimization
                 and customization for specific architectures. We
                 express in Surge four real-world benchmarks from
                 domains such as sparse linear-algebra and machine
                 learning and from the same performance-portable
                 specification, generate OpenMP and CUDA C++
                 implementations. Surge generates efficient, scalable
                 code which achieves up to 1.32x speedup over
                 handcrafted, well-optimized CUDA code.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {PPoPP '15 conference proceedings.},
}

@article{Obrecht:2015:PEO,
  author =       {Christian Obrecht and Bernard Tourancheau and
                 Fr{\'e}d{\'e}ric Kuznik},
  title =        {Performance Evaluation of an {OpenCL} Implementation
                 of the {Lattice Boltzmann Method} on the {Intel Xeon
                 Phi}},
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       {25},
  number =       {3},
  pages =        {1541001},
  month =        sep,
  year =         {2015},
  coden =        {PPLTEE},
  doi =          {https://doi.org/10.1142/S0129626415410017},
  issn =         {0129-6264 (print), 1793-642X (electronic)},
  issn-l =       {0129-6264},
  bibdate =      {Tue May 29 09:05:25 MDT 2018},
  bibsource =    {http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Processing Letters},
  journal-url =  {http://www.worldscientific.com/loi/ppl},
}

@article{Orr:2015:SUR,
  author = {Marc S. Orr and Shuai Che and Ayse Yilmazer and
    Bradford M. Beckmann and Mark D. Hill and David A.
    Wood},
  title = {Synchronization Using Remote-Scope Promotion},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J89},
  volume = {43},
  number = {1},
  pages = {73--86},
  month = mar,
  year = {2015},
  coden = {CANED2},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  doi = {https://doi.org/10.1145/2786763.2694350},
  abstract = {Heterogeneous system architecture (HSA) and OpenCL
    define scoped synchronization to facilitate low
    overhead communication across a subset of threads.
    Scoped synchronization works well for static sharing
    patterns, where consumer threads are known a priori. It
    works poorly for dynamic sharing patterns (e.g., work
    stealing) where programmers cannot use a faster small
    scope due to the rare possibility that the work is
    stolen by a thread in a distant slower scope. This puts
    programmers in a conundrum: optimize the common case by
    synchronizing at a faster small scope or use work
    stealing at a slower large scope. In this paper, we
    propose to extend scoped synchronization with
    remote-scope promotion. This allows the most frequent
    sharers to synchronize through a small scope.
    Infrequent sharers synchronize by promoting that remote
    small scope to a larger shared scope. Synchronization
    using remote-scope promotion provides performance
    robustness for dynamic workloads, where the benefits
    provided by scoped synchronization and work stealing
    are hard to anticipate. Compared to a na{\"\i}ve
    baseline, static scoped synchronization alone achieves
    a 1.07x speedup on average and dynamic work stealing
    alone achieves a 1.18x speedup on average. In contrast,
    synchronization using remote-scope promotion achieves a
    robust 1.25x speedup on average, across a diverse set
    of graph benchmarks and inputs.},
  remark = {ASPLOS'15 conference proceedings.},
  bibdate = {Wed Jun 3 11:27:38 MDT 2015},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Owaida:2015:EDS,
  author = {Muhsen Owaida and Gabriel Falcao and Joao Andrade and
    Christos Antonopoulos and Nikolaos Bellas and Madhura
    Purnaprajna and David Novo and Georgios Karakonstantis
    and Andreas Burg and Paolo Ienne},
  title = {Enhancing Design Space Exploration by Extending
    {CPU\slash GPU} Specifications onto {FPGAs}},
  journal = j-TECS,
  fjournal = {ACM Transactions on Embedded Computing Systems},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J840},
  volume = {14},
  number = {2},
  pages = {33:1--33:??},
  articleno = {33},
  month = mar,
  year = {2015},
  coden = {????},
  issn = {1539-9087 (print), 1558-3465 (electronic)},
  issn-l = {1539-9087},
  doi = {https://doi.org/10.1145/2656207},
  abstract = {The design cycle for complex special-purpose computing
    systems is extremely costly and time-consuming. It
    involves a multiparametric design space exploration for
    optimization, followed by design verification.
    Designers of special purpose VLSI implementations often
    need to explore parameters, such as optimal bitwidth
    and data representation, through time-consuming Monte
    Carlo simulations. A prominent example of this
    simulation-based exploration process is the design of
    decoders for error correcting systems, such as the
    Low-Density Parity-Check (LDPC) codes adopted by modern
    communication standards, which involves thousands of
    Monte Carlo runs for each design point. Currently,
    high-performance computing offers a wide set of
    acceleration options that range from multicore CPUs to
    Graphics Processing Units (GPUs) and Field Programmable
    Gate Arrays (FPGAs). The exploitation of diverse target
    architectures is typically associated with developing
    multiple code versions, often using distinct
    programming paradigms. In this context, we evaluate the
    concept of retargeting a single OpenCL program to
    multiple platforms, thereby significantly reducing
    design time. A single OpenCL-based parallel kernel is
    used without modifications or code tuning on multicore
    CPUs, GPUs, and FPGAs. We use SOpenCL (Silicon to
    OpenCL), a tool that automatically converts OpenCL
    kernels to RTL in order to introduce FPGAs as a
    potential platform to efficiently execute simulations
    coded in OpenCL. We use LDPC decoding simulations as a
    case study. Experimental results were obtained by
    testing a variety of regular and irregular LDPC codes
    that range from short/medium (e.g., 8,000 bit) to long
    length (e.g., 64,800 bit) DVB-S2 codes. We observe
    that, depending on the design parameters to be
    simulated, on the dimension and phase of the design,
    the GPU or FPGA may suit different purposes more
    conveniently, thus providing different acceleration
    factors over conventional multicore CPUs.},
  bibdate = {Thu Mar 26 05:58:56 MDT 2015},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/tecs.bib},
  acknowledgement = ack-nhfb,
}

@Article{Power:2015:GGH,
  author =       "Jason Power and Joel Hestness and Marc S. Orr and Mark
                 D. Hill and David A. Wood",
  title =        "{gem5-gpu}: A Heterogeneous {CPU--GPU} Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "34--36",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2299539",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "gem5-gpu is a new simulator that models tightly
                 integrated CPU-GPU systems. It builds on gem5, a
                 modular full-system CPU simulator, and GPGPU-Sim, a
                 detailed GPGPU simulator. gem5-gpu routes most memory
                 accesses through Ruby, which is a highly configurable
                 memory system in gem5. By doing this, it is able to
                 simulate many system configurations, ranging from a
                 system with coherent caches and a single virtual
                 address space across the CPU and GPU to a system that
                 maintains separate GPU and CPU physical address spaces.
                 gem5-gpu can run most unmodified CUDA 3.2 source code.
                 Applications can launch non-blocking kernels, allowing
                 the CPU and GPU to execute simultaneously. We present
                 gem5-gpu's software architecture and a brief
                 performance validation. We also discuss possible
                 extensions to the simulator. gem5-gpu is open source
                 and available at gem5-gpu.cs.wisc.edu.",
  acknowledgement = ack-nhfb,
  affiliation =  "Power, J (Reprint Author), Univ Wisconsin, Dept Comp
                 Sci, 1210 W Dayton St, Madison, WI 53706 USA. Power,
                 Jason; Hestness, Joel; Orr, Marc S.; Hill, Mark D.;
                 Wood, David A., Univ Wisconsin, Dept Comp Sci, Madison,
                 WI 53706 USA.",
  author-email = "powerjg@cs.wisc.edu hestness@cs.wisc.edu
                 morr@cs.wisc.edu markhill@cs.wisc.edu
                 david@cs.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "general-purpose graphics processors; heterogeneous
                 (hybrid) systems; Modeling techniques; simulators",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "62",
  unique-id =    "Power:2015:GGH",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@article{Reano:2015:IUE,
  author = {Carlos Rea{\~n}o and Federico Silla and Adri{\'a}n
    Castell{\'o} and Antonio J. Pe{\~n}a and Rafael Mayo
    and Enrique S. Quintana-Ort{\'\i} and Jos{\'e} Duato},
  title = {Improving the user experience of the {rCUDA} remote
    {GPU} virtualization framework},
  journal = j-CCPE,
  fjournal = {Concurrency and Computation: Practice and Experience},
  journal-url = {http://www.interscience.wiley.com/jpages/1532-0626},
  volume = {27},
  number = {14},
  pages = {3746--3770},
  day = {25},
  month = sep,
  year = {2015},
  onlinedate = {10 Oct 2014},
  coden = {CCPEBO},
  issn = {1532-0626 (print), 1532-0634 (electronic)},
  issn-l = {1532-0626},
  doi = {https://doi.org/10.1002/cpe.3409},
  bibdate = {Mon Sep 28 09:32:54 MDT 2015},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Rico-Gallego:2015:ILM,
  author = {Juan-Antonio Rico-Gallego and Juan-Carlos
    D{\'\i}az-Mart{\'\i}n},
  title = {{$ \tau $-Lop}: Modeling performance of shared memory
    {MPI}},
  journal = j-PARALLEL-COMPUTING,
  fjournal = {Parallel Computing},
  journal-url = {http://www.sciencedirect.com/science/journal/01678191/},
  volume = {46},
  number = {??},
  pages = {14--31},
  month = jul,
  year = {2015},
  coden = {PACOEJ},
  issn = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l = {0167-8191},
  url = {http://www.sciencedirect.com/science/article/pii/S0167819115000447},
  bibdate = {Wed Jun 17 11:37:27 MDT 2015},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Rodriguez:2015:OPI,
  author = {Marcos Rodr{\'\i}guez and Fernando Blesa and Roberto
    Barrio},
  title = {{OpenCL} parallel integration of ordinary differential
    equations: Applications in computational dynamics},
  journal = j-COMP-PHYS-COMM,
  fjournal = {Computer Physics Communications},
  journal-url = {http://www.sciencedirect.com/science/journal/00104655/},
  volume = {192},
  number = {??},
  pages = {228--236},
  month = jul,
  year = {2015},
  coden = {CPHCBZ},
  issn = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l = {0010-4655},
  url = {http://www.sciencedirect.com/science/article/pii/S0010465515000703},
  bibdate = {Tue Apr 21 11:56:04 MDT 2015},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Sack:2015:CAM,
  author = {Paul Sack and William Gropp},
  title = {Collective Algorithms for Multiported Torus Networks},
  journal = j-TOPC,
  fjournal = {ACM Transactions on Parallel Computing},
  journal-url = {http://dl.acm.org/citation.cfm?id=2632163},
  volume = {1},
  number = {2},
  pages = {12:1--12:??},
  articleno = {12},
  month = jan,
  year = {2015},
  coden = {????},
  issn = {2329-4949 (print), 2329-4957 (electronic)},
  issn-l = {2329-4949},
  doi = {https://doi.org/10.1145/2686882},
  abstract = {Modern supercomputers with torus networks allow each
    node to simultaneously pass messages on all of its
    links. However, most collective algorithms are designed
    to only use one link at a time. In this work, we
    present novel multiported algorithms for the scatter,
    gather, all-gather, and reduce-scatter operations. Our
    algorithms can be combined to create multiported
    reduce, all-reduce, and broadcast algorithms. Several
    of these algorithms involve a new technique where we
    relax the MPI message-ordering constraints to achieve
    high performance and restore the correct ordering using
    an additional stage of redundant communication.
    According to our models, on an $n$-dimensional torus,
    our algorithms should allow for nearly a $ 2 n$-fold
    improvement in communication performance compared to
    known, single-ported torus algorithms. In practice, we
    have achieved nearly $ 6 \times $ better performance on
    a 32k-node 3-dimensional torus.},
  bibdate = {Wed Feb 18 16:46:00 MST 2015},
  bibsource = {http://topc.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/topc.bib},
  acknowledgement = ack-nhfb,
}

@article{Saillard:2015:SDV,
  author = {Emmanuelle Saillard and Patrick Carribault and Denis
    Barthou},
  title = {Static\slash dynamic validation of {MPI} collective
    communications in multi-threaded context},
  journal = j-SIGPLAN,
  fjournal = {ACM SIGPLAN Notices},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  volume = {50},
  number = {8},
  pages = {279--280},
  month = aug,
  year = {2015},
  coden = {SINODQ},
  issn = {0362-1340 (print), 1523-2867 (print), 1558-1160
    (electronic)},
  issn-l = {0362-1340},
  doi = {https://doi.org/10.1145/2858788.2688548},
  abstract = {Scientific applications mainly rely on the MPI
    parallel programming model to reach high performance on
    supercomputers. The advent of manycore architectures
    (larger number of cores and lower amount of memory per
    core) leads to mix MPI with a thread-based model like
    OpenMP. But integrating two different programming
    models inside the same application can be tricky and
    generate complex bugs. Thus, the correctness of hybrid
    programs requires a special care regarding MPI calls
    location. For example, identical MPI collective
    operations cannot be performed by multiple
    non-synchronized threads. To tackle this issue, this
    paper proposes a static analysis and a reduced dynamic
    instrumentation to detect bugs related to misuse of MPI
    collective operations inside or outside threaded
    regions. This work extends PARCOACH designed for
    MPI-only applications and keeps the compatibility with
    these algorithms. We validated our method on multiple
    hybrid benchmarks and applications with a low
    overhead.},
  remark = {PPoPP '15 conference proceedings.},
  bibdate = {Tue Feb 16 12:01:42 MST 2016},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  acknowledgement = ack-nhfb,
}

@Article{Shterenlikht:2015:FC,
  author =       "Anton Shterenlikht and Lee Margetts and Luis Cebamanos
                 and David Henty",
  title =        "{Fortran 2008} coarrays",
  journal =      j-FORTRAN-FORUM,
  volume =       "34",
  number =       "1",
  pages =        "10--30",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2754942.2754944",
  ISSN =         "1061-7264 (print), 1931-1311 (electronic)",
  ISSN-L =       "1061-7264",
  bibdate =      "Mon Aug 10 06:22:12 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fortran-forum.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Coarrays are a Fortran 2008 standard feature intended
                 for SIMD type parallel programming. The runtime
                 environment starts a number of identical executable
                 images of the coarray program, on multiple processors,
                 which could be actual physical processors or threads.
                 Each image has a unique number and its private address
                 space. Ordinary variables are private to an image.
                 Coarray variables are available for read/write access
                 from any other image. Coarray communications are of
                 ``single sided'' type, i.e. a remote call from image A
                 to image B does not need to be accompanied by a
                 corresponding call in image B. This feature makes
                 coarray programming a lot simpler than MPI. The
                 standard provides synchronisation intrinsics to help
                 avoid race conditions or deadlocks. Any ordinary
                 variable can be made into a coarray --- scalars,
                 arrays, intrinsic or derived data types, pointers,
                 allocatables are all allowed. Coarrays can be declared
                 in, and passed to, procedures. Coarrays are thus very
                 flexible and can be used for a number of purposes. For
                 example a collection of coarrays from all or some
                 images can be thought of as a large single array. This
                 is precisely the inverse of the model partitioning
                 logic, typical in MPI programs. A coarray program can
                 exploit functional parallelism too, by delegating
                 distinct tasks to separate images or teams of images.
                 Coarray collectives are expected to become a part of
                 the next version of the Fortran standard. A major
                 unresolved problem of coarray programming is the lack
                 of standard parallel I/O facility in Fortran. In this
                 paper several simple complete coarray programs are
                 shown and compared to alternative parallel technologies
                 --- OpenMP, MPI and Fortran 2008 intrinsic ``do
                 concurrent''. Inter-image communication patterns and
                 data transfer are illustrated. An example of a
                 materials microstructure simulation coarray program
                 scaled up to 32k cores is shown. Problems with coarray
                 I/O at this scale are highlighted and addressed with
                 the use of MPI-I/O. A hybrid MPI/coarray programming is
                 discussed and illustrated with a finite
                 element/cellular automata (CAF{\'E}) multi-scale model.
                 The paper completes with a description of the new
                 coarray language features, expected in the 2015 Fortran
                 standard, and with a brief list of coarray resources.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Fortran Forum",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J286",
}

@article{Sosonkina:2015:RAV,
  author = {Masha Sosonkina and Layne T. Watson and Jian He},
  title = {Remark on Algorithm 897: {VTDIRECT95}: Serial and
    Parallel Codes for the Global Optimization Algorithm
    {DIRECT}},
  journal = j-TOMS,
  fjournal = {ACM Transactions on Mathematical Software (TOMS)},
  journal-url = {http://dl.acm.org/pub.cfm?id=J782},
  volume = {41},
  number = {3},
  pages = {22:1--22:2},
  articleno = {22},
  month = jun,
  year = {2015},
  coden = {ACMSCU},
  issn = {0098-3500 (print), 1557-7295 (electronic)},
  issn-l = {0098-3500},
  doi = {https://doi.org/10.1145/2699459},
  note = {See \cite{He:2009:AVS}.},
  abstract = {The Fortran95 code VTDIRECT95, based on the original
    MPI, has been modified to use MPI-2. An option for
    VTDIRECT95 is to divide the feasible box into
    subdomains, and concurrently apply the global direct
    search algorithm DIRECT within each subdomain. When the
    number of subdomains is greater than one, a bug causes
    VTDIRECT95 to occasionally sample outside the given
    feasible box, which is serious if the objective
    function is not defined outside the given box. This bug
    has been fixed, and the sample output files have been
    updated to reflect the correction. For completeness,
    the package VTDIRECT95 now contains both the MPI-1
    (with the multiple subdomain bug fixed) and the MPI-2
    versions of the code.},
  bibdate = {Wed Jun 3 17:59:32 MDT 2015},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/toms.bib},
  acknowledgement = ack-nhfb,
}

@article{Spencer:2015:DLN,
  author = {Matt Spencer and Jesse Eickholt and Jianlin Cheng},
  title = {A deep learning network approach to ab initio protein
    secondary structure prediction},
  journal = j-TCBB,
  fjournal = {IEEE/ACM Transactions on Computational Biology and
    Bioinformatics},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J954},
  volume = {12},
  number = {1},
  pages = {103--112},
  month = jan,
  year = {2015},
  coden = {ITCBCY},
  issn = {1545-5963 (print), 1557-9964 (electronic)},
  issn-l = {1545-5963},
  doi = {https://doi.org/10.1109/TCBB.2014.2343960},
  abstract = {Ab initio protein secondary structure (SS) predictions
    are utilized to generate tertiary structure
    predictions, which are increasingly demanded due to the
    rapid discovery of proteins. Although recent
    developments have slightly exceeded previous methods of
    SS prediction, accuracy has stagnated around 80 percent
    and many wonder if prediction cannot be advanced beyond
    this ceiling. Disciplines that have traditionally
    employed neural networks are experimenting with novel
    deep learning techniques in attempts to stimulate
    progress. Since neural networks have historically
    played an important role in SS prediction, we wanted to
    determine whether deep learning could contribute to the
    advancement of this field as well. We developed an SS
    predictor that makes use of the position-specific
    scoring matrix generated by PSI-BLAST and deep learning
    network architectures, which we call DNSS. Graphical
    processing units and CUDA software optimize the deep
    network architecture and efficiently train the deep
    networks. Optimal parameters for the training process
    were determined, and a workflow comprising three
    separately trained deep networks was constructed in
    order to make refined predictions. This deep learning
    network approach was used to predict SS for a fully
    independent test dataset of 198 proteins, achieving a
    Q3 accuracy of 80.7 percent and a Sov accuracy of 74.2
    percent.},
  bibdate = {Fri Aug 28 05:40:09 MDT 2015},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/tcbb.bib},
  acknowledgement = ack-nhfb,
}

@article{Spiechowicz:2015:GAM,
  author = {J. Spiechowicz and M. Kostur and L. Machura},
  title = {{GPU} accelerated {Monte Carlo} simulation of
    {Brownian} motors dynamics with {CUDA}},
  journal = j-COMP-PHYS-COMM,
  fjournal = {Computer Physics Communications},
  journal-url = {http://www.sciencedirect.com/science/journal/00104655/},
  volume = {191},
  number = {??},
  pages = {140--149},
  month = jun,
  year = {2015},
  coden = {CPHCBZ},
  issn = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l = {0010-4655},
  url = {http://www.sciencedirect.com/science/article/pii/S0010465515000417},
  bibdate = {Fri Apr 24 18:44:55 MDT 2015},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@article{Steuwer:2015:GPP,
  author = {Michel Steuwer and Christian Fensch and Sam Lindley
    and Christophe Dubach},
  title = {Generating performance portable code using rewrite
    rules: from high-level functional expressions to
    high-performance {OpenCL} code},
  journal = j-SIGPLAN,
  fjournal = {ACM SIGPLAN Notices},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  volume = {50},
  number = {9},
  pages = {205--217},
  month = sep,
  year = {2015},
  coden = {SINODQ},
  issn = {0362-1340 (print), 1523-2867 (print), 1558-1160
    (electronic)},
  issn-l = {0362-1340},
  doi = {https://doi.org/10.1145/2858949.2784754},
  abstract = {Computers have become increasingly complex with the
    emergence of heterogeneous hardware combining multicore
    CPUs and GPUs. These parallel systems exhibit
    tremendous computational power at the cost of increased
    programming effort resulting in a tension between
    performance and code portability. Typically, code is
    either tuned in a low-level imperative language using
    hardware-specific optimizations to achieve maximum
    performance or is written in a high-level, possibly
    functional, language to achieve portability at the
    expense of performance. We propose a novel approach
    aiming to combine high-level programming, code
    portability, and high-performance. Starting from a
    high-level functional expression we apply a simple set
    of rewrite rules to transform it into a low-level
    functional representation, close to the OpenCL
    programming model, from which OpenCL code is generated.
    Our rewrite rules define a space of possible
    implementations which we automatically explore to
    generate hardware-specific OpenCL implementations. We
    formalize our system with a core dependently-typed
    lambda-calculus along with a denotational semantics
    which we use to prove the correctness of the rewrite
    rules. We test our design in practice by implementing a
    compiler which generates high performance imperative
    OpenCL code. Our experiments show that we can
    automatically derive hardware-specific implementations
    from simple functional high-level algorithmic
    expressions offering performance on a par with highly
    tuned code for multicore CPUs and GPUs written by
    experts.},
  remark = {ICFP '15 conference proceedings.},
  bibdate = {Tue Feb 16 12:01:43 MST 2016},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  acknowledgement = ack-nhfb,
}

@article{Takizawa:2015:ODT,
  author = {Hiroyuki Takizawa and Shoichi Hirasawa and Makoto
    Sugawara and Isaac Gelado and Hiroaki Kobayashi and
    Wen-mei W. Hwu},
  title = {Optimized Data Transfers Based on the {OpenCL} Event
    Management Mechanism},
  journal = j-SCI-PROG,
  fjournal = {Scientific Programming},
  journalabr = {Sci. Prog},
  journal-url = {https://www.hindawi.com/journals/sp/},
  volume = {2015},
  number = {??},
  pages = {576498:1--576498:16},
  month = {????},
  year = {2015},
  coden = {SCIPEV},
  issn = {1058-9244 (print), 1875-919X (electronic)},
  issn-l = {1058-9244},
  doi = {https://doi.org/10.1155/2015/576498},
  url = {https://www.hindawi.com/journals/sp/2015/576498/},
  bibdate = {Tue Sep 20 07:53:44 MDT 2016},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sciprogram.bib},
  acknowledgement = ack-nhfb,
}

@article{Tennyson:2015:MOI,
  author = {P. Gerald Tennyson and G. M. Karthik and G.
    Phanikumar},
  title = {{MPI + OpenCL} implementation of a phase-field method
    incorporating {CALPHAD} description of {Gibbs} energies
    on heterogeneous computing platforms},
  journal = j-COMP-PHYS-COMM,
  fjournal = {Computer Physics Communications},
  journal-url = {http://www.sciencedirect.com/science/journal/00104655/},
  volume = {186},
  number = {??},
  pages = {48--64},
  month = jan,
  year = {2015},
  coden = {CPHCBZ},
  issn = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l = {0010-4655},
  doi = {https://doi.org/10.1016/j.cpc.2014.09.014},
  url = {http://www.sciencedirect.com/science/article/pii/S0010465514003208},
  bibdate = {Mon Nov 10 08:38:05 MST 2014},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Article{Thebault:2015:SEI,
  author =       "Lo{\"\i}c Th{\'e}bault and Eric Petit and Quang Dinh",
  title =        "Scalable and efficient implementation of {$3$D}
                 unstructured meshes computation: a case study on matrix
                 assembly",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "8",
  pages =        "120--129",
  month =        aug,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2858788.2688517",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:42 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Exposing massive parallelism on 3D unstructured meshes
                 computation with efficient load balancing and minimal
                 synchronizations is challenging. Current approaches
                 relying on domain decomposition and mesh coloring
                 struggle to scale with the increasing number of cores
                 per nodes, especially with new many-core processors. In
                 this paper, we propose an hybrid approach using domain
                 decomposition to exploit distributed memory
                 parallelism, Divide-and-Conquer, D{\&}C, to exploit
                 shared memory parallelism and improve locality, and
                 mesh coloring at core level to exploit vectors. It
                 illustrates a new trade-off for many-cores between
                 structuredness, memory locality, and vectorization. We
                 evaluate our approach on the finite element matrix
                 assembly of an industrial fluid dynamic code developed
                 by Dassault Aviation. We compare our D{\&}C approach to
                 domain decomposition and to mesh coloring. D{\&}C
                 achieves a high parallel efficiency, a good data
                 locality as well as an improved bandwidth usage. It
                 competes on current nodes with the optimized pure MPI
                 version with a minimum 10\% speed-up. D{\&}C shows an
                 impressive 319x strong scaling on 512 cores (32 nodes)
                 with only 2000 vertices per core. Finally, the Intel
                 Xeon Phi version has a performance similar to 10 Intel
                 E5-2665 Xeon Sandy Bridge cores and 95\% parallel
                 efficiency on the 60 physical cores. Running on 4 Xeon
                 Phi (240 cores), D{\&}C has 92\% efficiency on the
                 physical cores and performance similar to 33 Intel
                 E5-2665 Xeon Sandy Bridge cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '15 conference proceedings.",
}

@Article{Thompson:2015:PCI,
  author =       "Elizabeth Thompson and Nathan Clem and David A.
                 Peter",
  title =        "Parallel {CUDA} implementation of conflict detection
                 for application to airspace deconfliction",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "71",
  number =       "10",
  pages =        "3787--3810",
  month =        oct,
  year =         "2015",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-015-1467-z",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Tue Sep 29 10:07:24 MDT 2015",
  bibsource =    "http://link.springer.com/journal/11227/71/10;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-015-1467-z",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Vapirev:2015:IRC,
  author =       "A. Vapirev and J. Deca and G. Lapenta and S. Markidis
                 and I. Hur and J.-L. Cambier",
  title =        "Initial results on computational performance of
                 {Intel} many integrated core, {Sandy Bridge}, and
                 graphical processing unit architectures: implementation
                 of a {$1$D C++\slash OpenMP} electrostatic
                 particle-in-cell code",
  journal =      j-CCPE,
  volume =       "27",
  number =       "3",
  pages =        "581--593",
  day =          "10",
  month =        mar,
  year =         "2015",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3248",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Jul 25 19:54:06 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "6 Mar 2014",
}

@Article{Verschelde:2015:PHC,
  author =       "Jan Verschelde and Xiangcheng Yu",
  title =        "Polynomial homotopy continuation on {GPUs}",
  journal =      j-ACM-COMM-COMP-ALGEBRA,
  volume =       "49",
  number =       "4",
  pages =        "130--133",
  month =        dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2893803.2893810",
  ISSN =         "1932-2232 (print), 1932-2240 (electronic)",
  ISSN-L =       "1932-2232",
  bibdate =      "Wed Feb 17 16:05:57 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/python.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigsam.bib",
  abstract =     "The purpose of the software presentation is to
                 announce a library to track many solution paths defined
                 by a polynomial homotopy on a Graphics Processing Unit
                 (GPU). Developed on NVIDIA graphics cards with CUDA
                 SDKs, our code is released under the GNU GPL license.
                 Via the C interface to PHCpack, we can call our GPU
                 library from Python.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Communications in Computer Algebra",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1000",
}

@Article{Wang:2015:AST,
  author =       "Chun-Kun Wang and Peng-Sheng Chen",
  title =        "Automatic scoping of task clauses for the {OpenMP}
                 tasking model",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "71",
  number =       "3",
  pages =        "808--823",
  month =        mar,
  year =         "2015",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-014-1326-3",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Aug 8 12:23:09 MDT 2015",
  bibsource =    "http://link.springer.com/journal/11227/71/3;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-014-1326-3",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Wickerson:2015:RSP,
  author =       "John Wickerson and Mark Batty and Bradford M. Beckmann
                 and Alastair F. Donaldson",
  title =        "Remote-scope promotion: clarified, rectified, and
                 verified",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "10",
  pages =        "731--747",
  month =        oct,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2858965.2814283",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:43 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Modern accelerator programming frameworks, such as
                 OpenCL, organise threads into work-groups. Remote-scope
                 promotion (RSP) is a language extension recently
                 proposed by AMD researchers that is designed to enable
                 applications, for the first time, both to optimise for
                 the common case of intra-work-group communication
                 (using memory scopes to provide consistency only within
                 a work-group) and to allow occasional inter-work-group
                 communication (as required, for instance, to support
                 the popular load-balancing idiom of work stealing). We
                 present the first formal, axiomatic memory model of
                 OpenCL extended with RSP. We have extended the Herd
                 memory model simulator with support for OpenCL kernels
                 that exploit RSP, and used it to discover bugs in
                 several litmus tests and a work-stealing queue, that
                 have been used previously in the study of RSP. We have
                 also formalised the proposed GPU implementation of RSP.
                 The formalisation process allowed us to identify bugs
                 in the description of RSP that could result in
                 well-synchronised programs experiencing memory
                 inconsistencies. We present and prove sound a new
                 implementation of RSP that incorporates bug fixes and
                 requires less non-standard hardware than the original
                 implementation. This work, a collaboration between
                 academia and industry, clearly demonstrates how, when
                 designing hardware support for a new concurrent
                 language feature, the early application of formal tools
                 and techniques can help to prevent errors, such as
                 those we have found, from making it into silicon.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '15 conference proceedings.",
}

@Article{Yazdanpanah:2015:PHR,
  author =       "Fahimeh Yazdanpanah and Carlos {\'A}lvarez and Daniel
                 Jim{\'e}nez-Gonz{\'a}lez and Rosa M. Badia and Mateo
                 Valero",
  title =        "{Picos}: a hardware runtime architecture support for
                 {OmpSs}",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "53",
  number =       "??",
  pages =        "130--139",
  month =        dec,
  year =         "2015",
  CODEN =        "FGSEVI",
  DOI =          "https://doi.org/10.1016/j.future.2014.12.010",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Wed Aug 12 13:56:06 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167739X14002702",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X/",
  keywords =     "OpenMP",
}

@Article{You:2015:VFO,
  author =       "Yi-Ping You and Hen-Jung Wu and Yeh-Ning Tsai and
                 Yen-Ting Chao",
  title =        "{VirtCL}: a framework for {OpenCL} device abstraction
                 and management",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "8",
  pages =        "161--172",
  month =        aug,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2858788.2688505",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:42 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "The interest in using multiple graphics processing
                 units (GPUs) to accelerate applications has increased
                 in recent years. However, the existing heterogeneous
                 programming models (e.g., OpenCL) abstract details of
                 GPU devices at the per-device level and require
                 programmers to explicitly schedule their kernel tasks
                 on a system equipped with multiple GPU devices.
                 Unfortunately, multiple applications running on a
                 multi-GPU system may compete for some of the GPU
                 devices while leaving other GPU devices unused.
                 Moreover, the distributed memory model defined in
                 OpenCL, where each device has its own memory space,
                 increases the complexity of managing the memory among
                 multiple GPU devices. In this article we propose a
                 framework (called VirtCL) that reduces the programming
                 burden by acting as a layer between the programmer and
                 the native OpenCL run-time system for abstracting
                 multiple devices into a single virtual device and for
                 scheduling computations and communications among the
                 multiple devices. VirtCL comprises two main components:
                 (1) a front-end library, which exposes primary OpenCL
                 APIs and the virtual device, and (2) a back-end
                 run-time system (called CLDaemon) for scheduling and
                 dispatching kernel tasks based on a history-based
                 scheduler. The front-end library forwards computation
                 requests to the back-end CLDaemon, which then schedules
                 and dispatches the requests. We also propose a
                 history-based scheduler that is able to schedule kernel
                 tasks in a contention- and communication-aware manner.
                 Experiments demonstrated that the VirtCL framework
                 introduced a small overhead (mean of 6\%) but
                 outperformed the native OpenCL run-time system for most
                 benchmarks in the Rodinia benchmark suite, which was
                 due to the abstraction layer eliminating the
                 time-consuming initialization of OpenCL contexts. We
                 also evaluated different scheduling policies in VirtCL
                 with a real-world application (clsurf) and various
                 synthetic workload traces. The results indicated that
                 the VirtCL framework provides scalability for multiple
                 kernel tasks running on multi-GPU systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '15 conference proceedings.",
}

@Article{Younge:2015:SHP,
  author =       "Andrew J. Younge and John Paul Walters and Stephen P.
                 Crago and Geoffrey C. Fox",
  title =        "Supporting High Performance Molecular Dynamics in
                 Virtualized Clusters using {IOMMU}, {SR-IOV}, and
                 {GPUDirect}",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "7",
  pages =        "31--38",
  month =        jul,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2817817.2731194",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:42 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Cloud Infrastructure-as-a-Service paradigms have
                 recently shown their utility for a vast array of
                 computational problems, ranging from advanced web
                 service architectures to high throughput computing.
                 However, many scientific computing applications have
                 been slow to adapt to virtualized cloud frameworks.
                 This is due to performance impacts of virtualization
                 technologies, coupled with the lack of advanced
                 hardware support necessary for running many high
                 performance scientific applications at scale. By using
                 KVM virtual machines that leverage both Nvidia GPUs and
                 InfiniBand, we show that molecular dynamics simulations
                 with LAMMPS and HOOMD run at near-native speeds. This
                 experiment also illustrates how virtualized
                 environments can support the latest parallel computing
                 paradigms, including both MPI+CUDA and new GPUDirect
                 RDMA functionality. Specific findings show initial
                 promise in scaling of such applications to larger
                 production deployments targeting large scale
                 computational workloads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "VEE '15 conference proceedings.",
}

@Article{Zarrabi:2015:GSA,
  author =       "Amirreza Zarrabi and Khairulmizam Samsudin and Ettikan
                 K. Karuppiah",
  title =        "Gravitational search algorithm using {CUDA}: a case
                 study in high-performance metaheuristics",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "71",
  number =       "4",
  pages =        "1277--1296",
  month =        apr,
  year =         "2015",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-014-1360-1",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Aug 8 12:23:10 MDT 2015",
  bibsource =    "http://link.springer.com/journal/11227/71/4;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-014-1360-1",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Zhu:2015:PIM,
  author =       "Xiangyuan Zhu and Kenli Li and Ahmad Salah and Lin Shi
                 and Keqin Li",
  title =        "Parallel implementation of {MAFFT} on {CUDA}-enabled
                 graphics hardware",
  journal =      j-TCBB,
  volume =       "12",
  number =       "1",
  pages =        "205--218",
  month =        jan,
  year =         "2015",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2014.2351801",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Fri Aug 28 05:40:09 MDT 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  abstract =     "Multiple sequence alignment (MSA) constitutes an
                 extremely powerful tool for many biological
                 applications including phylogenetic tree estimation,
                 secondary structure prediction, and critical residue
                 identification. However, aligning large biological
                 sequences with popular tools such as MAFFT requires
                 long runtimes on sequential architectures. Due to the
                 ever increasing sizes of sequence databases, there is
                 increasing demand to accelerate this task. In this
                 paper, we demonstrate how graphic processing units
                 (GPUs), powered by the compute unified device
                 architecture (CUDA), can be used as an efficient
                 computational platform to accelerate the MAFFT
                 algorithm. To fully exploit the GPU's capabilities for
                 accelerating MAFFT, we have optimized the sequence data
                 organization to eliminate the bandwidth bottleneck of
                 memory access, designed a memory allocation and reuse
                 strategy to make full use of limited memory of GPUs,
                 proposed a new modified-run-length encoding (MRLE)
                 scheme to reduce memory consumption, and used
                 high-performance shared memory to speed up I/O
                 operations. Our implementation tested in three NVIDIA
                 GPUs achieves speedup up to 11.28 on a Tesla K20m GPU
                 compared to the sequential MAFFT 7.015.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J954",
}

@Article{Zhu:2015:PML,
  author =       "Leqing Zhu and Yadong Zhou and Daxing Zhang and Dadong
                 Wang and Huiyan Wang and Xun Wang",
  title =        "Parallel multi-level {2D-DWT} on {CUDA GPUs} and its
                 application in ring artifact removal",
  journal =      j-CCPE,
  volume =       "27",
  number =       "17",
  pages =        "5188--5202",
  day =          "10",
  month =        dec,
  year =         "2015",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3559",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Feb 9 06:13:20 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "5 Jun 2015",
}

@Article{Abdelfattah:2016:KOL,
  author =       "Ahmad Abdelfattah and David Keyes and Hatem Ltaief",
  title =        "{KBLAS}: an Optimized Library for Dense Matrix-Vector
                 Multiplication on {GPU} Accelerators",
  journal =      j-TOMS,
  volume =       "42",
  number =       "3",
  pages =        "18:1--18:31",
  month =        may,
  year =         "2016",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2818311",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon May 23 16:40:02 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "KBLAS is an open-source, high-performance library that
                 provides optimized kernels for a subset of Level 2 BLAS
                 functionalities on CUDA-enabled GPUs. Since performance
                 of dense matrix-vector multiplication is hindered by
                 the overhead of memory accesses, a double-buffering
                 optimization technique is employed to overlap data
                 motion with computation. After identifying a proper set
                 of tuning parameters, KBLAS efficiently runs on various
                 GPU architectures while avoiding code rewriting and
                 retaining compliance with the standard BLAS API.
                 Another optimization technique allows ensuring
                 coalesced memory access when dealing with submatrices,
                 especially for high-level dense linear algebra
                 algorithms. All KBLAS kernels have been leveraged to a
                 multi-GPU environment, which requires the introduction
                 of new APIs. Considering general matrices, KBLAS is
                 very competitive with existing state-of-the-art kernels
                 and provides a smoother performance across a wide range
                 of matrix dimensions. Considering symmetric and
                 Hermitian matrices, the KBLAS performance outperforms
                 existing state-of-the-art implementations on all matrix
                 sizes and achieves asymptotically up to 50\% and 60\%
                 speedup against the best competitor on single GPU and
                 multi-GPUs systems, respectively. Performance results
                 also validate our performance model. A subset of KBLAS
                 high-performance kernels have been integrated into
                 NVIDIA's standard BLAS implementation (cuBLAS) for
                 larger dissemination, starting from version 6.0.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Aji:2016:MAA,
  author =       "Ashwin M. Aji and Lokendra S. Panwar and Feng Ji and
                 Karthik Murthy and Milind Chabbi and Pavan Balaji and
                 Keith R. Bisset and James Dinan and Wu-chun Feng and
                 John Mellor-Crummey and Xiaosong Ma and Rajeev Thakur",
  title =        "{MPI-ACC}: Accelerator-Aware {MPI} for Scientific
                 Applications",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "27",
  number =       "5",
  pages =        "1401--1414",
  month =        may,
  year =         "2016",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2015.2446479",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Apr 15 13:45:22 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.computer.org/csdl/trans/td/2016/05/07127020-abs.html",
  abstract-URL = "http://www.computer.org/csdl/trans/td/2016/05/07127020-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Aji:2016:MEA,
  author =       "Ashwin M. Aji and Antonio J. Pe{\~n}a and Pavan Balaji
                 and Wu-chun Feng",
  title =        "{MultiCL}: Enabling automatic scheduling for
                 task-parallel workloads in {OpenCL}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "58",
  number =       "??",
  pages =        "37--55",
  month =        oct,
  year =         "2016",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Tue Sep 27 08:00:38 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819116300357",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191/",
}

@Article{Aldea:2016:OES,
  author =       "Sergio Aldea and Alvaro Estebanez and Diego R. Llanos
                 and Arturo Gonzalez-Escribano",
  title =        "An {OpenMP} Extension that Supports Thread-Level
                 Speculation",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "27",
  number =       "1",
  pages =        "78--91",
  month =        jan,
  year =         "2016",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2015.2393870",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Tue Dec 15 09:28:10 MST 2015",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.computer.org/csdl/trans/td/2016/01/07014262-abs.html",
  abstract-URL = "http://www.computer.org/csdl/trans/td/2016/01/07014262-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{AlQuraishi:2016:CBP,
  author =       "Eman AlQuraishi and Eman AlDwaisan and Alaa AlSaqaa
                 and Imtiaz Ahmad",
  title =        "A {CUDA}-based parallel implementation of a test
                 vectors encoding algorithm in compression-based scan
                 designs",
  journal =      j-INT-J-PAR-EMER-DIST-SYS,
  volume =       "31",
  number =       "3",
  pages =        "280--293",
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1080/17445760.2015.1016516",
  ISSN =         "1744-5760 (print), 1744-5779 (electronic)",
  ISSN-L =       "1744-5760",
  bibdate =      "Mon Sep 12 09:19:42 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/intjparemerdistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.tandfonline.com/toc/gpaa20/31/3",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel, Emergent and
                 Distributed Systems: IJPEDS",
  journal-URL =  "http://www.tandfonline.com/loi/gpaa20",
  onlinedate =   "05 Mar 2015",
}

@Article{Andion:2016:LAA,
  author =       "Jos{\'e} M. Andi{\'o}n and Manuel Arenaz and
                 Fran{\c{c}}ois Bodin and Gabriel Rodr{\'\i}guez and
                 Juan Touri{\~n}o",
  title =        "Locality-Aware Automatic Parallelization for {GPGPU}
                 with {OpenHMPP} Directives",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "44",
  number =       "3",
  pages =        "620--643",
  month =        jun,
  year =         "2016",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-015-0362-9",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Mon May 30 09:25:55 MDT 2016",
  bibsource =    "http://link.springer.com/journal/10766/44/3;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s10766-015-0362-9",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Andujar:2016:OSF,
  author =       "Francisco J. And{\'u}jar and Juan A. Villar and
                 Francisco J. Alfaro and Jos{\'e} L. S{\'a}nchez and
                 Jesus Escudero-Sahuquillo",
  title =        "An open-source family of tools to reproduce
                 {MPI}-based workloads in interconnection network
                 simulators",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "72",
  number =       "12",
  pages =        "4601--4628",
  month =        dec,
  year =         "2016",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-016-1757-0",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Jun 24 10:31:30 MDT 2017",
  bibsource =    "http://link.springer.com/journal/11227/72/12;
                 http://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Aubrey-Jones:2016:SMI,
  author =       "Tristan Aubrey-Jones and Bernd Fischer",
  title =        "Synthesizing {MPI} Implementations from Functional
                 Data-Parallel Programs",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "44",
  number =       "3",
  pages =        "552--573",
  month =        jun,
  year =         "2016",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-015-0359-4",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Mon May 30 09:25:55 MDT 2016",
  bibsource =    "http://link.springer.com/journal/10766/44/3;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s10766-015-0359-4",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Bader:2016:EMT,
  author =       "David A. Bader",
  title =        "Evolving {MPI+X} Toward Exascale",
  journal =      j-COMPUTER,
  volume =       "49",
  number =       "8",
  pages =        "10--10",
  month =        aug,
  year =         "2016",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Tue Aug 23 06:56:16 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computer2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://csdl.computer.org/csdl/mags/co/2016/08/mco2016080010.html",
  abstract-URL = "http://csdl.computer.org/csdl/mags/co/2016/08/mco2016080010-abs.html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.computer.org/portal/web/csdl/magazines/computer",
}

@Article{Batty:2016:OSA,
  author =       "Mark Batty and Alastair F. Donaldson and John
                 Wickerson",
  title =        "Overhauling {SC} atomics in {C11} and {OpenCL}",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "1",
  pages =        "634--648",
  month =        jan,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2914770.2837637",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Jun 9 17:13:57 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Despite the conceptual simplicity of sequential
                 consistency (SC), the semantics of SC atomic operations
                 and fences in the C11 and OpenCL memory models is
                 subtle, leading to convoluted prose descriptions that
                 translate to complex axiomatic formalisations. We
                 conduct an overhaul of SC atomics in C11, reducing the
                 associated axioms in both number and complexity. A
                 consequence of our simplification is that the SC
                 operations in an execution no longer need to be totally
                 ordered. This relaxation enables, for the first time,
                 efficient and exhaustive simulation of litmus tests
                 that use SC atomics. We extend our improved C11 model
                 to obtain the first rigorous memory model formalisation
                 for OpenCL (which extends C11 with support for
                 heterogeneous many-core programming). In the OpenCL
                 setting, we refine the SC axioms still further to give
                 a sensible semantics to SC operations that employ a
                 `memory scope' to restrict their visibility to specific
                 threads. Our overhaul requires slight strengthenings of
                 both the C11 and the OpenCL memory models, causing some
                 behaviours to become disallowed. We argue that these
                 strengthenings are natural, and that all of the
                 formalised C11 and OpenCL compilation schemes of which
                 we are aware (Power and x86 CPUs for C11, AMD GPUs for
                 OpenCL) remain valid in our revised models. Using the
                 HERD memory model simulator, we show that our overhaul
                 leads to an exponential improvement in simulation time
                 for C11 litmus tests compared with the original model,
                 making \emph{exhaustive} simulation competitive, time-wise,
                 with the \emph{non-exhaustive} CDSChecker tool.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "POPL '16 conference proceedings.",
}

@Article{Bolis:2016:APA,
  author =       "A. Bolis and C. D. Cantwell and D. Moxey and D. Serson
                 and S. J. Sherwin",
  title =        "An adaptable parallel algorithm for the direct
                 numerical simulation of incompressible turbulent flows
                 using a {Fourier} spectral\slash $hp$ element method
                 and {MPI} virtual topologies",
  journal =      j-COMP-PHYS-COMM,
  volume =       "206",
  number =       "??",
  pages =        "17--25",
  month =        sep,
  year =         "2016",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Fri Jun 10 18:27:25 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S001046551630100X",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655/",
}

@Article{Calore:2016:PPA,
  author =       "Enrico Calore and Alessandro Gabbana and Jiri Kraus
                 and Sebastiano Fabio Schifano and Raffaele
                 Tripiccione",
  title =        "Performance and portability of accelerated lattice
                 {Boltzmann} applications with {OpenACC}",
  journal =      j-CCPE,
  volume =       "28",
  number =       "12",
  pages =        "3485--3502",
  day =          "25",
  month =        aug,
  year =         "2016",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3862",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Sep 13 08:30:12 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Chang:2016:APC,
  author =       "Chih-Hung Chang and Chih-Wei Lu and Chao-Tung Yang and
                 Tzu-Chieh Chang",
  title =        "An approach of performance comparisons with {OpenMP}
                 and {CUDA} parallel programming on multicore systems",
  journal =      j-CCPE,
  volume =       "28",
  number =       "16",
  pages =        "4230--4245",
  month =        nov,
  year =         "2016",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3829",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Thu Nov 17 07:11:02 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Chang:2016:DLD,
  author =       "Li-Wen Chang and Hee-Seok Kim and Wen-mei W. Hwu",
  title =        "{DySel}: Lightweight Dynamic Selection for
                 Kernel-based Data-parallel Programming Model",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "4",
  pages =        "667--680",
  month =        apr,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2954679.2872373",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Jun 9 17:13:59 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "The rising pressure for simultaneously improving
                 performance and reducing power is driving more
                 diversity into all aspects of computing devices. An
                 algorithm that is well-matched to the target hardware
                 can run multiple times faster and more energy
                 efficiently than one that is not. The problem is
                 complicated by the fact that a program's input also
                 affects the appropriate choice of algorithm. As a
                 result, software developers have been faced with the
                 challenge of determining the appropriate algorithm for
                 each potential combination of target device and data.
                 This paper presents DySel, a novel runtime system for
                 automating such determination for kernel-based data
                 parallel programming models such as OpenCL, CUDA,
                 OpenACC, and C++AMP. These programming models cover
                 many applications that demand high performance in
                 mobile, cloud and high-performance computing. DySel
                 systematically deploys candidate kernels on a small
                 portion of the actual data to determine which achieves
                 the best performance for the hardware-data combination.
                 The test-deployment, referred to as micro-profiling,
                 contributes to the final execution result and incurs
                 less than 8\% of overhead in the worst observed case
                 when compared to an oracle. We show four major use
                 cases where DySel provides significantly more
                 consistent performance without tedious effort from the
                 developer.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '16 conference proceedings.",
}

@Article{Cores:2016:ROM,
  author =       "Iv{\'a}n Cores and M{\'o}nica Rodr{\'\i}guez and
                 Patricia Gonz{\'a}lez and Mar{\'\i}a J. Mart{\'\i}n",
  title =        "Reducing the overhead of an {MPI} application-level
                 migration approach",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "54",
  number =       "??",
  pages =        "72--82",
  month =        may,
  year =         "2016",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed May 4 17:36:47 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819116000429",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191/",
}

@Article{Creech:2016:TSS,
  author =       "Timothy Creech and Rajeev Barua",
  title =        "Transparently Space Sharing a Multicore Among Multiple
                 Processes",
  journal =      j-TOPC,
  volume =       "3",
  number =       "3",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3001910",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Dec 26 17:40:41 MST 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "As hardware becomes increasingly parallel and the
                 availability of scalable parallel software improves,
                 the problem of managing multiple multithreaded
                 applications (processes) becomes important. Malleable
                 processes, which can vary the number of threads used as
                 they run, enable sophisticated and flexible resource
                 management. Although many existing applications
                 parallelized for SMPs with parallel runtimes are in
                 fact already malleable, deployed runtime environments
                 provide no interface nor any strategy for intelligently
                 allocating hardware threads or even preventing
                 oversubscription. Prior research methods either depend
                 on profiling applications ahead of time to make good
                 decisions about allocations or do not account for
                 process efficiency at all, leading to poor performance.
                 None of these prior methods have been adapted widely in
                 practice. This article presents the Scheduling and
                 Allocation with Feedback (SCAF) system: a drop-in
                 runtime solution that supports existing malleable
                 applications in making intelligent allocation decisions
                 based on observed efficiency without any changes to
                 semantics, program modification, offline profiling, or
                 even recompilation. Our existing implementation can
                 control most unmodified OpenMP applications. Other
                 malleable threading libraries can also easily be
                 supported with small modifications without requiring
                 application modification or recompilation. In this
                 work, we present the SCAF daemon and a SCAF-aware port
                 of the GNU OpenMP runtime. We present a new technique
                 for estimating process efficiency purely at runtime
                 using available hardware counters and demonstrate its
                 effectiveness in aiding allocation decisions. We
                 evaluated SCAF using NAS NPB parallel benchmarks on
                 five commodity parallel platforms, enumerating
                 architectural features and their effects on our scheme.
                 We measured the benefit of SCAF in terms of sum of
                 speedups improvement (a common metric for
                 multiprogrammed environments) when running all
                 benchmark pairs concurrently compared to
                 equipartitioning---the best existing competing scheme in
                 the literature. We found that SCAF improves on
                 equipartitioning on four out of five machines, showing
                 a mean improvement factor in sum of speedups of 1.04 to
                 1.11x for benchmark pairs, depending on the machine,
                 and 1.09x on average. Since we are not aware of any
                 widely available tool for equipartitioning, we also
                 compare SCAF against multiprogramming using unmodified
                 OpenMP, which is the only environment available to end
                 users today. SCAF improves on the unmodified OpenMP
                 runtimes for all five machines, with a mean improvement
                 of 1.08 to 2.07x, depending on the machine, and 1.59x
                 on average.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Creel:2016:NJM,
  author =       "Michael Creel",
  title =        "A Note on {Julia} and {MPI}, with Code Examples",
  journal =      j-COMP-ECONOMICS,
  volume =       "48",
  number =       "3",
  pages =        "??--??",
  month =        oct,
  year =         "2016",
  CODEN =        "CNOMEL",
  DOI =          "https://doi.org/10.1007/s10614-015-9516-5",
  ISSN =         "0927-7099 (print), 1572-9974 (electronic)",
  ISSN-L =       "0927-7099",
  bibdate =      "Fri Apr 9 07:54:52 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/julia.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s10614-015-9516-5",
  acknowledgement = ack-nhfb,
  fjournal =     "Computational Economics",
}

@Book{Czech:2016:IPC,
  author =       "Zbigniew J. Czech",
  title =        "Introduction to Parallel Computing",
  publisher =    pub-CAMBRIDGE,
  address =      pub-CAMBRIDGE:adr,
  pages =        "xvii + 354",
  year =         "2016",
  DOI =          "https://doi.org/10.1017/9781316795835",
  ISBN =         "1-107-17439-2 (hardcover), 1-316-79583-7 (e-book)",
  ISBN-13 =      "978-1-107-17439-9 (hardcover), 978-1-316-79583-5
                 (e-book)",
  LCCN =         "QA76.58 .C975 2016",
  bibdate =      "Fri Mar 31 11:22:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/super.bib;
                 z3950.loc.gov:7090/Voyager",
  abstract =     "The constantly increasing demand for more computing
                 power can seem impossible to keep up with. However,
                 multicore processors capable of performing computations
                 in parallel allow computers to tackle ever larger
                 problems in a wide variety of applications. This book
                 provides a comprehensive introduction to parallel
                 computing, discussing theoretical issues such as the
                 fundamentals of concurrent processes, models of
                 parallel and distributed computing, and metrics for
                 evaluating and comparing parallel algorithms, as well
                 as practical issues, including methods of designing and
                 implementing shared- and distributed-memory programs,
                 and standards for parallel program implementation, in
                 particular MPI and OpenMP interfaces. Each chapter
                 presents the basics in one place followed by advanced
                 topics, allowing novices and experienced practitioners
                 to quickly find what they need. A glossary and more
                 than 80 exercises with selected solutions aid
                 comprehension. The book is recommended as a text for
                 advanced undergraduate or graduate students and as a
                 reference for practitioners.",
  acknowledgement = ack-nhfb,
  subject =      "Parallel processing (Electronic computers)",
  tableofcontents = "Concurrent processes \\
                 Basic models of parallel computation \\
                 Elementary parallel algorithms \\
                 Designing parallel algorithms \\
                 Architectures of parallel computers \\
                 Message-passing programming \\
                 Shared-memory programming",
}

@Article{Dathathri:2016:CAL,
  author =       "Roshan Dathathri and Ravi Teja Mullapudi and Uday
                 Bondhugula",
  title =        "Compiling Affine Loop Nests for a Dynamic Scheduling
                 Runtime on Shared and Distributed Memory",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "12:1--12:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948975",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Current de-facto parallel programming models like
                 OpenMP and MPI make it difficult to extract task-level
                 dataflow parallelism as opposed to bulk-synchronous
                 parallelism. Task parallel approaches that use
                 point-to-point synchronization between dependent tasks
                 in conjunction with dynamic scheduling dataflow
                 runtimes are thus becoming attractive. Although good
                 performance can be extracted for both shared and
                 distributed memory using these approaches, there is
                 little compiler support for them. In this article, we
                 describe the design of compiler--runtime interaction to
                 automatically extract coarse-grained dataflow
                 parallelism in affine loop nests for both shared and
                 distributed-memory architectures. We use techniques
                 from the polyhedral compiler framework to extract tasks
                 and generate components of the runtime that are used to
                 dynamically schedule the generated tasks. The runtime
                 includes a distributed decentralized scheduler that
                 dynamically schedules tasks on a node. The schedulers
                 on different nodes cooperate with each other through
                 asynchronous point-to-point communication, and all of
                 this is achieved by code automatically generated by the
                 compiler. On a set of six representative affine loop
                 nest benchmarks, while running on 32 nodes with 8
                 threads each, our compiler-assisted runtime yields a
                 geometric mean speedup of $ 143.6 \times $ ($ 70.3
                 \times $ to $ 474.7 \times $) over the sequential
                 version and a geometric mean speedup of $ 1.64 \times $
                 ($ 1.04 \times $ to $ 2.42 \times $) over the
                 state-of-the-art automatic parallelization approach
                 that uses bulk synchronization. We also compare our
                 system with past work that addresses some of these
                 challenges on shared memory, and an emerging runtime
                 (Intel Concurrent Collections) that demands higher
                 programmer input and effort in parallelizing. To the
                 best of our knowledge, ours is also the first automatic
                 scheme that allows for dynamic scheduling of affine
                 loop nests on a cluster of multicores.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Deniz:2016:MGM,
  author =       "Etem Deniz and Alper Sen",
  title =        "{MINIME-GPU}: Multicore Benchmark Synthesizer for
                 {GPUs}",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "34:1--34:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818693",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We introduce MINIME-GPU, a novel automated benchmark
                 synthesis framework for graphics processing units
                 (GPUs) that serves to speed up architectural simulation
                 of modern GPU architectures. Our framework captures
                 important characteristics of original GPU applications
                 and generates synthetic GPU benchmarks using the Open
                 Computing Language (OpenCL) library from those
                 applications. To the best of our knowledge, this is the
                 first time synthetic OpenCL benchmarks for GPUs are
                 generated from existing applications. We use several
                 characteristics, including instruction throughput,
                 compute unit occupancy, and memory efficiency, to
                 compare the similarity of original applications and
                 their corresponding synthetic benchmarks. The
                 experimental results show that our synthetic benchmark
                 generation framework is capable of generating synthetic
                 benchmarks that have similar characteristics with the
                 original applications from which they are generated. On
                 average, the similarity (accuracy) is 96\% and the
                 speedup is $ 541 \times $. In addition, our synthetic
                 benchmarks use the OpenCL library, which allows us to
                 obtain portable human readable benchmarks as opposed to
                 using assembly-level code, and they are faster and
                 smaller than the original applications from which they
                 are generated. We experimentally validated that our
                 synthetic benchmarks preserve the characteristics of
                 the original applications across different
                 architectures.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Dinan:2016:IEM,
  author =       "James Dinan and Pavan Balaji and Darius Buntinas and
                 David Goodell and William Gropp and Rajeev Thakur",
  title =        "An implementation and evaluation of the {MPI 3.0}
                 one-sided communication interface",
  journal =      j-CCPE,
  volume =       "28",
  number =       "17",
  pages =        "4385--4404",
  day =          "10",
  month =        dec,
  year =         "2016",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3758",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Thu Nov 17 07:11:03 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{DiPietro:2016:CLD,
  author =       "Roberto {Di Pietro} and Flavio Lombardi and Antonio
                 Villani",
  title =        "{CUDA} Leaks: a Detailed Hack for {CUDA} and a
                 (Partial) Fix",
  journal =      j-TECS,
  volume =       "15",
  number =       "1",
  pages =        "15:1--15:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2801153",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Wed Jun 8 09:43:30 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "Graphics processing units (GPUs) are increasingly
                 common on desktops, servers, and embedded platforms. In
                 this article, we report on new security issues related
                 to CUDA, which is the most widespread platform for GPU
                 computing. In particular, details and proofs-of-concept
                 are provided about novel vulnerabilities to which CUDA
                 architectures are subject. We show how such
                 vulnerabilities can be exploited to cause severe
                 information leakage. As a case study, we experimentally
                 show how to exploit one of these vulnerabilities on a
                 GPU implementation of the AES encryption algorithm.
                 Finally, we also suggest software patches and
                 alternative approaches to tackle the presented
                 vulnerabilities.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Eckert:2016:HAL,
  author =       "C. H. J. Eckert and E. Zenker and M. Bussmann and D.
                 Albach",
  title =        "{HASEonGPU} --- an adaptive, load-balanced {MPI\slash
                 GPU}-code for calculating the amplified spontaneous
                 emission in high power laser media",
  journal =      j-COMP-PHYS-COMM,
  volume =       "207",
  number =       "??",
  pages =        "362--374",
  month =        oct,
  year =         "2016",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Tue Aug 30 18:08:51 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465516301436",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655/",
}

@Article{Fabeiro:2016:WPP,
  author =       "Jorge F. Fabeiro and Diego Andrade and Basilio B.
                 Fraguela",
  title =        "Writing a performance-portable matrix multiplication",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "52",
  number =       "??",
  pages =        "65--77",
  month =        feb,
  year =         "2016",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2015.12.005",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Feb 12 18:56:20 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819115001611",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191/",
  keywords =     "GPU; Heterogeneous Programming Library (HPL); Intel
                 Xeon Phi; MPI; OpenCL; OpenMP",
}

@Article{Gong:2016:NPG,
  author =       "Jing Gong and Stefano Markidis and Erwin Laure and
                 Matthew Otten and Paul Fischer and Misun Min",
  title =        "Nekbone performance on {GPUs} with {OpenACC} and
                 {CUDA} {Fortran} implementations",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "72",
  number =       "11",
  pages =        "4160--4180",
  month =        nov,
  year =         "2016",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-016-1744-5",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Jun 24 10:31:30 MDT 2017",
  bibsource =    "http://link.springer.com/journal/11227/72/11;
                 http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Guang:2016:NMN,
  author =       "Suo Guang",
  title =        "{NR-MPI}: A Non-stop and Fault Resilient {MPI}
                 Supporting Programmer Defined Data Backup and Restore
                 for {E}-scale Super Computing Systems",
  journal =      j-SUPERFRI,
  volume =       "3",
  number =       "1",
  pages =        "4--21",
  month =        "????",
  year =         "2016",
  CODEN =        "????",
  ISSN =         "2409-6008 (print), 2313-8734 (electronic)",
  bibdate =      "Sat Nov 11 07:15:27 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/superfri.bib",
  URL =          "http://superfri.org/superfri/article/view/89",
  acknowledgement = ack-nhfb,
  fjournal =     "Supercomputing Frontiers and Innovations",
  journal-URL =  "http://superfri.org/superfri/issue/archive",
}

@Article{Hamidouche:2016:CAO,
  author =       "Khaled Hamidouche and Akshay Venkatesh and Ammar Ahmad
                 Awan and Hari Subramoni and Ching-Hsiang Chu and
                 Dhabaleswar K. Panda",
  title =        "{CUDA}-Aware {OpenSHMEM}: Extensions and Designs for
                 High Performance {OpenSHMEM} on {GPU} Clusters",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "58",
  number =       "??",
  pages =        "27--36",
  month =        oct,
  year =         "2016",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Tue Sep 27 08:00:38 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819116300345",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191/",
}

@Article{Haque:2016:ACV,
  author =       "Syed Arefinul Haque and Salekul Islam and Md. Jahidul
                 Islam and Jean-Charles Gr{\'e}goire",
  title =        "An architecture for client virtualization: a case
                 study",
  journal =      j-COMP-NET-AMSTERDAM,
  volume =       "100",
  number =       "??",
  pages =        "75--89",
  day =          "8",
  month =        may,
  year =         "2016",
  CODEN =        "????",
  ISSN =         "1389-1286 (print), 1872-7069 (electronic)",
  ISSN-L =       "1389-1286",
  bibdate =      "Thu May 12 08:55:09 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compnetamsterdam2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S1389128616300421",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Networks (Amsterdam, Netherlands: 1999)",
  journal-URL =  "http://www.sciencedirect.com/science/journal/13891286/",
}

@Article{Hariri:2016:PPA,
  author =       "F. Hariri and T. M. Tran and A. Jocksch and E. Lanti
                 and J. Progsch and P. Messmer and S. Brunner and C.
                 Gheller and L. Villard",
  title =        "A portable platform for accelerated {PIC} codes and
                 its application to {GPUs} using {OpenACC}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "207",
  number =       "??",
  pages =        "69--82",
  month =        oct,
  year =         "2016",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Tue Aug 30 18:08:51 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465516301242",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655/",
}

@Article{Hu:2016:CLG,
  author =       "Liang Hu and Xilong Che and Si-Qing Zheng",
  title =        "A Closer Look at {GPGPU}",
  journal =      j-COMP-SURV,
  volume =       "48",
  number =       "4",
  pages =        "60:1--60:??",
  month =        may,
  year =         "2016",
  CODEN =        "CMSVAN",
  DOI =          "https://doi.org/10.1145/2873053",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Mon May 2 16:19:12 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/surveys/;
                 http://www.math.utah.edu/pub/tex/bib/compsurv.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "The lack of detailed white box illustration leaves a
                 gap in the field of GPGPU (General-Purpose Computing on
                 the Graphic Processing Unit), thus hindering users and
                 researchers from exploring hardware potential while
                 improving application performance. This article bridges
                 the gap by demystifying the micro-architecture and
                 operating mechanism of GPGPU. We propose a descriptive
                 model that addresses key issues of most concerns,
                 including task organization, hardware structure,
                 scheduling mechanism, execution mechanism, and memory
                 access. We also validate the effectiveness of our model
                 by interpreting the software/hardware cooperation of
                 CUDA.",
  acknowledgement = ack-nhfb,
  articleno =    "60",
  fjournal =     "ACM Computing Surveys",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J204",
}

@Article{Hung:2016:EBP,
  author =       "Che-Lun Hung and Chun-Yuan Lin and Chia-Shin Ou and
                 Yuan-Hong Tseng and Po-Yen Hung and Ship-Peng Li and
                 Chun-Ting Fu",
  title =        "Efficient bit-parallel subcircuit extraction using
                 {CUDA}",
  journal =      j-CCPE,
  volume =       "28",
  number =       "16",
  pages =        "4326--4338",
  month =        nov,
  year =         "2016",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3732",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Thu Nov 17 07:11:02 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Hunold:2016:RMB,
  author =       "Sascha Hunold and Alexandra Carpen-Amarie",
  title =        "Reproducible {MPI} Benchmarking is Still Not as Easy
                 as You Think",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "27",
  number =       "12",
  pages =        "3617--3630",
  month =        dec,
  year =         "2016",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Nov 16 18:43:09 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2016/12/07426807-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Ibanez:2016:HMT,
  author =       "Dan Ibanez and Ian Dunn and Mark S. Shephard",
  title =        "Hybrid {MPI}-thread parallelization of adaptive mesh
                 operations",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "52",
  number =       "??",
  pages =        "133--143",
  month =        feb,
  year =         "2016",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Feb 12 18:56:20 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819116000041",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191/",
}

@Article{Iida:2016:GET,
  author =       "Yuki Iida and Yusuke Fujii and Takuya Azumi and
                 Nobuhiko Nishio and Shinpei Kato",
  title =        "{GPUrpc}: Exploring Transparent Access to Remote
                 {GPUs}",
  journal =      j-TECS,
  volume =       "16",
  number =       "1",
  pages =        "17:1--17:??",
  month =        nov,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2950056",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Nov 3 16:48:38 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "Graphics processing units (GPUs) are increasingly used
                 for high-performance computing. Programming frameworks
                 for general-purpose computing on GPUs (GPGPU), such as
                 CUDA and OpenCL, are also maturing. Driving this trend
                 is the recent proliferation of mobile devices such as
                 smartphones and wearable computers. These devices are
                 increasingly incorporating computationally intensive
                 applications that involve some form of environmental
                 recognition such as augmented reality (AR) or voice
                 recognition. However, devices with low computational
                 power cannot satisfy such demanding computing
                 requirements. The CPU load of these devices could be
                 reduced by offloading computation onto GPUs on the
                 cloud. This paper presents GPUrpc, a remote procedure
                 call (RPC) extension to Gdev, which is a rich set of
                 runtime libraries and device drivers for achieving
                 first-class GPU resource management. GPUrpc allows
                 developers to use CUDA for GPGPU development work.
                 Existing research uses RPCs based on the CUDA
                 application programming interfaces (APIs); hence, all
                 CUDA APIs require communication. To reduce
                 communication overhead, we use an RPC based on a
                 low-level API than CUDA API and reduced API that does
                 not require communication. Our evaluation conducted on
                 Linux and NVIDIA GPUs shows that the basic performance
                 of our prototype implementation is reliable in
                 comparison with the existing method. Evaluation using
                 the Rodinia benchmark suite designed for research in
                 heterogeneous parallel computing showed that GPUrpc is
                 effective for applications such as image processing and
                 data mining. GPUrpc also can improve power consumption
                 to approximately 1/6 that of CPU processing for
                 performing $ 512 \times 512 $ matrix multiplication.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
  remark =       "Special issue on VIPES, special issue on ICESS2015 and
                 regular papers.",
}

@Article{Ilie:2016:AEC,
  author =       "Silvana Ilie and Arne Storjohann",
  title =        "Abstracts of the {2015 East Coast Computer Algebra
                 Day}",
  journal =      j-ACM-COMM-COMP-ALGEBRA,
  volume =       "50",
  number =       "1",
  pages =        "35--39",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2930964.2930969",
  ISSN =         "1932-2232 (print), 1932-2240 (electronic)",
  ISSN-L =       "1932-2232",
  bibdate =      "Wed Apr 27 16:14:51 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigsam.bib",
  abstract =     "In the past decade, the introduction of low-level
                 heterogeneous programming models, in particular CUDA,
                 has brought supercomputing to the level of the desktop
                 computer. However, these models bring notable
                 challenges, even to expert programmers. Indeed, fully
                 exploiting the power of hardware accelerators with
                 CUDA-like code often requires significant code
                 optimization effort. While this development can
                 certainly yield high performance, it is desirable for
                 some programmers to avoid the explicit management of
                 device initialization and data transfer between memory
                 levels. To this end, high-level models for accelerator
                 programming, like OpenMP and OpenACC, have become an
                 important research direction. With these models,
                 programmers only need to annotate their C/C++ code to
                 indicate which code portion is to be executed on the
                 device and how data maps between host and device.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Communications in Computer Algebra",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1000",
}

@Article{Islam:2016:EMT,
  author =       "Tanzima Islam and Kathryn Mohror and Martin Schulz",
  title =        "Exploring the {MPI} tool information interface:
                 features and capabilities",
  journal =      j-IJHPCA,
  volume =       "30",
  number =       "2",
  pages =        "212--222",
  year =         "2016",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342015600507",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Apr 4 14:51:30 MDT 2017",
  bibsource =    "http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://journals.sagepub.com/doi/full/10.1177/1094342015600507",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "https://journals.sagepub.com/home/hpc",
  xxmonth =      may,
}

@Article{Kannan:2016:HPP,
  author =       "Ramakrishnan Kannan and Grey Ballard and Haesun Park",
  title =        "A high-performance parallel algorithm for nonnegative
                 matrix factorization",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "8",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3016078.2851152",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:12 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Non-negative matrix factorization (NMF) is the problem
                 of determining two non-negative low rank factors W and
                 H, for the given input matrix A, such that $ A \approx
                 W H $. NMF is a useful tool for many applications in
                 different domains such as topic modeling in text
                 mining, background separation in video analysis, and
                 community detection in social networks. Despite its
                 popularity in the data mining community, there is a
                 lack of efficient distributed algorithms to solve the
                 problem for big data sets. We propose a
                 high-performance distributed-memory parallel algorithm
                 that computes the factorization by iteratively solving
                 alternating non-negative least squares (NLS)
                 subproblems for W and H. It maintains the data and
                 factor matrices in memory (distributed across
                 processors), uses MPI for interprocessor communication,
                 and, in the dense case, provably minimizes
                 communication costs (under mild assumptions). As
                 opposed to previous implementations, our algorithm is
                 also flexible: (1) it performs well for both dense and
                 sparse matrices, and (2) it allows the user to choose
                 any one of the multiple algorithms for solving the
                 updates to low rank factors W and H within the
                 alternating iterations. We demonstrate the scalability
                 of our algorithm and compare it with baseline
                 implementations, showing significant performance
                 improvements.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '16 conference proceedings.",
}

@Article{Kim:2016:DOF,
  author =       "Junghyun Kim and Gangwon Jo and Jaehoon Jung and
                 Jungwon Kim and Jaejin Lee",
  title =        "A distributed {OpenCL} framework using redundant
                 computation and data replication",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "6",
  pages =        "553--569",
  month =        jun,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2980983.2908094",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Sep 5 07:32:25 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Applications written solely in OpenCL or CUDA cannot
                 execute on a cluster as a whole. Most previous
                 approaches that extend these programming models to
                 clusters are based on a common idea: designating a
                 centralized host node and coordinating the other nodes
                 with the host for computation. However, the centralized
                 host node is a serious performance bottleneck when the
                 number of nodes is large. In this paper, we propose a
                 scalable and distributed OpenCL framework called
                 SnuCL-D for large-scale clusters. SnuCL-D's remote
                 device virtualization provides an OpenCL application
                 with an illusion that all compute devices in a cluster
                 are confined in a single node. To reduce the amount of
                 control-message and data communication between nodes,
                 SnuCL-D replicates the OpenCL host program execution
                 and data in each node. We also propose a new OpenCL
                 host API function and a queueing optimization technique
                 that significantly reduce the overhead incurred by the
                 previous centralized approaches. To show the
                 effectiveness of SnuCL-D, we evaluate SnuCL-D with a
                 microbenchmark and eleven benchmark applications on a
                 large-scale CPU cluster and a medium-scale GPU
                 cluster.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '16 conference proceedings.",
}

@Article{Kobayashi:2016:HSV,
  author =       "Ryohei Kobayashi and Tomohiro Misono and Kenji Kise",
  title =        "A High-speed {Verilog} {HDL} Simulation Method using a
                 Lightweight Translator",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "26--31",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039908",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:44 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Designing with Hardware Description Languages (HDLs)
                 is still the de facto standard way to develop
                 FPGA-based custom computing systems, and RTL simulation
                 is an important step in ensuring that the designed
                 hardware behavior meets the design specification. In
                 this paper, we propose a new high-speed Verilog HDL
                 simulation method. It is based on two previously
                 proposed techniques: ArchHDL and Pyverilog. ArchHDL is
                 used as a simulation engine in the method because the
                 RTL simulation provided by ArchHDL can be parallelized
                 with OpenMP. We use Pyverilog to develop a code
                 translator to convert Verilog HDL source code into
                 ArchHDL code, and due to this, the translator can be
                 realized and its implementation is lightweight. We
                 compare the proposed method with Synopsys VCS, and the
                 experimental results show that the RTL simulation
                 behavior and speed are same as that of Synopsys VCS and
                 up to 5.8x better respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "HEART '16 conference proceedings.",
}

@Article{Koitka:2016:NGA,
  author =       "Sven Koitka and Christoph M. Friedrich",
  title =        "\pkg{nmfgpu4R}: {GPU}-Accelerated Computation of the
                 Non-Negative Matrix Factorization {(NMF)} Using {CUDA}
                 Capable Hardware",
  journal =      j-R-JOURNAL,
  volume =       "8",
  number =       "2",
  pages =        "382--392",
  month =        dec,
  year =         "2016",
  DOI =          "https://doi.org/10.32614/rj-2016-053",
  ISSN =         "2073-4859",
  ISSN-L =       "2073-4859",
  bibdate =      "Fri May 21 06:58:41 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/rjournal.bib",
  URL =          "https://journal.r-project.org/archive/2016/RJ-2016-053",
  acknowledgement = ack-nhfb,
  fjournal =     "The R Journal",
  journal-URL =  "http://journal.r-project.org/",
}

@Article{Kolesnichenko:2016:CBG,
  author =       "Alexey Kolesnichenko and Christopher M. Poskitt and
                 Sebastian Nanz and Bertrand Meyer",
  title =        "Contract-based general-purpose {GPU} programming",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "3",
  pages =        "75--84",
  month =        mar,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2936314.2814216",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Jun 9 17:13:58 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Using GPUs as general-purpose processors has
                 revolutionized parallel computing by offering, for a
                 large and growing set of algorithms, massive
                 data-parallelization on desktop machines. An obstacle
                 to widespread adoption, however, is the difficulty of
                 programming them and the low-level control of the
                 hardware required to achieve good performance. This
                 paper suggests a programming library, SafeGPU, that
                 aims at striking a balance between programmer
                 productivity and performance, by making GPU
                 data-parallel operations accessible from within a
                 classical object-oriented programming language. The
                 solution is integrated with the design-by-contract
                 approach, which increases confidence in functional
                 program correctness by embedding executable program
                 specifications into the program text. We show that our
                 library leads to modular and maintainable code that is
                 accessible to GPGPU non-experts, while providing
                 performance that is comparable with hand-written CUDA
                 code. Furthermore, runtime contract checking turns out
                 to be feasible, as the contracts can be executed on the
                 GPU.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "GPCE '15 conference proceedings.",
}

@Article{Kulkarni:2016:HAP,
  author =       "Kedar Kulkarni and Shreeya Badhe and Geetanjali
                 Gadre",
  title =        "{HCA} aware Parallel Communication Library: A
                 feasibility study for offloading {MPI} requirements",
  journal =      j-SUPERFRI,
  volume =       "3",
  number =       "3",
  pages =        "56--60",
  month =        "????",
  year =         "2016",
  CODEN =        "????",
  ISSN =         "2409-6008 (print), 2313-8734 (electronic)",
  bibdate =      "Sat Nov 11 07:15:27 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/superfri.bib",
  URL =          "http://superfri.org/superfri/article/view/109",
  acknowledgement = ack-nhfb,
  fjournal =     "Supercomputing Frontiers and Innovations",
  journal-URL =  "http://superfri.org/superfri/issue/archive",
}

@Article{Kutyniok:2016:SFD,
  author =       "Gitta Kutyniok and Wang-Q Lim and Rafael Reisenhofer",
  title =        "{ShearLab $3$D}: Faithful Digital Shearlet Transforms
                 Based on Compactly Supported Shearlets",
  journal =      j-TOMS,
  volume =       "42",
  number =       "1",
  pages =        "5:1--5:42",
  month =        feb,
  year =         "2016",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2740960",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Tue Mar 1 17:07:56 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "Wavelets and their associated transforms are highly
                 efficient when approximating and analyzing
                 one-dimensional signals. However, multivariate signals
                 such as images or videos typically exhibit curvilinear
                 singularities, which wavelets are provably deficient in
                 sparsely approximating and also in analyzing in the
                 sense of, for instance, detecting their direction.
                 Shearlets are a directional representation system
                 extending the wavelet framework, which overcomes those
                 deficiencies. Similar to wavelets, shearlets allow a
                 faithful implementation and fast associated transforms.
                 In this article, we will introduce a comprehensive
                 carefully documented software package coined ShearLab
                 3D (www.ShearLab.org) and discuss its algorithmic
                 details. This package provides MATLAB code for a novel
                 faithful algorithmic realization of the 2D and 3D
                 shearlet transform (and their inverses) associated with
                 compactly supported universal shearlet systems
                 incorporating the option of using CUDA. We will present
                 extensive numerical experiments in 2D and 3D concerning
                 denoising, inpainting, and feature extraction,
                 comparing the performance of ShearLab 3D with similar
                 transform-based algorithms such as curvelets,
                 contourlets, or surfacelets. In the spirit of
                 reproducible research, all scripts are accessible on
                 www.ShearLab.org.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Laguna:2016:EEU,
  author =       "Ignacio Laguna and David F. Richards and Todd Gamblin
                 and Martin Schulz and Bronis R. de Supinski and Kathryn
                 Mohror and Howard Pritchard",
  title =        "Evaluating and extending user-level fault tolerance in
                 {MPI} applications",
  journal =      j-IJHPCA,
  volume =       "30",
  number =       "3",
  pages =        "305--319",
  year =         "2016",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342015623623",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Apr 4 14:51:30 MDT 2017",
  bibsource =    "http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://journals.sagepub.com/doi/full/10.1177/1094342015623623",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "https://journals.sagepub.com/home/hpc",
  xxmonth =      aug,
}

@Article{Langr:2016:ASM,
  author =       "Daniel Langr and Pavel Tvrdik and Ivan Simecek",
  title =        "{AQsort}: Scalable Multi-Array In-Place Sorting with
                 {OpenMP}",
  journal =      j-SCPE,
  volume =       "17",
  number =       "4",
  pages =        "369--391",
  month =        "????",
  year =         "2016",
  CODEN =        "????",
  ISSN =         "1895-1767",
  ISSN-L =       "1895-1767",
  bibdate =      "Mon Jan 7 06:46:48 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/scpe.bib",
  URL =          "https://www.scpe.org/index.php/scpe/article/view/1207",
  acknowledgement = ack-nhfb,
  fjournal =     "Scalable Computing: Practice and Experience",
  journal-URL =  "http://www.scpe.org/",
}

@Article{Lashgar:2016:ESM,
  author =       "Ahmad Lashgar and Amirali Baniasadi",
  title =        "Employing Software-Managed Caches in {OpenACC}:
                 Opportunities and Benefits",
  journal =      j-TOMPECS,
  volume =       "1",
  number =       "1",
  pages =        "2:1--2:34",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2798724",
  ISSN =         "2376-3639 (print), 2376-3647 (electronic)",
  ISSN-L =       "2376-3639",
  bibdate =      "Thu Jun 15 12:29:10 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tompecs.bib",
  URL =          "http://dl.acm.org/citation.cfm?id=2798724",
  abstract =     "The OpenACC programming model has been developed to
                 simplify accelerator programming and improve
                 development productivity. In this article, we
                 investigate the main limitations faced by OpenACC in
                 harnessing all capabilities of GPU-like accelerators.
                 We build on our findings and discuss the opportunity to
                 exploit a software-managed cache as (i) a fast
                 communication medium and (ii) a cache for data reuse.
                 To this end, we propose a new directive and
                 communication model for OpenACC. Investigating several
                 benchmarks, we show that the proposed directive can
                 improve performance up to $ 2.54 \times $, and at the
                 cost of minor programming effort.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Modeling and Performance
                 Evaluation of Computing Systems (TOMPECS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J1525",
}

@Article{Lin:2016:VDF,
  author =       "Yu-Te Lin and Jenq-Kuen Lee",
  title =        "Vector data flow analysis for {SIMD} optimizations on
                 {OpenCL} programs",
  journal =      j-CCPE,
  volume =       "28",
  number =       "5",
  pages =        "1629--1654",
  day =          "10",
  month =        apr,
  year =         "2016",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3714",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sun Apr 3 12:34:13 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "24 Oct 2015",
}

@Article{Liu:2016:MBM,
  author =       "Weifeng Liu and Michael Gerndt and Bin Gong",
  title =        "Model-based {MPI-IO} tuning with {Periscope} tuning
                 framework",
  journal =      j-CCPE,
  volume =       "28",
  number =       "1",
  pages =        "3--20",
  month =        jan,
  year =         "2016",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3603",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Feb 9 06:13:21 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "14 Aug 2015",
}

@Article{Lobeiras:2016:DEI,
  author =       "Jacobo Lobeiras and Margarita Amor and Ramon Doallo",
  title =        "Designing Efficient Index-Digit Algorithms for {CUDA}
                 {GPU} Architectures",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "27",
  number =       "5",
  pages =        "1331--1343",
  month =        may,
  year =         "2016",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2015.2450718",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Apr 15 13:45:22 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.computer.org/csdl/trans/td/2016/05/07138631-abs.html",
  abstract-URL = "http://www.computer.org/csdl/trans/td/2016/05/07138631-abs.html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Loncar:2016:CPS,
  author =       "Vladimir Loncar and Antun Balaz and Aleksandar
                 Bogojevi{\'c} and Srdjan Skrbi{\'c} and Paulsamy
                 Muruganandam and Sadhan K. Adhikari",
  title =        "{CUDA} programs for solving the time-dependent dipolar
                 {Gross--Pitaevskii} equation in an anisotropic trap",
  journal =      j-COMP-PHYS-COMM,
  volume =       "200",
  number =       "??",
  pages =        "406--410",
  month =        mar,
  year =         "2016",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Thu Jan 21 15:04:34 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465515004361",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655/",
}

@Article{Loncar:2016:OOM,
  author =       "Vladimir Loncar and Luis E. Young-S. and Srdjan
                 Skrbi{\'c} and Paulsamy Muruganandam and Sadhan K.
                 Adhikari and Antun Balaz",
  title =        "{OpenMP}, {OpenMP\slash MPI}, and {CUDA\slash MPI} {C}
                 programs for solving the time-dependent dipolar
                 {Gross--Pitaevskii} equation",
  journal =      j-COMP-PHYS-COMM,
  volume =       "209",
  number =       "??",
  pages =        "190--196",
  month =        dec,
  year =         "2016",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Tue Oct 18 17:55:23 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465516302272",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655/",
}

@Article{Maleki:2016:HOT,
  author =       "Sepideh Maleki and Annie Yang and Martin Burtscher",
  title =        "Higher-order and tuple-based massively-parallel prefix
                 sums",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "6",
  pages =        "539--552",
  month =        jun,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2980983.2908089",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Sep 5 07:32:25 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Prefix sums are an important parallel primitive,
                 especially in massively-parallel programs. This paper
                 discusses two orthogonal generalizations thereof, which
                 we call higher-order and tuple-based prefix sums.
                 Moreover, it describes and evaluates SAM, a
                 GPU-friendly algorithm for computing prefix sums and
                 other scans that directly supports higher orders and
                 tuple values. Its templated CUDA implementation unifies
                 all of these computations in a single 100-statement
                 kernel. SAM is communication-efficient in the sense
                 that it minimizes main-memory accesses. When computing
                 prefix sums of a million or more values, it outperforms
                 Thrust and CUDPP on both a Titan X and a K40 GPU. On
                 the Titan X, SAM reaches memory-copy speeds for large
                 input sizes, which cannot be surpassed. SAM outperforms
                 CUB, the currently fastest conventional prefix sum
                 implementation, by up to a factor of 2.9 on
                 eighth-order prefix sums and by up to a factor of 2.6
                 on eight-tuple prefix sums.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '16 conference proceedings.",
}

@Article{Mallon:2016:MUB,
  author =       "Dami{\'a}n A. Mall{\'o}n and Guillermo L. Taboada and
                 Lars Koesterke",
  title =        "{MPI} and {UPC} broadcast, scatter and gather
                 algorithms in {Xeon Phi}",
  journal =      j-CCPE,
  volume =       "28",
  number =       "8",
  pages =        "2322--2340",
  day =          "10",
  month =        jun,
  year =         "2016",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3552",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Wed Jun 8 06:47:20 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Manca:2016:CQI,
  author =       "Emanuele Manca and Andrea Manconi and Alessandro Orro
                 and Giuliano Armano and Luciano Milanesi",
  title =        "{CUDA-quicksort}: an improved {GPU}-based
                 implementation of quicksort",
  journal =      j-CCPE,
  volume =       "28",
  number =       "1",
  pages =        "21--43",
  month =        jan,
  year =         "2016",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3611",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Feb 9 06:13:21 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "12 Aug 2015",
}

@Article{Marendic:2016:NMR,
  author =       "P. Marendic and J. Lemeire and D. Vucinic and P.
                 Schelkens",
  title =        "A novel {MPI} reduction algorithm resilient to
                 imbalances in process arrival times",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "72",
  number =       "5",
  pages =        "1973--2013",
  month =        may,
  year =         "2016",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-016-1707-x",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Mon May 30 09:17:38 MDT 2016",
  bibsource =    "http://link.springer.com/journal/11227/72/5;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-016-1707-x",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Book{Matloff:2016:PCD,
  author =       "Norman S. Matloff",
  title =        "Parallel Computing for Data Science: with Examples in
                 {R}, {C++} and {CUDA}",
  volume =       "28",
  publisher =    pub-CRC,
  address =      pub-CRC:adr,
  pages =        "xxiii + 324",
  year =         "2016",
  ISBN =         "1-4665-8701-6 (hardcover)",
  ISBN-13 =      "978-1-4665-8701-4 (hardcover)",
  LCCN =         "QA76.642 M37 2016",
  bibdate =      "Sat Jun 27 09:13:41 MDT 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/s-plus.bib;
                 z3950.loc.gov:7090/Voyager",
  series =       "Chapman and Hall/CRC: The R Series",
  URL =          "http://www.tandf.net/books/details/9781466587014",
  abstract =     "\booktitle{Parallel Computing for Data Science: With
                 Examples in R, C++ and CUDA} is one of the first
                 parallel computing books to concentrate exclusively on
                 parallel data structures, algorithms, software tools,
                 and applications in data science. It includes examples
                 not only from the classic ``$n$ observations, $p$
                 variables'' matrix format but also from time series,
                 network graph models, and numerous other structures
                 common in data science. The examples illustrate the
                 range of issues encountered in parallel programming.
                 With the main focus on computation, the book shows how
                 to compute on three types of platforms.",
  acknowledgement = ack-nhfb,
  subject =      "Parallel programming (Computer science); Electronic
                 data processing; R (Computer program language)",
  tableofcontents = "Preface \\
                 Author's Biography \\
                 1: Introduction to Parallel Processing in R \\
                 2: ``Why Is My Program So Slow?'': Obstacles to Speed
                 \\
                 3: Principles of Parallel Loop Scheduling \\
                 4: The Shared-Memory Paradigm: A Gentle Introduction
                 via R \\
                 5: The Shared-Memory Paradigm: C Level \\
                 6: The Shared-Memory Paradigm: GPUs \\
                 7: Thrust and Rth \\
                 8: The Message Passing Paradigm \\
                 9: MapReduce Computation \\
                 10: Parallel Sorting and Merging \\
                 11: Parallel Prefix Scan \\
                 12: Parallel Matrix Operations \\
                 13: Inherently Statistical Approaches: Subset Methods
                 \\
                 Appendix A: Review of Matrix Algebra \\
                 Appendix B: R Quick Start \\
                 Appendix C: Introduction to C for R Programmers \\
                 Back Cover",
}

@Article{Muddukrishna:2016:GGO,
  author =       "Ananya Muddukrishna and Peter A. Jonsson and Artur
                 Podobas and Mats Brorsson",
  title =        "Grain graphs: {OpenMP} performance analysis made
                 easy",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "8",
  pages =        "28:1--28:??",
  month =        aug,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3016078.2851156",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:12 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Average programmers struggle to solve performance
                 problems in OpenMP programs with tasks and parallel
                 for-loops. Existing performance analysis tools
                 visualize OpenMP task performance from the runtime
                 system's perspective where task execution is
                 interleaved with other tasks in an unpredictable order.
                 Problems with OpenMP parallel for-loops are similarly
                 difficult to resolve since tools only visualize
                 aggregate thread-level statistics such as load
                 imbalance without zooming into a per-chunk granularity.
                 The runtime system/threads oriented visualization
                 provides poor support for understanding problems with
                 task and chunk execution time, parallelism, and memory
                 hierarchy utilization, forcing average programmers to
                 rely on experts or use tedious trial-and-error tuning
                 methods for performance. We present grain graphs, a new
                 OpenMP performance analysis method that visualizes
                 grains --- computation performed by a task or a
                 parallel for-loop chunk instance --- and highlights
                 problems such as low parallelism, work inflation and
                 poor parallelization benefit at the grain level. We
                 demonstrate that grain graphs can quickly reveal
                 performance problems that are difficult to detect and
                 characterize in fine detail using existing
                 visualizations in standard OpenMP programs, simplifying
                 OpenMP performance analysis. This enables average
                 programmers to make portable optimizations for poor
                 performing OpenMP programs, reducing pressure on
                 experts and removing the need for tedious
                 trial-and-error tuning.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '16 conference proceedings.",
}

@Misc{Munshi:2016:OCS,
  author =       "Aaftab Munshi and Lee Howes and Bartosz Sochacki and
                 {Khronos OpenCL Working Group}",
  title =        "The {OpenCL} {C} Specification Version: 2.0 Document
                 Revision: 33",
  howpublished = "Web document.",
  pages =        "205",
  day =          "13",
  month =        apr,
  year =         "2016",
  bibdate =      "Mon Apr 16 14:05:49 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/elefunt.bib;
                 http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.khronos.org/registry/OpenCL/specs/opencl-2.0-openclc.pdf",
  acknowledgement = ack-nhfb,
  remark =       "Section 6.1.3.2 Math Functions, pages 74ff, defines a
                 function repertoire extended beyond that of ISO C,
                 including {\tt acospi}, {\tt asinpi}, {\tt atanpi},
                 {\tt atan2pi}, {\tt cospi}, {\tt sinpi}, {\tt tanpi},
                 {\tt fract}, {\tt lgamma\_r}, {\tt mad} (approximation
                 to {\tt a * b + c}), {\tt minmag}, {\tt pown}, {\tt
                 rootn}, and {\tt sincos}.",
}

@Article{Nadal-Serrano:2016:PSC,
  author =       "Jose M. Nadal-Serrano and Marisa Lopez-Vallejo",
  title =        "A Performance Study of {CUDA UVM} versus Manual
                 Optimizations in a Real-World Setup: Application to a
                 {Monte Carlo} Wave-Particle Event-Based Interaction
                 Model",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "27",
  number =       "6",
  pages =        "1579--1588",
  month =        jun,
  year =         "2016",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2015.2463813",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Tue Jun 14 09:25:28 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://csdl.computer.org/csdl/trans/td/2016/06/07175058-abs.html",
  abstract-URL = "http://csdl.computer.org/csdl/trans/td/2016/06/07175058-abs.html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Naumenko:2016:ACT,
  author =       "Mikhail A. Naumenko and Vyacheslav V. Samarin",
  title =        "Application of {CUDA} technology to calculation of
                 ground states of few-body nuclei by {Feynman}'s
                 continual integrals method",
  journal =      j-SUPERFRI,
  volume =       "3",
  number =       "2",
  pages =        "80--95",
  month =        "????",
  year =         "2016",
  CODEN =        "????",
  ISSN =         "2409-6008 (print), 2313-8734 (electronic)",
  bibdate =      "Sat Nov 11 07:15:27 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/superfri.bib",
  URL =          "http://superfri.org/superfri/article/view/102",
  acknowledgement = ack-nhfb,
  fjournal =     "Supercomputing Frontiers and Innovations",
  journal-URL =  "http://superfri.org/superfri/issue/archive",
}

@Article{Nogueira:2016:BBW,
  author =       "David Nogueira and Pedro Tomas and Nuno Roma",
  title =        "{BowMapCL}: {Burrows--Wheeler} Mapping on Multiple
                 Heterogeneous Accelerators",
  journal =      j-TCBB,
  volume =       "13",
  number =       "5",
  pages =        "926--938",
  month =        sep,
  year =         "2016",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2015.2495149",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Fri Dec 30 16:19:30 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  abstract =     "The computational demand of exact-search procedures
                 has pressed the exploitation of parallel processing
                 accelerators to reduce the execution time of many
                 applications. However, this often imposes strict
                 restrictions in terms of the problem size and
                 implementation efforts, mainly due to their possibly
                 distinct architectures. To circumvent this limitation,
                 a new exact-search alignment tool BowMapCL based on the
                 Burrows--Wheeler Transform and FM-Index is presented.
                 Contrasting to other alternatives, BowMapCL is based on
                 a unified implementation using OpenCL, allowing the
                 exploitation of multiple and possibly different devices
                 e.g., NVIDIA, AMD/ATI, and Intel GPUs/APUs.
                 Furthermore, to efficiently exploit such heterogeneous
                 architectures, BowMapCL incorporates several techniques
                 to promote its performance and scalability, including
                 multiple buffering, work-queue task-distribution, and
                 dynamic load-balancing, together with index
                 partitioning, bit-encoding, and sampling. When compared
                 with state-of-the-art tools, the attained results
                 showed that BowMapCL using a single GPU is $ 2 \times $
                 to $ 7.5 \times $ faster than mainstream multi-threaded
                 CPU BWT-based aligners, like Bowtie, BWA, and SOAP2;
                 and up to $ 4 \times $ faster than the best performing
                 state-of-the-art GPU implementations namely, SOAP3 and
                 HPG-BWT. When multiple and completely distinct devices
                 are considered, BowMapCL efficiently scales the offered
                 throughput, ensuring a convenient load-balance of the
                 involved processing in the several distinct devices.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J954",
}

@Article{Oger:2016:DMM,
  author =       "G. Oger and D. {Le Touz{\'e}} and D. Guibert and M. de
                 Leffe and J. Biddiscombe and J. Soumagne and J.-G.
                 Piccinali",
  title =        "On distributed memory {MPI}-based parallelization of
                 {SPH} codes in massive {HPC} context",
  journal =      j-COMP-PHYS-COMM,
  volume =       "200",
  number =       "??",
  pages =        "1--14",
  month =        mar,
  year =         "2016",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Thu Jan 21 15:04:34 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465515003070",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655/",
}

@Article{Otten:2016:MOI,
  author =       "Matthew Otten and Jing Gong and Azamat Mametjanov and
                 Aaron Vose and John Levesque and Paul Fischer and Misun
                 Min",
  title =        "An {MPI\slash OpenACC} implementation of a high-order
                 electromagnetics solver with {GPUDirect}
                 communication",
  journal =      j-IJHPCA,
  volume =       "30",
  number =       "3",
  pages =        "320--334",
  month =        aug,
  year =         "2016",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Apr 4 14:51:30 MDT 2017",
  bibsource =    "http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "https://journals.sagepub.com/home/hpc",
}

@Article{Pai:2016:CTO,
  author =       "Sreepathi Pai and Keshav Pingali",
  title =        "A compiler for throughput optimization of graph
                 algorithms on {GPUs}",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "10",
  pages =        "1--19",
  month =        oct,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3022671.2984015",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:13 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Writing high-performance GPU implementations of graph
                 algorithms can be challenging. In this paper, we argue
                 that three optimizations called throughput
                 optimizations are key to high-performance for this
                 application class. These optimizations describe a large
                 implementation space making it unrealistic for
                 programmers to implement them by hand. To address this
                 problem, we have implemented these optimizations in a
                 compiler that produces CUDA code from an
                 intermediate-level program representation called IrGL.
                 Compared to state-of-the-art handwritten CUDA
                 implementations of eight graph applications, code
                 generated by the IrGL compiler is up to 5.95x times
                 faster (median 1.4x) for five applications and never
                 more than 30\% slower for the others. Throughput
                 optimizations contribute an improvement up to 4.16x
                 (median 1.4x) to the performance of unoptimized IrGL
                 code.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '16 conference proceedings.",
}

@Article{Pang:2016:MKR,
  author =       "Yeyong Pang and Shaojun Wang and Yu Peng and Xiyuan
                 Peng and Nicholas J. Fraser and Philip H. W. Leong",
  title =        "A Microcoded Kernel Recursive Least Squares Processor
                 Using {FPGA} Technology",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2950061",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Kernel methods utilize linear methods in a nonlinear
                 feature space and combine the advantages of both.
                 Online kernel methods, such as kernel recursive least
                 squares (KRLS) and kernel normalized least mean squares
                 (KNLMS), perform nonlinear regression in a recursive
                 manner, with similar computational requirements to
                 linear techniques. In this article, an architecture for
                 a microcoded kernel method accelerator is described,
                 and high-performance implementations of sliding-window
                 KRLS, fixed-budget KRLS, and KNLMS are presented. The
                 architecture utilizes pipelining and vectorization for
                 performance, and microcoding for reusability. The
                 design can be scaled to allow tradeoffs between
                 capacity, performance, and area. The design is compared
                 with a central processing unit (CPU), digital signal
                 processor (DSP), and Altera OpenCL implementations. In
                 different configurations on an Altera Arria 10 device,
                 our SW-KRLS implementation delivers floating-point
                 throughput of approximately 16 GFLOPs, latency of 5.5 $
                 \mu $ s, and energy consumption of $ 10^{- 4} $ J,
                 these being improvements over a CPU by factors of 12,
                 17, and 24, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Peraza:2016:PGQ,
  author =       "Joshua Peraza and Ananta Tiwari and Michael Laurenzano
                 and Laura Carrington and Allan Snavely",
  title =        "{PMaC}'s green queue: a framework for selecting energy
                 optimal {DVFS} configurations in large scale {MPI}
                 applications",
  journal =      j-CCPE,
  volume =       "28",
  number =       "2",
  pages =        "211--231",
  month =        feb,
  year =         "2016",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3184",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Feb 9 06:13:21 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "27 Dec 2013",
}

@Article{Pirk:2016:VVA,
  author =       "Holger Pirk and Oscar Moll and Matei Zaharia and Sam
                 Madden",
  title =        "{Voodoo} --- a vector algebra for portable database
                 performance on modern hardware",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "9",
  number =       "14",
  pages =        "1707--1718",
  month =        oct,
  year =         "2016",
  CODEN =        "????",
  ISSN =         "2150-8097",
  bibdate =      "Wed Oct 12 10:14:56 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In-memory databases require careful tuning and many
                 engineering tricks to achieve good performance. Such
                 database performance engineering is hard: a plethora of
                 data and hardware-dependent optimization techniques
                 form a design space that is difficult to navigate for a
                 skilled engineer --- even more so for a query compiler.
                 To facilitate performance-oriented design exploration
                 and query plan compilation, we present Voodoo, a
                 declarative intermediate algebra that abstracts the
                 detailed architectural properties of the hardware, such
                 as multi- or many-core architectures, caches and SIMD
                 registers, without losing the ability to generate
                 highly tuned code. Because it consists of a collection
                 of declarative, vector-oriented operations, Voodoo is
                 easier to reason about and tune than low-level C and
                 related hardware-focused extensions (Intrinsics,
                 OpenCL, CUDA, etc.). This enables our Voodoo compiler
                 to produce (OpenCL) code that rivals and even
                 outperforms the fastest state-of-the-art in-memory
                 databases for both GPUs and CPUs. In addition, Voodoo
                 makes it possible to express techniques as diverse as
                 cache-conscious processing, predication and
                 vectorization (again on both GPUs and CPUs) with just a
                 few lines of code. Central to our approach is a novel
                 idea we termed control vectors, which allows a code
                 generating frontend to expose parallelism to the Voodoo
                 compiler in an abstract manner, enabling portable
                 performance across hardware platforms. We used Voodoo
                 to build an alternative backend for MonetDB, a popular
                 open-source in-memory database. Our backend allows
                 MonetDB to perform at the same level as highly tuned
                 in-memory databases, including HyPeR and Ocelot. We
                 also demonstrate Voodoo's usefulness when investigating
                 hardware conscious tuning techniques, assessing their
                 performance on different queries, devices and data.",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Prabhakar:2016:GCH,
  author =       "Raghu Prabhakar and David Koeplinger and Kevin J.
                 Brown and HyoukJoong Lee and Christopher {De Sa} and
                 Christos Kozyrakis and Kunle Olukotun",
  title =        "Generating Configurable Hardware from Parallel
                 Patterns",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "4",
  pages =        "651--665",
  month =        apr,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2954679.2872415",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Jun 9 17:13:59 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "In recent years the computing landscape has seen an
                 increasing shift towards specialized accelerators.
                 Field programmable gate arrays (FPGAs) are particularly
                 promising for the implementation of these accelerators,
                 as they offer significant performance and energy
                 improvements over CPUs for a wide class of applications
                 and are far more flexible than fixed-function ASICs.
                 However, FPGAs are difficult to program. Traditional
                 programming models for reconfigurable logic use
                 low-level hardware description languages like Verilog
                 and VHDL, which have none of the productivity features
                 of modern software languages but produce very efficient
                 designs, and low-level software languages like C and
                 OpenCL coupled with high-level synthesis (HLS) tools
                 that typically produce designs that are far less
                 efficient. Functional languages with parallel patterns
                 are a better fit for hardware generation because they
                 provide high-level abstractions to programmers with
                 little experience in hardware design and avoid many of
                 the problems faced when generating hardware from
                 imperative languages. In this paper, we identify two
                 important optimizations for using parallel patterns to
                 generate efficient hardware: tiling and metapipelining.
                 We present a general representation of tiled parallel
                 patterns, and provide rules for automatically tiling
                 patterns and generating metapipelines. We demonstrate
                 experimentally that these optimizations result in
                 speedups up to 39.4$ \times $ on a set of benchmarks
                 from the data analytics domain.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '16 conference proceedings.",
}

@Article{Prades:2016:CAX,
  author =       "Javier Prades and Carlos Rea{\~n}o and Federico
                 Silla",
  title =        "{CUDA} acceleration for {Xen} virtual machines in
                 {InfiniBand} clusters with {rCUDA}",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "8",
  pages =        "35:1--35:??",
  month =        aug,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3016078.2851181",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:12 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Many data centers currently use virtual machines (VMs)
                 to achieve a more efficient usage of hardware
                 resources. However, current virtualization solutions,
                 such as Xen, do not easily provide graphics processing
                 unit (GPU) accelerators to applications running in the
                 virtualized domain with the flexibility usually
                 required in data centers (i.e., managing virtual GPU
                 instances and concurrently sharing them among several
                 VMs). Remote GPU virtualization frameworks such as the
                 rCUDA solution may address this problem. In this work
                 we analyze the use of the rCUDA framework to accelerate
                 scientific applications running inside Xen VMs. Results
                 show that the use of the rCUDA framework is a feasible
                 approach, featuring a very low overhead if an
                 InfiniBand fabric is already present in the cluster.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '16 conference proceedings.",
}

@Article{Rehman:2016:VMJ,
  author =       "Waqas Ur Rehman and Muhammad Sohaib Ayub and Junaid
                 Haroon Siddiqui",
  title =        "Verification of {MPI} {Java} programs using software
                 model checking",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "8",
  pages =        "55:1--55:??",
  month =        aug,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3016078.2851192",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:12 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Development of concurrent software requires the
                 programmer to be aware of non-determinism, data races,
                 and deadlocks. MPI (message passing interface) is a
                 popular standard for writing message oriented
                 distributed applications. Some messages in MPI systems
                 can be processed by one of the many machines and in
                 many possible orders. This non-determinism can affect
                 the result of an MPI application. The alternate results
                 may or may not be correct. To verify MPI applications,
                 we need to check all these possible orderings and use
                 an application specific oracle to decide if these
                 orderings give correct output. MPJ Express is an open
                 source Java implementation of the MPI standard. We
                 developed a Java based model of MPJ Express, where
                 processes are modeled as threads, and which can run
                 unmodified MPI Java programs on a single system. This
                 enabled us to adapt the Java PathFinder explicit state
                 software model checker (JPF) using a custom listener to
                 verify our model running real MPI Java programs. We
                 evaluated our approach using small examples where model
                 checking revealed message orders that would result in
                 incorrect system behavior.",
  acknowledgement = ack-nhfb,
  articleno =    "55",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '16 conference proceedings.",
}

@Article{Rico-Gallego:2016:EIL,
  author =       "Juan-Antonio Rico-Gallego and Juan-Carlos
                 D{\'\i}az-Mart{\'\i}n and Alexey L. Lastovetsky",
  title =        "Extending {$ \tau $}-Lop to model concurrent {MPI}
                 communications in multicore clusters",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "61",
  number =       "??",
  pages =        "66--82",
  month =        aug,
  year =         "2016",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Wed Apr 27 09:38:59 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167739X16300346",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X/",
}

@Article{Sandes:2016:CIS,
  author =       "Edans Flavius de Oliveira Sandes and Guillermo Miranda
                 and Xavier Martorell and Eduard Ayguade and George
                 Teodoro and Alba Cristina Magalhaes Melo",
  title =        "{CUDAlign 4.0}: Incremental Speculative Traceback for
                 Exact Chromosome-Wide Alignment in {GPU} Clusters",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "27",
  number =       "10",
  pages =        "2838--2850",
  month =        oct,
  year =         "2016",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2016.2515597",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Tue Sep 13 06:32:59 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2016/10/07374729-abs.html",
  abstract-URL = "https://www.computer.org/csdl/trans/td/2016/10/07374729-abs.html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Sandes:2016:MMA,
  author =       "Edans F. De O. Sandes and Guillermo Miranda and Xavier
                 Martorell and Eduard Ayguade and George Teodoro and
                 Alba C. M. A. {De Melo}",
  title =        "{MASA}: a Multiplatform Architecture for Sequence
                 Aligners with Block Pruning",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "28:1--28:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858656",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Biological sequence alignment is a very popular
                 application in Bioinformatics, used routinely
                 worldwide. Many implementations of biological sequence
                 alignment algorithms have been proposed for multicores,
                 GPUs, FPGAs and CellBEs. These implementations are
                 platform-specific; porting them to other systems
                 requires considerable programming effort. This article
                 proposes and evaluates MASA, a flexible and
                 customizable software architecture that enables the
                 execution of biological sequence alignment applications
                 with three variants (local, global, and semiglobal) in
                 multiple hardware/software platforms with block
                 pruning, which is able to reduce significantly the
                 amount of data processed. To attain our flexibility
                 goals, we also propose a generic version of block
                 pruning and developed multiple parallelization
                 strategies as building blocks, including a new
                 asynchronous dataflow-based parallelization, which may
                 be combined to implement efficient aligners in
                 different platforms. We provide four MASA aligner
                 implementations for multicores (OmpSs and OpenMP), GPU
                 (CUDA), and Intel Phi (OpenMP), showing that MASA is
                 very flexible. The evaluation of our generic block
                 pruning strategy shows that it significantly
                 outperforms the previously proposed block pruning,
                 being able to prune up to 66.5\% of the cells when
                 using the new dataflow-based parallelization
                 strategy.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Sataric:2016:HOM,
  author =       "Bogdan Satari{\'c} and Vladimir Slavni{\'c} and
                 Aleksandar Beli{\'c} and Antun Balaz and Paulsamy
                 Muruganandam and Sadhan K. Adhikari",
  title =        "Hybrid {OpenMP\slash MPI} programs for solving the
                 time-dependent {Gross--Pitaevskii} equation in a fully
                 anisotropic trap",
  journal =      j-COMP-PHYS-COMM,
  volume =       "200",
  number =       "??",
  pages =        "411--417",
  month =        mar,
  year =         "2016",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Thu Jan 21 15:04:34 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465515004440",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655/",
}

@Article{Schenck:2016:EPM,
  author =       "Wolfram Schenck and Salem {El Sayed} and Maciej
                 Foszczynski and Wilhelm Homberg and Dirk Pleiter",
  title =        "Evaluation and Performance Modeling of a Burst Buffer
                 Solution",
  journal =      j-OPER-SYS-REV,
  volume =       "50",
  number =       "3",
  pages =        "12--26",
  month =        dec,
  year =         "2016",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/3041710.3041714",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Thu Feb 9 10:38:58 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/opersysrev.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Hierarchical storage architectures are required to
                 meet both, capacity and bandwidth requirements for
                 future high-end storage architectures. In this paper we
                 present the results of an evaluation of an emerging
                 technology, DataDirect Networks' (DDN) Infinite Memory
                 Engine (IME). IME allows to realize a fast buffer in
                 front of a large capacity storage system. We collected
                 benchmarking data with IOR and with the HPC application
                 NEST. The IOR bandwidth results show how well network
                 bandwidth towards such fast buffer can be exploited
                 compared to the external storage system. The NEST
                 benchmarks clearly demonstrate that IME can reduce
                 I/O-induced load imbalance between MPI ranks to a
                 minimum while speeding up I/O as a whole by a
                 considerable factor. In addition to these direct
                 measurements, a performance model for NEST is
                 developed. In combination with a generic and abstract
                 burst buffer architecture, this model generates
                 predictions about appropriate burst buffer and I/O
                 parameters to achieve specific performance goals for
                 NEST on HPC clusters of varying size. Specifically, it
                 is investigated in which parameter range burst buffers
                 are able to counteract the widening performance gap
                 between compute and I/O.",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J597",
}

@Article{Soldado:2016:ECM,
  author =       "F{\'a}bio Soldado and Fernando Alexandre and Herv{\'e}
                 Paulino",
  title =        "Execution of compound multi-kernel {OpenCL}
                 computations in {multi-CPU\slash multi-GPU}
                 environments",
  journal =      j-CCPE,
  volume =       "28",
  number =       "3",
  pages =        "768--787",
  day =          "10",
  month =        mar,
  year =         "2016",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3612",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Feb 9 06:13:22 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "28 Aug 2015",
}

@Article{Sorensen:2016:EER,
  author =       "Tyler Sorensen and Alastair F. Donaldson",
  title =        "Exposing errors related to weak memory in {GPU}
                 applications",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "6",
  pages =        "100--113",
  month =        jun,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2980983.2908114",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Sep 5 07:32:25 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We present the systematic design of a testing
                 environment that uses stressing and fuzzing to reveal
                 errors in GPU applications that arise due to weak
                 memory effects. We evaluate our approach on seven GPUs
                 spanning three Nvidia architectures, across ten CUDA
                 applications that use fine-grained concurrency. Our
                 results show that applications that rarely or never
                 exhibit errors related to weak memory when executed
                 natively can readily exhibit these errors when executed
                 in our testing environment. Our testing environment
                 also provides a means to help identify the root causes
                 of such errors, and automatically suggests how to
                 insert fences that harden an application against weak
                 memory bugs. To understand the cost of GPU fences, we
                 benchmark applications with fences provided by the
                 hardening strategy as well as a more conservative,
                 sound fencing strategy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '16 conference proceedings.",
}

@Article{Sorensen:2016:PIW,
  author =       "Tyler Sorensen and Alastair F. Donaldson and Mark
                 Batty and Ganesh Gopalakrishnan and Zvonimir
                 Rakamari{\'c}",
  title =        "Portable inter-workgroup barrier synchronisation for
                 {GPUs}",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "10",
  pages =        "39--58",
  month =        oct,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3022671.2984032",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:13 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Despite the growing popularity of GPGPU programming,
                 there is not yet a portable and formally-specified
                 barrier that one can use to synchronise across
                 workgroups. Moreover, the occupancy-bound execution
                 model of GPUs breaks assumptions inherent in
                 traditional software execution barriers, exposing them
                 to deadlock. We present an occupancy discovery protocol
                 that dynamically discovers a safe estimate of the
                 occupancy for a given GPU and kernel, allowing for a
                 starvation-free (and hence, deadlock-free)
                 inter-workgroup barrier by restricting the number of
                 workgroups according to this estimate. We implement
                 this idea by adapting an existing, previously
                 non-portable, GPU inter-workgroup barrier to use OpenCL
                 2.0 atomic operations, and prove that the barrier meets
                 its natural specification in terms of synchronisation.
                 We assess the portability of our approach over eight
                 GPUs spanning four vendors, comparing the performance
                 of our method against alternative methods. Our key
                 findings include: (1)~the recall of our discovery
                 protocol is nearly 100\%; (2)~runtime comparisons
                 vary substantially across GPUs and applications; and
                 (3)~our method provides portable and safe
                 inter-workgroup synchronisation across the applications
                 we study.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '16 conference proceedings.",
}

@Article{Tampouratzis:2016:AIH,
  author =       "Nikolaos Tampouratzis and Pavlos M. Mattheakis and
                 Ioannis Papaefstathiou",
  title =        "Accelerating Intercommunication in Highly Parallel
                 Systems",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "40:1--40:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3005717",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Every HPC system consists of numerous processing nodes
                 interconnected using a number of different inter-process
                 communication protocols such as Message Passing
                 Interface (MPI) and Global Arrays (GA). Traditionally,
                 research has focused on optimizing these protocols and
                 identifying the most suitable ones for each system
                 and/or application. Recently, there has been a proposal
                 to unify the primitive operations of the different
                 inter-processor communication protocols through the
                 Portals library. Portals offer a set of low-level
                 communication routines which can be composed in order
                 to implement the functionality of different
                 intercommunication protocols. However, Portals
                 modularity comes at a performance cost, since it adds
                 one more layer in the actual protocol implementation.
                 This work aims at closing the performance gap between a
                 generic and reusable intercommunication layer, such as
                 Portals, and the several monolithic and highly
                 optimized intercommunication protocols. This is
                 achieved through the development of a novel hardware
                 offload engine efficiently implementing the basic
                 Portals' modules. Our innovative system is up to two
                 orders of magnitude faster than the conventional
                 software implementation of Portals' while the speedup
                 achieved over the conventional monolithic software
                 implementations of MPI and GAs is more than an order of
                 magnitude. The power consumption of our hardware system
                 is less than 1/100th of what a low-power CPU consumes
                 when executing the Portal's software while its silicon
                 cost is less than 1/10th of that of a very simple RISC
                 CPU. Moreover, our design process is also innovative
                 since we have first modeled the hardware within an
                 untimed virtual prototype which allowed for rapid
                 design space exploration; then we applied a novel
                 methodology to transform the untimed description into
                 an efficient timed hardware description, which was then
                 transformed into a hardware netlist through a
                 High-Level Synthesis (HLS) tool.",
  acknowledgement = ack-nhfb,
  articleno =    "40",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Tang:2016:AKM,
  author =       "Qing Y. Tang and Mohammed A. S. Khalid",
  title =        "Acceleration of $k$-Means Algorithm Using {Altera SDK}
                 for {OpenCL}",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2964910",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A K-means clustering algorithm involves partitioning
                 of data iteratively into k clusters. It is one of the
                 most popular data-mining algorithms [Wu et al. 2007],
                 and is widely used in other applications, such as image
                 processing and machine learning. However, k-means is
                 highly time-consuming when data or cluster size is
                 large. Traditionally, FPGAs have shown great promise
                 for accelerating computationally intensive algorithms,
                 but they are harder to use for acceleration if we rely
                 on traditional HDL-based design methods. The recent
                 introduction of Altera SDK for the OpenCL high-level
                 synthesis tool allows developers to utilize FPGA's
                 potential without long development periods and
                 extensive hardware knowledge. This article presents an
                 optimized implementation of a k-means clustering
                 algorithm on an FPGA using Altera SDK for OpenCL.
                 Performance and power consumption is measured with
                 various data, cluster, and dimension sizes. When
                 compared to state-of-the-art solutions, this
                 implementation supports larger cluster sizes, offers up
                 to 21x speed over a CPU and is more power efficient
                 than a GPU. Unlike previous implementations, it can
                 deliver consistently high throughput across large or
                 small feature dimensions given reasonable cluster sizes
                 and large enough data size.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@article{Vega-Gisbert:2016:DIJ,
  author =       {Oscar Vega-Gisbert and Jose E. Roman and Jeffrey M.
                 Squyres},
  title =        {Design and implementation of {Java} bindings in {Open
                 MPI}},
  journal =      j-PARALLEL-COMPUTING,
  fjournal =     {Parallel Computing},
  journal-url =  {http://www.sciencedirect.com/science/journal/01678191/},
  volume =       {59},
  number =       {??},
  pages =        {1--20},
  month =        nov,
  year =         {2016},
  url =          {http://www.sciencedirect.com/science/article/pii/S0167819116300758},
  issn =         {0167-8191 (print), 1872-7336 (electronic)},
  issn-l =       {0167-8191},
  coden =        {PACOEJ},
  acknowledgement = ack-nhfb,
  bibdate =      {Sat Nov 26 12:06:01 MST 2016},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@article{Wang:2016:LLA,
  author =       {Jin Wang and Norm Rubin and Albert Sidelnik and
                 Sudhakar Yalamanchili},
  title =        {{LaPerm}: locality aware scheduler for dynamic
                 parallelism on {GPUs}},
  journal =      j-COMP-ARCH-NEWS,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J89},
  volume =       {44},
  number =       {3},
  pages =        {583--595},
  month =        jun,
  year =         {2016},
  doi =          {https://doi.org/10.1145/3007787.3001199},
  issn =         {0163-5964 (print), 1943-5851 (electronic)},
  issn-l =       {0163-5964},
  coden =        {CANED2},
  abstract =     {Recent developments in GPU execution models and
                 architectures have introduced dynamic parallelism to
                 facilitate the execution of irregular applications
                 where control flow and memory behavior can be
                 unstructured, time-varying, and hierarchical. The
                 changes brought about by this extension to the
                 traditional bulk synchronous parallel (BSP) model also
                 creates new challenges in exploiting the current GPU
                 memory hierarchy. One of the major challenges is that
                 the reference locality that exists between the parent
                 and child thread blocks (TBs) created during dynamic
                 nested kernel and thread block launches cannot be fully
                 leveraged using the current TB scheduling strategies.
                 These strategies were designed for the current
                 implementations of the BSP model but fall short when
                 dynamic parallelism is introduced since they are
                 oblivious to the hierarchical reference locality. We
                 propose LaPerm, a new locality-aware TB scheduler that
                 exploits such parent-child locality, both spatial and
                 temporal. LaPerm adopts three different scheduling
                 decisions to (i) prioritize the execution of the child
                 TBs, (ii) bind them to the stream multiprocessors (SMXs)
                 occupied by their parents TBs, and (iii) maintain
                 workload balance across compute units. Experiments with
                 a set of irregular CUDA applications executed on a
                 cycle-level simulator employing dynamic parallelism
                 demonstrate that LaPerm is able to achieve an average
                 of 27\% performance improvement over the baseline
                 round-robin TB scheduler commonly used in modern
                 GPUs.},
  remark =       {ISCA '16 conference proceedings.},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Jan 12 18:43:43 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib},
}

@Article{Wang:2016:MMF,
  author =       "Zeke Wang and Shuhao Zhang and Bingsheng He and Wei
                 Zhang",
  title =        "{Melia}: A {MapReduce} Framework on {OpenCL}-Based
                 {FPGAs}",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "27",
  number =       "12",
  pages =        "3547--3560",
  month =        dec,
  year =         "2016",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Nov 16 18:43:09 MST 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2016/12/07425227-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@article{Witchel:2016:PPW,
  author =       {Emmett Witchel},
  title =        {Programmer Productivity in a World of Mushy
                 Interfaces: Challenges of the Post-{ISA} Reality},
  journal =      j-OPER-SYS-REV,
  fjournal =     {Operating Systems Review},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J597},
  volume =       {50},
  number =       {2},
  pages =        {591--591},
  month =        jun,
  year =         {2016},
  doi =          {https://doi.org/10.1145/2954680.2876511},
  issn =         {0163-5980 (print), 1943-586X (electronic)},
  issn-l =       {0163-5980},
  coden =        {OSRED8},
  abstract =     {Since 1964, we had the notion that the instruction set
                 architecture (ISA) is a useful and fairly opaque
                 abstraction layer between hardware and software.
                 Software rode hardware's performance wave while
                 remaining gloriously oblivious to hardware's growing
                 complexity. Unfortunately, the jig is up. We still have
                 ISAs, but the abstraction no longer offers seamless
                 portability---parallel software needs to be tuned for
                 different core counts, and heterogeneous processing
                 elements (CPUs, GPUs, accelerators) further complicate
                 programmability. We are better at building large-scale
                 heterogeneous processors than we are at programming
                 them. Maintaining software across multiple current
                 platforms is difficult and porting to future platforms
                 is also difficult. There have been many technical
                 responses: virtual ISAs (e.g., NVIDIA's PTX),
                 higher-level programming interfaces (e.g., CUDA or
                 OpenCL), and late-stage compilation and
                 platform-specific tailoring (e.g., Android ART), etc. A
                 team of opinionated experts, drawn from the three
                 ASPLOS communities will examine the problem of
                 programmer productivity in the post-ISA world, first
                 from the perspective of their area of expertise and
                 then noting the contributions from the other two
                 communities. What research will save us and how? This
                 wide-ranging debate will frame important research areas
                 for future work while being grounded in frank
                 discussion about what has succeeded in the past.
                 Attendees can expect actionable insight into important
                 research issues as well an entertaining discussion.},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Jun 9 17:03:34 MDT 2016},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/opersysrev.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@article{Yang:2016:HTM,
  author =       {Fan Yang and Jinfeng Li and James Cheng},
  title =        {{Husky}: towards a more efficient and expressive
                 distributed computing framework},
  journal =      j-PROC-VLDB-ENDOWMENT,
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-url =  {http://portal.acm.org/citation.cfm?id=J1174},
  volume =       {9},
  number =       {5},
  pages =        {420--431},
  month =        jan,
  year =         {2016},
  issn =         {2150-8097},
  coden =        {????},
  abstract =     {Finding efficient, expressive and yet intuitive
                 programming models for data-parallel computing system
                 is an important and open problem. Systems like Hadoop
                 and Spark have been widely adopted for massive data
                 processing, as coarse-grained primitives like map and
                 reduce are succinct and easy to master. However,
                 sometimes over-simplified API hinders programmers from
                 more fine-grained control and designing more efficient
                 algorithms. Developers may have to resort to
                 sophisticated domain-specific languages (DSLs), or even
                 low-level layers like MPI, but this raises development
                 cost---learning many mutually exclusive systems
                 prolongs the development schedule, and the use of
                 low-level tools may result in bug-prone programming.
                 This motivated us to start the Husky open-source
                 project, which is an attempt to strike a better balance
                 between high performance and low development cost.
                 Husky is developed mainly for in-memory large scale
                 data mining, and also serves as a general research
                 platform for designing efficient distributed
                 algorithms. We show that many existing frameworks can
                 be easily implemented and bridged together inside
                 Husky, and Husky is able to achieve similar or even
                 better performance compared with domain-specific
                 systems.},
  acknowledgement = ack-nhfb,
  bibdate =      {Mon Jan 11 17:54:24 MST 2016},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/vldbe.bib},
}

@Article{Young-S:2016:OFP,
  author =       "Luis E. Young-S. and Du{\v{s}}an Vudragovi{\'c} and
                 Paulsamy Muruganandam and Sadhan K. Adhikari and Antun
                 Bala{\v{z}}",
  title =        "{OpenMP Fortran} and {C} programs for solving the
                 time-dependent {Gross--Pitaevskii} equation in an
                 anisotropic trap",
  journal =      j-COMP-PHYS-COMM,
  volume =       "204",
  number =       "??",
  pages =        "209--213",
  month =        jul,
  year =         "2016",
  CODEN =        "CPHCBZ",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Fri May 13 19:25:21 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/fortran2.bib;
                 http://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S001046551630073X",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655/",
}

@article{Zaza:2016:CBP,
  author =       {Ayham Zaza and Abeeb A. Awotunde and Faisal A. Fairag
                 and Mayez A. Al-Mouhamed},
  title =        {A {CUDA} based parallel multi-phase oil reservoir
                 simulator},
  journal =      j-COMP-PHYS-COMM,
  fjournal =     {Computer Physics Communications},
  journal-url =  {http://www.sciencedirect.com/science/journal/00104655/},
  volume =       {206},
  number =       {??},
  pages =        {2--16},
  month =        sep,
  year =         {2016},
  url =          {http://www.sciencedirect.com/science/article/pii/S0010465516300996},
  issn =         {0010-4655 (print), 1879-2944 (electronic)},
  issn-l =       {0010-4655},
  coden =        {CPHCBZ},
  acknowledgement = ack-nhfb,
  bibdate =      {Fri Jun 10 18:27:25 MDT 2016},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@Article{Agullo:2017:BGB,
  author =       "Emmanuel Agullo and Olivier Aumage and Berenger Bramas
                 and Olivier Coulaud and Samuel Pitoiset",
  title =        "Bridging the Gap Between {OpenMP} and Task-Based
                 Runtime Systems for the {Fast Multipole Method}",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "28",
  number =       "10",
  pages =        "2794--2807",
  month =        oct,
  year =         "2017",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2017.2697857",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Oct 12 06:58:12 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2017/10/07912335-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@article{Al-Refaie:2017:PAH,
  author =       {Ahmed F. Al-Refaie and Jonathan Tennyson},
  title =        {A parallel algorithm for {Hamiltonian} matrix
                 construction in electron-molecule collision
                 calculations: {MPI--SCATCI}},
  journal =      j-COMP-PHYS-COMM,
  fjournal =     {Computer Physics Communications},
  journal-url =  {http://www.sciencedirect.com/science/journal/00104655},
  volume =       {221},
  number =       {??},
  pages =        {53--62},
  month =        dec,
  year =         {2017},
  url =          {http://www.sciencedirect.com/science/article/pii/S0010465517302436},
  issn =         {0010-4655 (print), 1879-2944 (electronic)},
  issn-l =       {0010-4655},
  coden =        {CPHCBZ},
  acknowledgement = ack-nhfb,
  bibdate =      {Mon Oct 16 14:20:16 MDT 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@article{Al-Refaie:2017:PCT,
  author =       {Ahmed F. Al-Refaie and Sergei N. Yurchenko and
                 Jonathan Tennyson},
  title =        {{{\bf G}PU {\bf A}ccelerated {\bf IN}tensities MPI
                 (GAIN-MPI)}: a new method of computing {Einstein-$A$}
                 coefficients},
  journal =      j-COMP-PHYS-COMM,
  fjournal =     {Computer Physics Communications},
  journal-url =  {http://www.sciencedirect.com/science/journal/00104655},
  volume =       {214},
  number =       {??},
  pages =        {216--224},
  month =        may,
  year =         {2017},
  doi =          {https://doi.org/10.1016/j.cpc.2017.01.013},
  url =          {http://www.sciencedirect.com/science/article/pii/S0010465517300255},
  issn =         {0010-4655 (print), 1879-2944 (electronic)},
  issn-l =       {0010-4655},
  coden =        {CPHCBZ},
  acknowledgement = ack-nhfb,
  bibdate =      {Fri Mar 3 06:05:58 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@article{Aliaga:2017:CTP,
  author =       {Jos{\'e} I. Aliaga and Mar{\'\i}a Barreda and Goran
                 Flegar and Matthias Bollh{\"o}fer and Enrique S.
                 Quintana-Ort{\'\i}},
  title =        {Communication in task-parallel {ILU}-preconditioned
                 {CG} solvers using {MPI + OmpSs}},
  journal =      j-CCPE,
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-url =  {http://www.interscience.wiley.com/jpages/1532-0626},
  volume =       {29},
  number =       {21},
  pages =        {??--??},
  day =          {10},
  month =        nov,
  year =         {2017},
  doi =          {https://doi.org/10.1002/cpe.4280},
  issn =         {1532-0626 (print), 1532-0634 (electronic)},
  issn-l =       {1532-0626},
  coden =        {CCPEBO},
  acknowledgement = ack-nhfb,
  bibdate =      {Sat Dec 30 09:11:58 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@article{Alvanos:2017:PMM,
  author =       {Michail Alvanos and Theodoros Christoudias},
  title =        {\pkg{MEDINA}: {MECCA} Development in Accelerators ---
                 {KPP Fortran} to {CUDA} source-to-source
                 Pre-processor},
  journal =      j-J-OPEN-RES-SOFT,
  fjournal =     {Journal of Open Research Software},
  journal-url =  {https://openresearchsoftware.metajnl.com/issue/archive/},
  volume =       {5},
  number =       {1},
  pages =        {13--??},
  day =          {28},
  month =        apr,
  year =         {2017},
  doi =          {https://doi.org/10.5334/jors.158},
  url =          {https://openresearchsoftware.metajnl.com/articles/10.5334/jors.158/},
  issn =         {2049-9647},
  issn-l =       {2049-9647},
  coden =        {????},
  acknowledgement = ack-nhfb,
  bibdate =      {Sat Sep 8 10:03:50 MDT 2018},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/jors.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@article{Anderson:2017:BGB,
  author =       {Michael Anderson and Shaden Smith and Narayanan
                 Sundaram and Mihai Capota and Zheguang Zhao and
                 Subramanya Dulloor and Nadathur Satish and Theodore L.
                 Willke},
  title =        {Bridging the gap between {HPC} and big data
                 frameworks},
  journal =      j-PROC-VLDB-ENDOWMENT,
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-url =  {http://portal.acm.org/citation.cfm?id=J1174},
  volume =       {10},
  number =       {8},
  pages =        {901--912},
  month =        apr,
  year =         {2017},
  doi =          {https://doi.org/10.14778/3090163.3090168},
  issn =         {2150-8097},
  coden =        {????},
  abstract =     {Apache Spark is a popular framework for data analytics
                 with attractive features such as fault tolerance and
                 interoperability with the Hadoop ecosystem.
                 Unfortunately, many analytics operations in Spark are
                 an order of magnitude or more slower compared to native
                 implementations written with high performance computing
                 tools such as MPI. There is a need to bridge the
                 performance gap while retaining the benefits of the
                 Spark ecosystem such as availability, productivity, and
                 fault tolerance. In this paper, we propose a system for
                 integrating MPI with Spark and analyze the costs and
                 benefits of doing so for four distributed graph and
                 machine learning applications. We show that offloading
                 computation to an MPI environment from within Spark
                 provides 3.1--17.7$ \times $ speedups on the four
                 sparse applications, including all of the overheads.
                 This opens up an avenue to reuse existing MPI libraries
                 in Spark with little effort.},
  acknowledgement = ack-nhfb,
  bibdate =      {Fri Jun 23 17:12:46 MDT 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/vldbe.bib},
}

@article{Arteaga:2017:GFG,
  author =       {Jaime Arteaga and St{\'e}phane Zuckerman and Guang R.
                 Gao},
  title =        {Generating Fine-Grain Multithreaded Applications Using
                 a Multigrain Approach},
  journal =      j-TACO,
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume =       {14},
  number =       {4},
  pages =        {47:1--47:??},
  articleno =    {47},
  month =        dec,
  year =         {2017},
  doi =          {https://doi.org/10.1145/3155288},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  coden =        {????},
  abstract =     {The recent evolution in hardware landscape, aimed at
                 producing high-performance computing systems capable of
                 reaching extreme-scale performance, has reignited the
                 interest in fine-grain multithreading, particularly at
                 the intranode level. Indeed, popular parallel
                 programming environments, such as OpenMP, which
                 features a simple interface for the parallelization of
                 programs, are now incorporating fine-grain constructs.
                 However, since coarse-grain directives are still
                 heavily used, the OpenMP runtime is forced to support
                 both coarse- and fine-grain models of execution,
                 potentially reducing the advantages obtained when
                 executing an application in a fully fine-grain
                 environment. To evaluate the type of applications that
                 benefit from executing in a unified fine-grain program
                 execution model, this article presents a multigrain
                 parallel programming environment for the generation of
                 fine-grain multithreaded applications from programs
                 featuring OpenMP's API, allowing OpenMP programs to be
                 run on top of a fine-grain event-driven program
                 execution model. Experimental results with five
                 scientific benchmarks show that fine-grain
                 applications, generated by and run on our environment
                 with two runtimes implementing a fine-grain
                 event-driven program execution model, are competitive
                 and can outperform their OpenMP counterparts,
                 especially for data-intensive workloads with irregular
                 and dynamic parallelism, reaching speedups as high as
                 2.6$ \times $ for Graph500 and 51$ \times $ for NAS
                 Data Cube.},
  acknowledgement = ack-nhfb,
  bibdate =      {Fri Dec 22 18:25:55 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
}

@article{Awan:2017:CCD,
  author =       {Ammar Ahmad Awan and Khaled Hamidouche and Jahanzeb
                 Maqbool Hashmi and Dhabaleswar K. Panda},
  title =        {{S-Caffe}: Co-designing {MPI} Runtimes and {Caffe} for
                 Scalable Deep Learning on Modern {GPU} Clusters},
  journal =      j-SIGPLAN,
  fjournal =     {ACM SIGPLAN Notices},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  volume =       {52},
  number =       {8},
  pages =        {193--205},
  month =        aug,
  year =         {2017},
  doi =          {https://doi.org/10.1145/3155284.3018769},
  issn =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  issn-l =       {0362-1340},
  coden =        {SINODQ},
  abstract =     {Availability of large data sets like ImageNet and
                 massively parallel computation support in modern HPC
                 devices like NVIDIA GPUs have fueled a renewed interest
                 in Deep Learning (DL) algorithms. This has triggered
                 the development of DL frameworks like Caffe, Torch,
                 TensorFlow, and CNTK. However, most DL frameworks have
                 been limited to a single node. In order to scale out DL
                 frameworks and bring HPC capabilities to the DL arena,
                 we propose, S-Caffe; a scalable and distributed Caffe
                 adaptation for modern multi-GPU clusters. With an
                 in-depth analysis of new requirements brought forward
                 by the DL frameworks and limitations of current
                 communication runtimes, we present a co-design of the
                 Caffe framework and the MVAPICH2-GDR MPI runtime. Using
                 the co-design methodology, we modify Caffe's workflow
                 to maximize the overlap of computation and
                 communication with multi-stage data propagation and
                 gradient aggregation schemes. We bring DL-Awareness to
                 the MPI runtime by proposing a hierarchical reduction
                 design that benefits from CUDA-Aware features and
                 provides up to a massive 133x speedup over OpenMPI and
                 2.6x speedup over MVAPICH2 for 160 GPUs. S-Caffe
                 successfully scales up to 160 K-80 GPUs for GoogLeNet
                 (ImageNet) with a speedup of 2.5x over 32 GPUs. To the
                 best of our knowledge, this is the first framework that
                 scales up to 160 GPUs. Furthermore, even for single
                 node training, S-Caffe shows an improvement of 14\% and
                 9\% over Nvidia's optimized Caffe for 8 and 16 GPUs,
                 respectively. In addition, S-Caffe achieves up to 1395
                 samples per second for the AlexNet model, which is
                 comparable to the performance of Microsoft CNTK.},
  remark =       {PPoPP '17 conference proceedings.},
  acknowledgement = ack-nhfb,
  bibdate =      {Fri Dec 1 18:56:12 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
}

@article{Bae:2017:SEF,
  author =       {Seung-Hee Bae and Daniel Halperin and Jevin D. West
                 and Martin Rosvall and Bill Howe},
  title =        {Scalable and Efficient Flow-Based Community Detection
                 for Large-Scale Graph Analysis},
  journal =      j-TKDD,
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume =       {11},
  number =       {3},
  pages =        {32:1--32:??},
  articleno =    {32},
  month =        apr,
  year =         {2017},
  doi =          {https://doi.org/10.1145/2992785},
  issn =         {1556-4681 (print), 1556-472X (electronic)},
  issn-l =       {1556-4681},
  coden =        {????},
  abstract =     {Community detection is an increasingly popular
                 approach to uncover important structures in large
                 networks. Flow-based community detection methods rely
                 on communication patterns of the network rather than
                 structural properties to determine communities. The
                 Infomap algorithm in particular optimizes a novel
                 objective function called the map equation and has been
                 shown to outperform other approaches in third-party
                 benchmarks. However, Infomap and its variants are
                 inherently sequential, limiting their use for
                 large-scale graphs. In this article, we propose a novel
                 algorithm to optimize the map equation called RelaxMap.
                 RelaxMap provides two important improvements over
                 Infomap: parallelization, so that the map equation can
                 be optimized over much larger graphs, and
                 prioritization, so that the most important work occurs
                 first, iterations take less time, and the algorithm
                 converges faster. We implement these techniques using
                 OpenMP on shared-memory multicore systems, and evaluate
                 our approach on a variety of graphs from standard graph
                 clustering benchmarks as well as real graph datasets.
                 Our evaluation shows that both techniques are
                 effective: RelaxMap achieves 70\% parallel efficiency
                 on eight cores, and prioritization improves algorithm
                 performance by an additional 20--50\% on average,
                 depending on the graph properties. Additionally,
                 RelaxMap converges in the similar number of iterations
                 and provides solutions of equivalent quality as the
                 serial Infomap implementation.},
  acknowledgement = ack-nhfb,
  bibdate =      {Mon Jul 24 17:32:52 MDT 2017},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
}

@article{Barthels:2017:DJA,
  author =       {Claude Barthels and Ingo M{\"u}ller and Timo Schneider
                 and Gustavo Alonso and Torsten Hoefler},
  title =        {Distributed join algorithms on thousands of cores},
  journal =      j-PROC-VLDB-ENDOWMENT,
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-url =  {http://portal.acm.org/citation.cfm?id=J1174},
  volume =       {10},
  number =       {5},
  pages =        {517--528},
  month =        jan,
  year =         {2017},
  issn =         {2150-8097},
  coden =        {????},
  abstract =     {Traditional database operators such as joins are
                 relevant not only in the context of database engines
                 but also as a building block in many computational and
                 machine learning algorithms. With the advent of big
                 data, there is an increasing demand for efficient join
                 algorithms that can scale with the input data size and
                 the available hardware resources. In this paper, we
                 explore the implementation of distributed join
                 algorithms in systems with several thousand cores
                 connected by a low-latency network as used in high
                 performance computing systems or data centers. We
                 compare radix hash join to sort-merge join algorithms
                 and discuss their implementation at this scale. In the
                 paper, we explain how to use MPI to implement joins,
                 show the impact and advantages of RDMA, discuss the
                 importance of network scheduling, and study the
                 relative performance of sorting vs. hashing. The
                 experimental results show that the algorithms we
                 present scale well with the number of cores, reaching a
                 throughput of 48.7 billion input tuples per second on
                 4,096 cores.},
  acknowledgement = ack-nhfb,
  bibdate =      {Sat Feb 25 09:01:51 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/vldbe.bib},
}

@article{Bonelli:2017:MCA,
  author =       {Francesco Bonelli and Michele Tuttafesta and Gianpiero
                 Colonna and Luigi Cutrone and Giuseppe Pascazio},
  title =        {An {MPI--CUDA} approach for hypersonic flows with
                 detailed state-to-state air kinetics using a {GPU}
                 cluster},
  journal =      j-COMP-PHYS-COMM,
  fjournal =     {Computer Physics Communications},
  journal-url =  {http://www.sciencedirect.com/science/journal/00104655},
  volume =       {219},
  number =       {??},
  pages =        {178--195},
  month =        oct,
  year =         {2017},
  url =          {http://www.sciencedirect.com/science/article/pii/S0010465517301613},
  issn =         {0010-4655 (print), 1879-2944 (electronic)},
  issn-l =       {0010-4655},
  coden =        {CPHCBZ},
  acknowledgement = ack-nhfb,
  bibdate =      {Wed Jul 26 06:22:13 MDT 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@article{Bruel:2017:ACC,
  author          = {Pedro Bruel and Marcos Amar{\'\i}s and Alfredo
                     Goldman},
  title           = {Autotuning {CUDA} compiler parameters for
                     heterogeneous applications using the {OpenTuner}
                     framework},
  journal         = j-CCPE,
  volume          = {29},
  number          = {22},
  pages           = {??--??},
  day             = {25},
  month           = nov,
  year            = {2017},
  coden           = {CCPEBO},
  doi             = {https://doi.org/10.1002/cpe.3973},
  issn            = {1532-0626 (print), 1532-0634 (electronic)},
  issn-l          = {1532-0626},
  bibdate         = {Sat Dec 30 09:11:59 MST 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency and Computation: Practice and Experience},
  journal-url     = {http://www.interscience.wiley.com/jpages/1532-0626},
}

@article{Carpen-Amarie:2017:EOC,
  author          = {Alexandra Carpen-Amarie and Sascha Hunold and Jesper
                     Larsson Tr{\"a}ff},
  title           = {On expected and observed communication performance
                     with {MPI} derived datatypes},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {69},
  number          = {??},
  pages           = {98--117},
  month           = nov,
  year            = {2017},
  coden           = {PACOEJ},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  bibdate         = {Tue Oct 24 15:15:02 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167819117301217},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Celik:2017:BET,
  author          = {Ahmet Celik and Sreepathi Pai and Sarfraz Khurshid and
                     Milos Gligoric},
  title           = {Bounded exhaustive test-input generation on {GPUs}},
  journal         = j-PACMPL,
  volume          = {1},
  number          = {OOPSLA},
  pages           = {94:1--94:??},
  month           = oct,
  year            = {2017},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3133918},
  issn            = {2475-1421},
  bibdate         = {Wed Jan 10 09:45:26 MST 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pacmpl.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  articleno       = {94},
  fjournal        = {Proceedings of the ACM on Programming Languages},
  journal-url     = {https://pacmpl.acm.org/},
}

@article{Chabbi:2017:EAL,
  author          = {Milind Chabbi and Abdelhalim Amer and Shasha Wen and
                     Xu Liu},
  title           = {An Efficient Abortable-locking Protocol for
                     Multi-level {NUMA} Systems},
  journal         = j-SIGPLAN,
  volume          = {52},
  number          = {8},
  pages           = {61--74},
  month           = aug,
  year            = {2017},
  coden           = {SINODQ},
  doi             = {https://doi.org/10.1145/3155284.3018768},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Fri Dec 1 18:56:12 MST 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {The popularity of Non-Uniform Memory Access (NUMA)
                     architectures has led to numerous locality-preserving
                     hierarchical lock designs, such as HCLH, HMCS, and
                     cohort locks. Locality-preserving locks trade fairness
                     for higher throughput. Hence, some instances of
                     acquisitions can incur long latencies, which may be
                     intolerable for certain applications. Few locks admit a
                     waiting thread to abandon its protocol on a timeout.
                     State-of-the-art abortable locks are not fully locality
                     aware, introduce high overheads, and unsuitable for
                     frequent aborts. Enhancing locality-aware locks with
                     lightweight timeout capability is critical for their
                     adoption. In this paper, we design and evaluate the
                     HMCS-T lock, a Hierarchical MCS (HMCS) lock variant
                     that admits a timeout. HMCS-T maintains the locality
                     benefits of HMCS while ensuring aborts to be
                     lightweight. HMCS-T offers the progress guarantee
                     missing in most abortable queuing locks. Our
                     evaluations show that HMCS-T offers the timeout feature
                     at a moderate overhead over its HMCS analog. HMCS-T,
                     used in an MPI runtime lock, mitigated the poor
                     scalability of an MPI+OpenMP BFS code and resulted in
                     4.3x superior scaling.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {PPoPP '17 conference proceedings.},
}

@Article{Chen:2017:AAG,
  author =       "Jian Chen and Russell M. Clapp",
  title =        "{Astro}: Auto-Generation of Synthetic Traces Using
                 Scaling Pattern Recognition for {MPI} Workloads",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "28",
  number =       "8",
  pages =        "2159--2171",
  month =        aug,
  year =         "2017",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2017.2649518",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Tue Jul 25 18:46:21 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2017/08/07809142-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@article{Cornelis:2017:HAV,
  author          = {Jan G. Cornelis and Jan Lemeire and Tim Bruylants and
                     Peter Schelkens},
  title           = {Heterogeneous acceleration of volumetric {JPEG 2000}
                     using {OpenCL}},
  journal         = j-IJHPCA,
  volume          = {31},
  number          = {3},
  pages           = {229--245},
  year            = {2017},
  coden           = {IHPCFL},
  doi             = {https://doi.org/10.1177/1094342016646438},
  issn            = {1094-3420 (print), 1741-2846 (electronic)},
  issn-l          = {1094-3420},
  bibdate         = {Tue Nov 6 06:13:05 MST 2018},
  bibsource       = {http://hpc.sagepub.com/;
                     http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://journals.sagepub.com/doi/full/10.1177/1094342016646438},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of High Performance Computing
                     Applications},
  journal-url     = {http://hpc.sagepub.com/content/by/year},
  xxmonth         = may,
}

@article{Dang:2017:ECB,
  author          = {Hoang-Vu Dang and Marc Snir and William Gropp},
  title           = {Eliminating contention bottlenecks in multithreaded
                     {MPI}},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {69},
  number          = {??},
  pages           = {1--23},
  month           = nov,
  year            = {2017},
  coden           = {PACOEJ},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  bibdate         = {Tue Oct 24 15:15:02 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167819117301187},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Dashti:2017:AMM,
  author          = {Mohammad Dashti and Alexandra Fedorova},
  title           = {Analyzing memory management methods on integrated
                     {CPU--GPU} systems},
  journal         = j-SIGPLAN,
  volume          = {52},
  number          = {9},
  pages           = {59--69},
  month           = sep,
  year            = {2017},
  coden           = {SINODQ},
  doi             = {https://doi.org/10.1145/3156685.3092256},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Fri Dec 1 18:56:13 MST 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {Heterogeneous systems that integrate a multicore CPU
                     and a GPU on the same die are ubiquitous. On these
                     systems, both the CPU and GPU share the same physical
                     memory as opposed to using separate memory dies.
                     Although integration eliminates the need to copy data
                     between the CPU and the GPU, arranging transparent
                     memory sharing between the two devices can carry large
                     overheads. Memory on CPU/GPU systems is typically
                     managed by a software framework such as OpenCL or CUDA,
                     which includes a runtime library, and communicates with
                     a GPU driver. These frameworks offer a range of memory
                     management methods that vary in ease of use,
                     consistency guarantees and performance. In this study,
                     we analyze some of the common memory management methods
                     of the most widely used software frameworks for
                     heterogeneous systems: CUDA, OpenCL 1.2, OpenCL 2.0,
                     and HSA, on NVIDIA and AMD hardware. We focus on
                     performance/functionality trade-offs, with the goal of
                     exposing their performance impact and simplifying the
                     choice of memory management methods for programmers.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {ISMM '17 conference proceedings.},
}

@article{deAndrade:2017:OFH,
  author          = {Douglas Coimbra de Andrade and Lu{\'\i}s Gonzaga
                     Trabasso},
  title           = {An {OpenCL} framework for high performance extraction
                     of image features},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {109},
  number          = {??},
  pages           = {75--88},
  month           = nov,
  year            = {2017},
  coden           = {JPDCER},
  issn            = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l          = {0743-7315},
  bibdate         = {Sat Aug 19 13:10:32 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0743731517301624},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Degomme:2017:SMA,
  author =       "Augustin Degomme and Arnaud Legrand and George S.
                 Markomanolis and Martin Quinson and Mark Stillwell and
                 Frederic Suter",
  title =        "Simulating {MPI} Applications: The {SMPI} Approach",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "28",
  number =       "8",
  pages =        "2387--2400",
  month =        aug,
  year =         "2017",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2017.2669305",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Tue Jul 25 18:46:21 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2017/08/07855780-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@article{Diavastos:2017:SLR,
  author          = {Andreas Diavastos and Pedro Trancoso},
  title           = {{SWITCHES}: a Lightweight Runtime for Dataflow
                     Execution of Tasks on Many-Cores},
  journal         = j-TACO,
  volume          = {14},
  number          = {3},
  pages           = {31:1--31:??},
  month           = sep,
  year            = {2017},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3127068},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Wed Sep 6 17:12:05 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {SWITCHES is a task-based dataflow runtime that
                     implements a lightweight distributed triggering system
                     for runtime dependence resolution and uses static
                     scheduling and compile-time assignment policies to
                     reduce runtime overheads. Unlike other systems, the
                     granularity of loop-tasks can be increased to favor
                     data-locality, even when having dependences across
                     different loops. SWITCHES introduces explicit task
                     resource allocation mechanisms for efficient allocation
                     of resources and adopts the latest OpenMP Application
                     Programming Interface (API), as to maintain high levels
                     of programming productivity. It provides a
                     source-to-source tool that automatically produces
                     thread-based code. Performance on an Intel Xeon-Phi
                     shows good scalability and surpasses OpenMP by an
                     average of 32\%.},
  acknowledgement = ack-nhfb,
  articleno       = {31},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Dietrich:2017:CBA,
  author =       "Robert Dietrich and Felix Schmitt and Alexander Grund
                 and Jonas Stolle",
  title =        "Critical-blame analysis for {OpenMP 4.0} offloading on
                 {Intel Xeon Phi}",
  journal =      j-J-SYST-SOFTW,
  volume =       "125",
  number =       "??",
  pages =        "381--388",
  month =        mar,
  year =         "2017",
  CODEN =        "JSSODM",
  ISSN =         "0164-1212 (print), 1873-1228 (electronic)",
  ISSN-L =       "0164-1212",
  bibdate =      "Sat Feb 4 12:20:39 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jsystsoftw.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0164121215002940",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Systems and Software",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01641212/",
}

@article{Eizenberg:2017:BBL,
  author          = {Ariel Eizenberg and Yuanfeng Peng and Toma Pigli and
                     William Mansky and Joseph Devietti},
  title           = {{BARRACUDA}: binary-level analysis of runtime {RAces}
                     in {CUDA} programs},
  journal         = j-SIGPLAN,
  volume          = {52},
  number          = {6},
  pages           = {126--140},
  month           = jun,
  year            = {2017},
  coden           = {SINODQ},
  doi             = {https://doi.org/10.1145/3140587.3062342},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Sat Sep 16 10:18:17 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {GPU programming models enable and encourage massively
                     parallel programming with over a million threads,
                     requiring extreme parallelism to achieve good
                     performance. Massive parallelism brings significant
                     correctness challenges by increasing the possibility
                     for bugs as the number of thread interleavings
                     balloons. Conventional dynamic safety analyses struggle
                     to run at this scale. We present BARRACUDA, a
                     concurrency bug detector for GPU programs written in
                     Nvidia's CUDA language. BARRACUDA handles a wider range
                     of parallelism constructs than previous work, including
                     branch operations, low-level atomics and memory fences,
                     which allows BARRACUDA to detect new classes of
                     concurrency bugs. BARRACUDA operates at the binary
                     level for increased compatibility with existing code,
                     leveraging a new binary instrumentation framework that
                     is extensible to other dynamic analyses. BARRACUDA
                     incorporates a number of novel optimizations that are
                     crucial for scaling concurrency bug detection to over a
                     million threads.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {PLDI '17 conference proceedings.},
}

@article{Fachada:2017:CCF,
  author          = {Nuno Fachada and Vitor V. Lopes and Rui C. Martins and
                     Agostinho C. Rosa},
  title           = {{\tt cf4ocl}: a {C} framework for {OpenCL}},
  journal         = j-SCI-COMPUT-PROGRAM,
  volume          = {143},
  number          = {??},
  pages           = {9--19},
  day             = {1},
  month           = sep,
  year            = {2017},
  coden           = {SCPGD4},
  issn            = {0167-6423 (print), 1872-7964 (electronic)},
  issn-l          = {0167-6423},
  bibdate         = {Wed Jul 26 05:56:44 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/scicomputprogram.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167642317300540},
  acknowledgement = ack-nhfb,
  fjournal        = {Science of Computer Programming},
  journal-url     = {http://www.sciencedirect.com/science/journal/01676423},
}

@article{Falch:2017:MLB,
  author          = {Thomas L. Falch and Anne C. Elster},
  title           = {Machine learning-based auto-tuning for enhanced
                     performance portability of {OpenCL} applications},
  journal         = j-CCPE,
  volume          = {29},
  number          = {8},
  pages           = {??--??},
  day             = {25},
  month           = apr,
  year            = {2017},
  coden           = {CCPEBO},
  doi             = {https://doi.org/10.1002/cpe.4029},
  issn            = {1532-0626 (print), 1532-0634 (electronic)},
  issn-l          = {1532-0626},
  bibdate         = {Fri Mar 31 19:12:52 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency and Computation: Practice and Experience},
  journal-url     = {http://www.interscience.wiley.com/jpages/1532-0626},
}

@Article{Falch:2017:RAM,
  author =       "Thomas L. Falch and Anne C. Elster",
  title =        "Machine learning-based auto-tuning for enhanced
                 performance portability of {OpenCL} applications",
  journal =      j-CCPE,
  volume =       "29",
  number =       "8",
  pages =        "??--??",
  day =          "25",
  month =        apr,
  year =         "2017",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4029",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Fri Mar 31 19:12:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  remark =       "Duplicate of entry Falch:2017:MLB: all bibliographic
                 data, including the DOI, are identical. The key suffix
                 RAM does not match the title initials, so this record
                 may have been intended for a different article; verify
                 against the publisher. The key is retained so that
                 existing citations continue to resolve; prefer
                 Falch:2017:MLB in new documents.",
}

@Article{Fan:2017:SEE,
  author =       "Xing Fan and Mostafa Mehrabi and Oliver Sinnen and
                 Nasser Giacaman",
  title =        "Supporting Enhanced Exception Handling with {OpenMP}
                 in Object-Oriented Languages",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "45",
  number =       "6",
  pages =        "1366--1389",
  month =        dec,
  year =         "2017",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-016-0474-x",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Nov 18 09:27:28 MST 2017",
  bibsource =    "http://link.springer.com/journal/10766/45/6;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Forejt:2017:PPA,
  author =       "Vojt{\v{e}}ch Forejt and Saurabh Joshi and Daniel
                 Kroening and Ganesh Narayanaswamy and Subodh Sharma",
  title =        "Precise Predictive Analysis for Discovering
                 Communication Deadlocks in {MPI} Programs",
  journal =      j-TOPLAS,
  volume =       "39",
  number =       "4",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2017",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/3095075",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Sep 19 06:38:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "The Message Passing Interface (MPI) is the standard
                 API for parallelization in high-performance and
                 scientific computing. Communication deadlocks are a
                 frequent problem in MPI programs, and this article
                 addresses the problem of discovering such deadlocks. We
                 begin by showing that if an MPI program is single path,
                 the problem of discovering communication deadlocks is
                 NP-complete. We then present a novel propositional
                 encoding scheme that captures the existence of
                 communication deadlocks. The encoding is based on
                 modeling executions with partial orders and implemented
                 in a tool called MOPPER. The tool executes an MPI
                 program, collects the trace, builds a formula from the
                 trace using the propositional encoding scheme, and
                 checks its satisfiability. Finally, we present
                 experimental results that quantify the benefit of the
                 approach in comparison to other analyzers and
                 demonstrate that it offers a scalable solution for
                 single-path programs.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@article{Frust:2017:RDP,
  author          = {Tobias Frust and Michael Wagner and Jan Stephan and
                     Guido Juckeland and Andr{\'e} Bieberle},
  title           = {Rapid data processing for ultrafast {X}-ray computed
                     tomography using scalable and modular {CUDA} based
                     pipelines},
  journal         = j-COMP-PHYS-COMM,
  volume          = {219},
  number          = {??},
  pages           = {353--360},
  month           = oct,
  year            = {2017},
  coden           = {CPHCBZ},
  issn            = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l          = {0010-4655},
  bibdate         = {Wed Jul 26 06:22:13 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0010465517301674},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-url     = {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Fumero:2017:JTG,
  author          = {Juan Fumero and Michel Steuwer and Lukas Stadler and
                     Christophe Dubach},
  title           = {Just-In-Time {GPU} Compilation for Interpreted
                     Languages with Partial Evaluation},
  journal         = j-SIGPLAN,
  volume          = {52},
  number          = {7},
  pages           = {60--73},
  month           = jul,
  year            = {2017},
  coden           = {SINODQ},
  doi             = {https://doi.org/10.1145/3140607.3050761},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Sat Sep 16 10:18:17 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {Computer systems are increasingly featuring powerful
                     parallel devices with the advent of many-core CPUs and
                     GPUs. This offers the opportunity to solve
                     computationally-intensive problems at a fraction of the
                     time traditional CPUs need. However, exploiting
                     heterogeneous hardware requires the use of low-level
                     programming language approaches such as OpenCL, which
                     is incredibly challenging, even for advanced
                     programmers. On the application side, interpreted
                     dynamic languages are increasingly becoming popular in
                     many domains due to their simplicity, expressiveness
                     and flexibility. However, this creates a wide gap
                     between the high-level abstractions offered to
                     programmers and the low-level hardware-specific
                     interface. Currently, programmers must rely on high
                     performance libraries or they are forced to write parts
                     of their application in a low-level language like
                     OpenCL. Ideally, nonexpert programmers should be able
                     to exploit heterogeneous hardware directly from their
                     interpreted dynamic languages. In this paper, we
                     present a technique to transparently and automatically
                     offload computations from interpreted dynamic languages
                     to heterogeneous devices. Using just-in-time
                     compilation, we automatically generate OpenCL code at
                     runtime which is specialized to the actual observed
                     data types using profiling information. We demonstrate
                     our technique using R, which is a popular interpreted
                     dynamic language predominately used in big data
                     analytic. Our experimental results show the execution
                     on a GPU yields speedups of over 150x compared to the
                     sequential FastR implementation and the obtained
                     performance is competitive with manually written GPU
                     code. We also show that when taking into account
                     start-up time, large speedups are achievable, even when
                     the applications run for as little as a few seconds.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {VEE '17 conference proceedings.},
}

@article{Germanas:2017:HUP,
  author          = {D. Germanas and A. Stepsys and S. Mickevicius and R.
                     K. Kalinauskas},
  title           = {{HOTB} update: Parallel code for calculation of three-
                     and four-particle harmonic oscillator transformation
                     brackets and their matrices using {OpenMP}},
  journal         = j-COMP-PHYS-COMM,
  volume          = {215},
  number          = {??},
  pages           = {259--264},
  month           = jun,
  year            = {2017},
  coden           = {CPHCBZ},
  issn            = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l          = {0010-4655},
  bibdate         = {Fri Mar 31 15:52:48 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0010465517300401},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-url     = {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Ghose:2017:FOT,
  author          = {Anirban Ghose and Lokesh Dokara and Soumyajit Dey and
                     Pabitra Mitra},
  title           = {A Framework for {OpenCL} Task Scheduling on
                     Heterogeneous Multicores},
  journal         = j-PARALLEL-PROCESS-LETT,
  volume          = {27},
  number          = {3--4},
  pages           = {1750008},
  year            = {2017},
  coden           = {PPLTEE},
  doi             = {https://doi.org/10.1142/S0129626417500086},
  issn            = {0129-6264 (print), 1793-642X (electronic)},
  issn-l          = {0129-6264},
  bibdate         = {Tue May 29 09:05:31 MDT 2018},
  bibsource       = {http://ejournals.wspc.com.sg/ppl/;
                     http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Processing Letters},
  journal-url     = {http://www.worldscientific.com/loi/ppl},
}

@article{Gonzalez-Alvarez:2017:HMO,
  author          = {David L. Gonz{\'a}lez-{\'A}lvarez and Miguel A.
                     Vega-Rodr{\'\i}guez and {\'A}lvaro Rubio-Largo},
  title           = {A hybrid {MPI\slash OpenMP} parallel implementation of
                     {NSGA--II} for finding patterns in protein sequences},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {73},
  number          = {6},
  pages           = {2285--2312},
  month           = jun,
  year            = {2017},
  coden           = {JOSUED},
  doi             = {https://doi.org/10.1007/s11227-016-1916-3},
  issn            = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l          = {0920-8542},
  bibdate         = {Sat Jun 24 10:31:33 MDT 2017},
  bibsource       = {http://link.springer.com/journal/11227/73/6;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-url     = {http://link.springer.com/journal/11227},
}

@Article{Grosset:2017:TTT,
  author =       "A. V. Pascal Grosset and Manasa Prasad and Cameron
                 Christensen and Aaron Knoll and Charles Hansen",
  title =        "{TOD}-Tree: Task-Overlapped Direct Send Tree Image
                 Compositing for Hybrid {MPI} Parallelism and {GPUs}",
  journal =      j-IEEE-TRANS-VIS-COMPUT-GRAPH,
  volume =       "23",
  number =       "6",
  pages =        "1677--1690",
  month =        jun,
  year =         "2017",
  CODEN =        "ITVGEA",
  DOI =          "https://doi.org/10.1109/TVCG.2016.2542069",
  ISSN =         "1077-2626 (print), 1941-0506 (electronic), 2160-9306",
  ISSN-L =       "1077-2626",
  bibdate =      "Thu Jun 29 18:38:25 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/tg/2017/06/07433468-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Visualization and Computer
                 Graphics",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945",
}

@Article{Han:2017:SLS,
  author =       "Yiming Han and Anthony T. Chronopoulos",
  title =        "Scalable Loop Self-scheduling Schemes for Large-Scale
                 Clusters and Cloud Systems",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "45",
  number =       "3",
  pages =        "595--611",
  month =        jun,
  year =         "2017",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-016-0434-5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Jun 24 11:37:59 MDT 2017",
  bibsource =    "http://link.springer.com/journal/10766/45/3;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Haque:2017:CCL,
  author =       "S. Anisul Haque and X. Li and F. Mansouri and M.
                 Moreno Maza and D. Mohajerani and W. Pan",
  title =        "{CUMODP}: a {CUDA} library for modular polynomial
                 computation",
  journal =      j-ACM-COMM-COMP-ALGEBRA,
  volume =       "51",
  number =       "3",
  pages =        "89--91",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3177795.3177799",
  ISSN =         "1932-2232 (print), 1932-2240 (electronic)",
  ISSN-L =       "1932-2232",
  bibdate =      "Fri Jan 5 06:22:51 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigsam.bib",
  abstract =     "The CUDA Modular Polynomial (CUMODP) Library
                 implements arithmetic operations for dense matrices and
                 dense polynomials, primarily with modular integer
                 coefficients. Some operations are available for integer
                 or floating point coefficients. Similar to other
                 software libraries, like CuBLAS targeting Graphics
                 Processing Units (GPUs), CUMODP focuses on
                 efficiency-critical routines and provides them in the
                 form of device functions and CUDA kernels. Hence, these
                 routines are primarily designed to offer GPU support to
                 polynomial system solvers. A bivariate system solver is
                 part of the library, as a proof-of-concept. Its
                 implementation is presented in [10] and it is
                 integrated in Maple's Triangularize command, since
                 the release 18 of Maple.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Communications in Computer Algebra",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1000",
}

@Article{Hasanov:2017:HRC,
  author =       "Khalid Hasanov and Alexey Lastovetsky",
  title =        "Hierarchical redesign of classic {MPI} reduction
                 algorithms",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "73",
  number =       "2",
  pages =        "713--725",
  month =        feb,
  year =         "2017",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-016-1779-7",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Jun 24 10:31:32 MDT 2017",
  bibsource =    "http://link.springer.com/journal/11227/73/2;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Henriksen:2017:FPF,
  author =       "Troels Henriksen and Niels G. W. Serup and Martin
                 Elsman and Fritz Henglein and Cosmin E. Oancea",
  title =        "{Futhark}: purely functional {GPU-programming} with
                 nested parallelism and in-place array updates",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "6",
  pages =        "556--571",
  month =        jun,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3140587.3062354",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:17 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Futhark is a purely functional data-parallel array
                 language that offers a machine-neutral programming
                 model and an optimising compiler that generates OpenCL
                 code for GPUs. This paper presents the design and
                 implementation of three key features of Futhark that
                 seek a suitable middle ground with imperative
                 approaches. First, in order to express efficient code
                 inside the parallel constructs, we introduce a simple
                 type system for in-place updates that ensures
                 referential transparency and supports equational
                 reasoning. Second, we furnish Futhark with parallel
                 operators capable of expressing efficient
                 strength-reduced code, along with their fusion rules.
                 Third, we present a flattening transformation aimed at
                 enhancing the degree of parallelism that (i) builds on
                 loop interchange and distribution but uses higher-order
                 reasoning rather than array-dependence analysis, and
                 (ii) still allows further locality-of-reference
                 optimisations. Finally, an evaluation on 16 benchmarks
                 demonstrates the impact of the language and compiler
                 features and shows application-level performance
                 competitive with hand-written GPU code.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '17 conference proceedings.",
}

@Article{Jan:2017:ITF,
  author =       "Bilal Jan and Fiaz Gul Khan and Bartolomeo Montrucchio
                 and Anthony Theodore Chronopoulos and Shahaboddin
                 Shamshirband and Abdul Nasir Khan",
  title =        "Introducing {ToPe--FFT}: An {OpenCL}-based {FFT}
                 library targeting {GPUs}",
  journal =      j-CCPE,
  volume =       "29",
  number =       "21",
  pages =        "??--??",
  day =          "10",
  month =        nov,
  year =         "2017",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4256",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Dec 30 09:11:58 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Jarzabek:2017:PEU,
  author =       "Lukasz Jarzabek and Pawel Czarnul",
  title =        "Performance evaluation of unified memory and dynamic
                 parallelism for selected parallel {CUDA} applications",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "73",
  number =       "12",
  pages =        "5378--5401",
  month =        dec,
  year =         "2017",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-017-2091-x",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Jan 6 08:59:18 MST 2018",
  bibsource =    "http://link.springer.com/journal/11227/73/12;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/content/pdf/10.1007/s11227-017-2091-x.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Jatala:2017:SSG,
  author =       "Vishwesh Jatala and Jayvant Anantpur and Amey
                 Karkare",
  title =        "Scratchpad Sharing in {GPUs}",
  journal =      j-TACO,
  volume =       "14",
  number =       "2",
  pages =        "15:1--15:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3075619",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:59 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "General-Purpose Graphics Processing Unit (GPGPU)
                 applications exploit on-chip scratchpad memory
                 available in the Graphics Processing Units (GPUs) to
                 improve performance. The amount of thread level
                 parallelism (TLP) present in the GPU is limited by the
                 number of resident threads, which in turn depends on
                 the availability of scratchpad memory in its streaming
                 multiprocessor (SM). Since the scratchpad memory is
                 allocated at thread block granularity, part of the
                 memory may remain unutilized. In this article, we
                 propose architectural and compiler optimizations to
                 improve the scratchpad memory utilization. Our
                 approach, called Scratchpad Sharing, addresses
                 scratchpad under-utilization by launching additional
                 thread blocks in each SM. These thread blocks use
                 unutilized scratchpad memory and also share scratchpad
                 memory with other resident blocks. To improve the
                 performance of scratchpad sharing, we propose Owner
                 Warp First (OWF) scheduling that schedules warps from
                 the additional thread blocks effectively. The
                 performance of this approach, however, is limited by
                 the availability of the part of scratchpad memory that
                 is shared among thread blocks. We propose compiler
                 optimizations to improve the availability of shared
                 scratchpad memory. We describe an allocation scheme
                 that helps in allocating scratchpad variables such that
                 shared scratchpad is accessed for short duration. We
                 introduce a new hardware instruction, relssp, that when
                 executed releases the shared scratchpad memory.
                 Finally, we describe an analysis for optimal placement
                 of relssp instructions, such that shared scratchpad
                 memory is released as early as possible, but only after
                 its last use, along every execution path. We
                 implemented the hardware changes required for
                 scratchpad sharing and the relssp instruction using the
                 GPGPU-Sim simulator and implemented the compiler
                 optimizations in Ocelot framework. We evaluated the
                 effectiveness of our approach on 19 kernels from 3
                 benchmarks suites: CUDA-SDK, GPGPU-Sim, and Rodinia.
                 The kernels that under-utilize scratchpad memory show
                 an average improvement of 19\% and maximum improvement
                 of 92.17\% in terms of the number of instruction
                 executed per cycle when compared to the baseline
                 approach, without affecting the performance of the
                 kernels that are not limited by scratchpad memory.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jo:2017:PMA,
  author =       "Gangwon Jo and Jaehoon Jung and Jiyoung Park and
                 Jaejin Lee",
  title =        "{Poster}: {MAPA}: an Automatic Memory Access Pattern
                 Analyzer for {GPU} Applications",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "443--444",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3019034",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Various existing optimization and memory consistency
                 management techniques for GPU applications rely on
                 memory access patterns of kernels. However, they suffer
                 from poor practicality because they require explicit
                 user interventions to extract kernel memory access
                 patterns. This paper proposes an automatic
                 memory-access-pattern analysis framework called MAPA.
                 MAPA is based on a source-level analysis technique
                 derived from traditional symbolic analyses and a
                 run-time pattern selection technique. The experimental
                 results show that MAPA properly analyzes 116 real-world
                 OpenCL kernels from Rodinia and Parboil.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Article{Julian-Moreno:2017:FPA,
  author =       "Guillermo Juli{\'a}n-Moreno and Jorge E. L{\'o}pez de
                 Vergara and Iv{\'a}n Gonz{\'a}lez and Luis de Pedro and
                 Javier Royuela-del-Val and Federico
                 Simmross-Wattenberg",
  title =        "Fast parallel $ \alpha $-stable distribution function
                 evaluation and parameter estimation using {OpenCL} in
                 {GPGPUs}",
  journal =      j-STAT-COMPUT,
  volume =       "27",
  number =       "5",
  pages =        "1365--1382",
  month =        sep,
  year =         "2017",
  CODEN =        "STACE3",
  ISSN =         "0960-3174 (print), 1573-1375 (electronic)",
  ISSN-L =       "0960-3174",
  bibdate =      "Thu Jun 8 18:03:56 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/statcomput.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Statistics and Computing",
  journal-URL =  "http://link.springer.com/journal/11222",
}

@Article{Katouda:2017:MOH,
  author =       "Michio Katouda and Takahito Nakajima",
  title =        "{MPI\slash OpenMP} hybrid parallel algorithm for
                 resolution of identity second-order
                 {M{\o}ller--Plesset} perturbation calculation of
                 analytical energy gradient for massively parallel
                 multicore supercomputers",
  journal =      j-J-COMPUT-CHEM,
  volume =       "38",
  number =       "8",
  pages =        "489--507",
  day =          "30",
  month =        mar,
  year =         "2017",
  CODEN =        "JCCHDD",
  DOI =          "https://doi.org/10.1002/jcc.24701",
  ISSN =         "0192-8651 (print), 1096-987X (electronic)",
  ISSN-L =       "0192-8651",
  bibdate =      "Mon Feb 20 11:51:05 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jcomputchem2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Chemistry",
  journal-URL =  "http://www.interscience.wiley.com/jpages/0192-8651",
}

@Article{Khan:2017:RCS,
  author =       "Ayaz H. Khan and Mayez Al-Mouhamed and Muhammed
                 Al-Mulhem and Adel F. Ahmed",
  title =        "{RT-CUDA}: A Software Tool for {CUDA} Code
                 Restructuring",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "45",
  number =       "3",
  pages =        "551--594",
  month =        jun,
  year =         "2017",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-016-0433-6",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Jun 24 11:37:59 MDT 2017",
  bibsource =    "http://link.springer.com/journal/10766/45/3;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Kojima:2017:HLG,
  author =       "Kensuke Kojima and Atsushi Igarashi",
  title =        "A {Hoare} Logic for {GPU} Kernels",
  journal =      j-TOCL,
  volume =       "18",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3001834",
  ISSN =         "1529-3785 (print), 1557-945X (electronic)",
  ISSN-L =       "1529-3785",
  bibdate =      "Thu Apr 13 17:53:54 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocl/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tocl.bib",
  abstract =     "We study a Hoare Logic to reason about parallel
                 programs executed on graphics processing units (GPUs),
                 called GPU kernels. During the execution of GPU
                 kernels, multiple threads execute in lockstep, that is,
                 execute the same instruction simultaneously. When the
                 control branches, the two branches are executed
                 sequentially, but during the execution of each branch
                 only those threads that take it are enabled; after the
                 control converges, all the threads are enabled and
                 again execute in lockstep. In this article, we first
                 consider a semantics in which all threads execute in
                 lockstep (this semantics simplifies the actual
                 execution model of GPUs) and adapt Hoare Logic to this
                 setting by augmenting the usual Hoare triples with an
                 additional component representing the set of enabled
                 threads. It is determined that the soundness and
                 relative completeness of the logic do not hold for all
                 programs; a difficulty arises from the fact that one
                 thread can invalidate the loop termination condition of
                 another thread through shared memory. We overcome this
                 difficulty by identifying an appropriate class of
                 programs for which the soundness and relative
                 completeness hold. Additionally, we discuss thread
                 interleaving, which is present in the actual execution
                 of GPUs but not in the lockstep semantics mentioned
                 above. We show that if a program is race free, then the
                 lockstep and interleaving semantics produce the same
                 result. This implies that our logic is sound and
                 relatively complete for race-free programs, even if the
                 thread interleaving is taken into account.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computational Logic",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J773",
}

@Article{Kotselidis:2017:HMR,
  author =       "Christos Kotselidis and James Clarkson and Andrey
                 Rodchenko and Andy Nisbet and John Mawer and Mikel
                 Luj{\'a}n",
  title =        "Heterogeneous Managed Runtime Systems: a Computer
                 Vision Case Study",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "7",
  pages =        "74--82",
  month =        jul,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3140607.3050764",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:17 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Real-time 3D space understanding is becoming prevalent
                 across a wide range of applications and hardware
                 platforms. To meet the desired Quality of Service
                 (QoS), computer vision applications tend to be heavily
                 parallelized and exploit any available hardware
                 accelerators. Current approaches to achieving real-time
                 computer vision, evolve around programming languages
                 typically associated with High Performance Computing
                 along with binding extensions for OpenCL or CUDA
                 execution. Such implementations, although high
                 performing, lack portability across the wide range of
                 diverse hardware resources and accelerators. In this
                 paper, we showcase how a complex computer vision
                 application can be implemented within a managed runtime
                 system. We discuss the complexities of achieving
                 high-performing and portable execution across embedded
                 and desktop configurations. Furthermore, we demonstrate
                 that it is possible to achieve the QoS target of over
                 30 frames per second (FPS) by exploiting FPGA and GPGPU
                 acceleration transparently through the managed runtime
                 system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "VEE '17 conference proceedings.",
}

@Article{Kouetcha:2017:USP,
  author =       "Daniella Nguemalieu Kouetcha and Hamidr{\'e}za
                 Ram{\'e}zani and Nathalie Cohaut",
  title =        "Ultrafast scalable parallel algorithm for the radial
                 distribution function histogramming using {MPI} maps",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "73",
  number =       "4",
  pages =        "1629--1653",
  month =        apr,
  year =         "2017",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-016-1854-0",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Jun 24 10:31:33 MDT 2017",
  bibsource =    "http://link.springer.com/journal/11227/73/4;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Li:2017:PCO,
  author =       "Shigang Li and Yunquan Zhang and Torsten Hoefler",
  title =        "{Poster}: Cache-Oblivious {MPI} All-to-All
                 Communications on Many-Core Architectures",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "445--446",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3019025",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "In the many-core era, the performance of MPI
                 collectives is more dependent on the intra-node
                 communication component. However, the communication
                 algorithms generally inherit from the inter-node
                 version and ignore the cache complexity. We propose
                 cache-oblivious algorithms for MPI all-to-all
                 operations, in which data blocks are copied into the
                 receive buffers in Morton order to exploit data
                 locality. Experimental results on different many-core
                 architectures show that our cache-oblivious
                 implementations significantly outperform the naive
                 implementations based on shared heap and the highly
                 optimized MPI libraries.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Article{Losada:2017:ARV,
  author =       "Nuria Losada and Mar{\'\i}a J. Mart{\'\i}n and
                 Patricia Gonz{\'a}lez",
  title =        "Assessing resilient versus stop-and-restart
                 fault-tolerant solutions in {MPI} applications",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "73",
  number =       "1",
  pages =        "316--329",
  month =        jan,
  year =         "2017",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-016-1863-z",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Jun 24 10:31:31 MDT 2017",
  bibsource =    "http://link.springer.com/journal/11227/73/1;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Losada:2017:RMA,
  author =       "Nuria Losada and Iv{\'a}n Cores and Mar{\'\i}a J.
                 Mart{\'\i}n and Patricia Gonz{\'a}lez",
  title =        "Resilient {MPI} applications using an
                 application-level checkpointing framework and {ULFM}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "73",
  number =       "1",
  pages =        "100--113",
  month =        jan,
  year =         "2017",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-016-1629-7",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Jun 24 10:31:31 MDT 2017",
  bibsource =    "http://link.springer.com/journal/11227/73/1;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Maier:2017:OLD,
  author =       "Andrew J. Maier and Bruce F. Cockburn",
  title =        "Optimization of Low-Density Parity Check decoder
                 performance for {OpenCL} designs synthesized to
                 {FPGAs}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "107",
  number =       "??",
  pages =        "134--145",
  month =        sep,
  year =         "2017",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Aug 19 13:10:31 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731517301004",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Malakar:2017:DMO,
  author =       "Preeti Malakar and Venkatram Vishwanath",
  title =        "Data movement optimizations for independent {MPI}
                 {I/O} on the {Blue Gene/Q}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "61",
  number =       "??",
  pages =        "35--51",
  month =        jan,
  year =         "2017",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Feb 4 08:48:35 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S016781911630062X",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191/",
}

@Article{Manwade:2017:DFA,
  author =       "Karveer B. Manwade and Dinesh B. Kulkarni",
  title =        "Data Flow Analysis of {MPI} Program Using Dynamic
                 Analysis Technique with Partial Execution",
  journal =      j-SCPE,
  volume =       "18",
  number =       "4",
  pages =        "375--385",
  month =        "????",
  year =         "2017",
  CODEN =        "????",
  ISSN =         "1895-1767",
  ISSN-L =       "1895-1767",
  bibdate =      "Mon Jan 7 06:46:49 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/scpe.bib",
  URL =          "https://www.scpe.org/index.php/scpe/article/view/1335",
  acknowledgement = ack-nhfb,
  fjournal =     "Scalable Computing: Practice and Experience",
  journal-URL =  "http://www.scpe.org/",
}

@Article{Marin:2017:ERF,
  author =       "Manuel Marin and David Defour and Federico Milano",
  title =        "An Efficient Representation Format for Fuzzy Intervals
                 Based on Symmetric Membership Functions",
  journal =      j-TOMS,
  volume =       "43",
  number =       "3",
  pages =        "23:1--23:??",
  month =        jan,
  year =         "2017",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2939364",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Wed Oct 4 10:55:07 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/citation.cfm?id=2939364",
  abstract =     "This article addresses the execution cost of
                 arithmetic operations with a focus on fuzzy arithmetic.
                 Thanks to an appropriate representation format for
                 fuzzy intervals, we show that it is possible to halve
                 the number of operations and divide by 2 to 8 the
                 memory requirements compared to conventional solutions.
                 In addition, we demonstrate the benefit of some
                 hardware features encountered in today's accelerators
                 (GPU) such as static rounding, memory usage,
                 instruction-level parallelism (ILP), and thread-level
                 parallelism (TLP). We then describe a library of fuzzy
                 arithmetic operations written in CUDA and C++. The
                 library is evaluated against traditional approaches
                 using compute-bound and memory-bound benchmarks on
                 Nvidia GPUs, with an observed performance gain of 2 to
                 20.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Matheou:2017:DDC,
  author =       "George Matheou and Paraskevas Evripidou",
  title =        "Data-Driven Concurrency for High Performance
                 Computing",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "53:1--53:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3162014",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this work, we utilize dynamic dataflow/data-driven
                 techniques to improve the performance of high
                 performance computing (HPC) systems. The proposed
                 techniques are implemented and evaluated through an
                 efficient, portable, and robust programming framework
                 that enables data-driven concurrency on HPC systems.
                 The proposed framework is based on data-driven
                 multithreading (DDM), a hybrid control-flow/dataflow
                 model that schedules threads based on data availability
                 on sequential processors. The proposed framework was
                 evaluated using several benchmarks, with different
                 characteristics, on two different systems: a 4-node AMD
                 system with a total of 128 cores and a 64-node Intel
                 HPC system with a total of 768 cores. The performance
                 evaluation shows that the proposed framework scales
                 well and tolerates scheduling overheads and memory
                 latencies effectively. We also compare our framework to
                 MPI, DDM-VM, and OmpSs@Cluster. The comparison results
                 show that the proposed framework obtains comparable or
                 better performance.",
  acknowledgement = ack-nhfb,
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Meister:2017:PME,
  author =       "Oliver Meister and Kaveh Rahnema and Michael Bader",
  title =        "Parallel Memory-Efficient Adaptive Mesh Refinement on
                 Structured Triangular Meshes with Billions of Grid
                 Cells",
  journal =      j-TOMS,
  volume =       "43",
  number =       "3",
  pages =        "19:1--19:27",
  month =        jan,
  year =         "2017",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2947668",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Wed Oct 4 10:55:07 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/citation.cfm?id=2947668",
  abstract =     "We present sam(oa)$^2$, a software package for a
                 dynamically adaptive, parallel solution of 2D partial
                 differential equations on triangular grids created via
                 newest vertex bisection. An element order imposed by
                 the Sierpinski space-filling curve provides an
                 algorithm for grid generation, refinement, and
                 traversal that is inherently memory efficient. Based
                 purely on stack and stream data structures, it
                 completely avoids random memory access. Using an
                 element-oriented data view suitable for local
                 operators, concrete simulation scenarios are
                 implemented based on control loops and event hooks,
                 which hide the complexity of the underlying traversal
                 scheme. Two case studies are presented: two-phase flow
                 in heterogeneous porous media and tsunami wave
                 propagation, demonstrated on the Tohoku tsunami 2011 in
                 Japan. sam(oa)$^2$ features hybrid MPI+OpenMP
                 parallelization based on the Sierpinski order induced
                 on the elements. Sections defined by contiguous grid
                 cells define atomic tasks for OpenMP work sharing and
                 stealing, as well as for migration of grid cells
                 between MPI processes. Using optimized communication
                 and load balancing algorithms, sam(oa)$^2$ achieves 88\%
                 strong scaling efficiency from 16 to 512 cores and 92\%
                 efficiency in a weak scaling test on 8,192 cores with
                 10 billion elements --- all tests including adaptive mesh
                 refinement and load balancing in each time step.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Mendonca:2017:DAA,
  author =       "Gleison Mendon{\c{c}}a and Breno Guimar{\~a}es and
                 P{\'e}ricles Alves and M{\'a}rcio Pereira and Guido
                 Ara{\'u}jo and Fernando Magno Quint{\~a}o Pereira",
  title =        "{DawnCC}: Automatic Annotation for Data Parallelism
                 and Offloading",
  journal =      j-TACO,
  volume =       "14",
  number =       "2",
  pages =        "13:1--13:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3084540",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:59 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Directive-based programming models, such as OpenACC
                 and OpenMP, allow developers to convert a sequential
                 program into a parallel one with minimum human
                 intervention. However, inserting pragmas into
                 production code is a difficult and error-prone task,
                 often requiring familiarity with the target program.
                 This difficulty restricts the ability of developers to
                 annotate code that they have not written themselves.
                 This article provides a suite of compiler-related
                 methods to mitigate this problem. Such techniques rely
                 on symbolic range analysis, a well-known static
                 technique, to achieve two purposes: populate source
                 code with data transfer primitives and to disambiguate
                 pointers that could hinder automatic parallelization
                 due to aliasing. We have materialized our ideas into a
                 tool, DawnCC, which can be used stand-alone or through
                 an online interface. To demonstrate its effectiveness,
                 we show how DawnCC can annotate the programs available
                 in PolyBench without any intervention from users. Such
                 annotations lead to speedups of over $ 100 \times $ in
                 an Nvidia architecture and over $ 50 \times $ in an ARM
                 architecture.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Montella:2017:VCB,
  author =       "Raffaele Montella and Giulio Giunta and Giuliano
                 Laccetti and Marco Lapegna and Carlo Palmieri and
                 Carmine Ferraro and Valentina Pelliccia and Cheol-Ho
                 Hong and Ivor Spence and Dimitrios S. Nikolopoulos",
  title =        "On the Virtualization of {CUDA} Based {GPU} Remoting
                 on {ARM} and x86 Machines in the {GVirtuS} Framework",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "45",
  number =       "5",
  pages =        "1142--1163",
  month =        oct,
  year =         "2017",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-016-0462-1",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Nov 18 09:27:28 MST 2017",
  bibsource =    "http://link.springer.com/journal/10766/45/5;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Moreira:2017:FCR,
  author =       "Rubens E. A. Moreira and Sylvain Collange and Fernando
                 Magno Quint{\~a}o Pereira",
  title =        "Function Call Re-Vectorization",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "313--326",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3018751",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Programming languages such as C for CUDA, OpenCL or
                 ISPC have contributed to increase the programmability
                 of SIMD accelerators and graphics processing units.
                 However, these languages still lack the flexibility
                 offered by low-level SIMD programming on explicit
                 vectors. To close this expressiveness gap while
                 preserving performance, this paper introduces the
                 notion of Function Call Re-Vectorization (CREV). CREV
                 allows changing the dimension of vectorization during
                 the execution of a kernel, exposing it as a nested
                 parallel kernel call.
                 CREV affords programmability close to dynamic
                 parallelism, a feature that allows the invocation of
                 kernels from inside kernels, but at much lower cost. In
                 this paper, we present a formal semantics of CREV, and
                 an implementation of it on the ISPC compiler. We have
                 used CREV to implement some classic algorithms,
                 including string matching, depth first search and
                 Bellman-Ford, with minimum effort. These algorithms,
                 once compiled by ISPC to Intel-based vector
                 instructions, are as fast as state-of-the-art
                 implementations, yet much simpler. Thus, CREV gives
                 developers the elegance of dynamic programming, and the
                 performance of explicit SIMD programming.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Article{Mossaiby:2017:OIH,
  author =       "F. Mossaiby and A. Shojaei and M. Zaccariotto and U.
                 Galvanetto",
  title =        "{OpenCL} implementation of a high performance {$3$D}
                 Peridynamic model on graphics accelerators",
  journal =      j-COMPUT-MATH-APPL,
  volume =       "74",
  number =       "8",
  pages =        "1856--1870",
  day =          "15",
  month =        oct,
  year =         "2017",
  CODEN =        "CMAPDK",
  ISSN =         "0898-1221 (print), 1873-7668 (electronic)",
  ISSN-L =       "0898-1221",
  bibdate =      "Sat Jan 13 11:04:24 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computmathappl2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0898122117304030",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers and Mathematics with Applications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/08981221",
}

@Article{Neugebauer:2017:PAR,
  author =       "Olaf Neugebauer and Michael Engel and Peter Marwedel",
  title =        "A parallelization approach for resource-restricted
                 embedded heterogeneous {MPSoCs} inspired by {OpenMP}",
  journal =      j-J-SYST-SOFTW,
  volume =       "125",
  number =       "??",
  pages =        "439--448",
  month =        mar,
  year =         "2017",
  CODEN =        "JSSODM",
  ISSN =         "0164-1212 (print), 1873-1228 (electronic)",
  ISSN-L =       "0164-1212",
  bibdate =      "Sat Feb 4 12:20:39 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jsystsoftw.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0164121216301534",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Systems and Software",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01641212/",
}

@Article{Nguyen:2017:ATM,
  author =       "Tan Nguyen and Pietro Cicotti and Eric Bylaska and Dan
                 Quinlan and Scott Baden",
  title =        "Automatic translation of {MPI} source into a
                 latency-tolerant, data-driven form",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "106",
  number =       "??",
  pages =        "1--13",
  month =        aug,
  year =         "2017",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Aug 19 13:10:31 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731517300771",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Omar:2017:PSF,
  author =       "Cyrus Omar and Jonathan Aldrich",
  title =        "Programmable semantic fragments: the design and
                 implementation of {\tt typy}",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "3",
  pages =        "81--92",
  month =        mar,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3093335.2993245",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:15 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "This paper introduces typy, a statically typed
                 programming language embedded by reflection into
                 Python. typy features a fragmentary semantics, i.e. it
                 delegates semantic control over each term, drawn from
                 Python's fixed concrete and abstract syntax, to some
                 contextually relevant user-defined semantic fragment.
                 The delegated fragment programmatically (1) typechecks
                 the term (following a bidirectional protocol); and (2)
                 assigns dynamic meaning to the term by computing a
                 translation to Python. We argue that this design is
                 expressive with examples of fragments that express the
                 static and dynamic semantics of (1) functional records;
                 (2) labeled sums (with nested pattern matching a la
                 ML); (3) a variation on JavaScript's prototypal object
                 system; and (4) typed foreign interfaces to Python and
                 OpenCL. These semantic structures are, or would need to
                 be, defined primitively in conventionally structured
                 languages. We further argue that this design is
                 compositionally well-behaved. It avoids the expression
                 problem and the problems of grammar composition because
                 the syntax is fixed. Moreover, programs are
                 semantically stable under fragment composition (i.e.
                 defining a new fragment will not change the meaning of
                 existing program components.)",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "GPCE '16 conference proceedings.",
}

@Article{Pereira:2017:SBC,
  author =       "Phillipe Pereira and Higo Albuquerque and Isabela da
                 Silva and Hendrio Marques and Felipe Monteiro and
                 Ricardo Ferreira and Lucas Cordeiro",
  title =        "{SMT}-based context-bounded model checking for {CUDA}
                 programs",
  journal =      j-CCPE,
  volume =       "29",
  number =       "22",
  pages =        "??--??",
  day =          "25",
  month =        nov,
  year =         "2017",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3934",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Dec 30 09:11:59 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Qawasmeh:2017:PPR,
  author =       "Ahmad Qawasmeh and Maxime R. Hugues and Henri Calandra
                 and Barbara M. Chapman",
  title =        "Performance portability in reverse time migration and
                 seismic modelling via {OpenACC}",
  journal =      j-IJHPCA,
  volume =       "31",
  number =       "5",
  pages =        "422--440",
  month =        sep,
  year =         "2017",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Sat Jan 6 10:31:59 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Rathgeber:2017:FAF,
  author =       "Florian Rathgeber and David A. Ham and Lawrence
                 Mitchell and Michael Lange and Fabio Luporini and
                 Andrew T. T. McRae and Gheorghe-Teodor Bercea and
                 Graham R. Markall and Paul H. J. Kelly",
  title =        "{Firedrake}: Automating the Finite Element Method by
                 Composing Abstractions",
  journal =      j-TOMS,
  volume =       "43",
  number =       "3",
  pages =        "24:1--24:??",
  month =        jan,
  year =         "2017",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2998441",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Wed Oct 4 10:55:07 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/python.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/citation.cfm?id=2998441",
  abstract =     "Firedrake is a new tool for automating the numerical
                 solution of partial differential equations. Firedrake
                 adopts the domain-specific language for the finite
                 element method of the FEniCS project, but with a pure
                 Python runtime-only implementation centered on the
                 composition of several existing and new abstractions
                 for particular aspects of scientific computing. The
                 result is a more complete separation of concerns that
                 eases the incorporation of separate contributions from
                 computer scientists, numerical analysts, and
                 application specialists. These contributions may add
                 functionality or improve performance. Firedrake
                 benefits from automatically applying new optimizations.
                 This includes factorizing mixed function spaces,
                 transforming and vectorizing inner loops, and
                 intrinsically supporting block matrix operations.
                 Importantly, Firedrake presents a simple public API for
                 escaping the UFL abstraction. This allows users to
                 implement common operations that fall outside of pure
                 variational formulations, such as flux limiters.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Rejitha:2017:EPC,
  author =       "R. S. Rejitha and Shajulin Benedict and Suja A. Alex
                 and Shany Infanto",
  title =        "Energy prediction of {CUDA} application instances
                 using dynamic regression models",
  journal =      j-COMPUTING,
  volume =       "99",
  number =       "8",
  pages =        "765--790",
  month =        aug,
  year =         "2017",
  CODEN =        "CMPTA2",
  DOI =          "https://doi.org/10.1007/s00607-016-0534-5",
  ISSN =         "0010-485X (print), 1436-5057 (electronic)",
  ISSN-L =       "0010-485X",
  bibdate =      "Fri Feb 9 14:54:09 MST 2018",
  bibsource =    "http://link.springer.com/journal/607/99/8;
                 http://www.math.utah.edu/pub/tex/bib/computing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing",
  journal-URL =  "http://link.springer.com/journal/607",
}

@Article{Rizzardi:2017:ATS,
  author =       "Mariarosaria Rizzardi",
  title =        "{Algorithm 981}: {Talbot Suite DE}: Application of
                 Modified {Talbot}'s Method to Solve Differential
                 Problems",
  journal =      j-TOMS,
  volume =       "44",
  number =       "2",
  pages =        "18:1--18:23",
  month =        sep,
  year =         "2017",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3089248",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Tue Sep 19 17:19:59 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/matlab.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "http://dl.acm.org/citation.cfm?id=3089248",
  abstract =     "In order to solve a differential problem, the Laplace
                 Transform method, when applicable, replaces the problem
                 with a simpler one; the solution is obtained by solving
                 the new problem and then by computing the inverse
                 Laplace Transform of this function. In a numerical
                 context, since the solution of the transformed problem
                 consists of a sequence of Laplace Transform samples,
                 most of the software for the numerical inversion cannot
                 be used since the transform, among parameters, must be
                 passed as a function. To fill this gap, we present
                 Talbot Suite DE, a C software collection for Laplace
                 Transform inversions, specifically designed for these
                 problems and based on Talbot's method. It contains both
                 sequential and parallel implementations; the latter is
                 accomplished by means of OpenMP. We also report some
                 performance results. Aimed at non-expert users, the
                 software is equipped with several examples and a User
                 Guide that includes the external documentation,
                 explains how to use all the sample code, and reports
                 its results about accuracy and efficiency. Some
                 examples are entirely in C and others combine different
                 programming languages (C/MATLAB, C/FORTRAN). The User
                 Guide also contains useful hints to avoid possible
                 errors issued during the compilation or execution of
                 mixed-language code.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Russo:2017:MPG,
  author =       "Igor L. S. Russo and Heder S. Bernardino and Helio J.
                 C. Barbosa",
  title =        "A massively parallel Grammatical Evolution technique
                 with {OpenCL}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "109",
  number =       "??",
  pages =        "333--349",
  month =        nov,
  year =         "2017",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Aug 19 13:10:32 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S074373151730206X",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Sato:2017:NIT,
  author =       "Kento Sato and Dong H. Ahn and Ignacio Laguna and
                 Gregory L. Lee and Martin Schulz and Christopher M.
                 Chambreau",
  title =        "Noise Injection Techniques to Expose Subtle and
                 Unintended Message Races",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "89--101",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3018767",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Debugging intermittently occurring bugs within MPI
                 applications is challenging, and message races, a
                 condition in which two or more sends race to match with
                 a receive, are one of the common root causes. Many
                 debugging tools have been proposed to help programmers
                 resolve them, but their runtime interference perturbs
                 the timing such that subtle races often cannot be
                 reproduced with debugging tools. We present novel noise
                 injection techniques to expose message races even under
                 a tool's control. We first formalize this race problem
                 in the context of non-deterministic parallel
                 applications and use this analysis to determine an
                 effective noise-injection strategy to uncover them. We
                 codified these techniques in NINJA (Noise INJection
                 Agent) that exposes these races without modification to
                 the application. Our evaluations on synthetic cases as
                 well as a real-world bug in Hypre-2.10.1 show that
                 NINJA significantly helps expose races.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Article{Schardl:2017:TEF,
  author =       "Tao B. Schardl and William S. Moses and Charles E.
                 Leiserson",
  title =        "{Tapir}: Embedding Fork-Join Parallelism into {LLVM}'s
                 Intermediate Representation",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "249--265",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3018758",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "This paper explores how fork-join parallelism, as
                 supported by concurrency platforms such as Cilk and
                 OpenMP, can be embedded into a compiler's intermediate
                 representation (IR). Mainstream compilers typically
                 treat parallel linguistic constructs as syntactic sugar
                 for function calls into a parallel runtime. These calls
                 prevent the compiler from performing optimizations
                 across parallel control constructs. Remedying this
                 situation is generally thought to require an extensive
                 reworking of compiler analyses and code transformations
                 to handle parallel semantics. Tapir is a compiler IR
                 that represents logically parallel tasks asymmetrically
                 in the program's control flow graph. Tapir allows the
                 compiler to optimize across parallel control constructs
                 with only minor changes to its existing analyses and
                 code transformations. To prototype Tapir in the LLVM
                 compiler, for example, we added or modified about 6000
                 lines of LLVM's 4-million-line codebase. Tapir enables
                 LLVM's existing compiler optimizations for serial code
                 --- including loop-invariant-code motion,
                 common-subexpression elimination, and tail-recursion
                 elimination --- to work with parallel control
                 constructs such as spawning and parallel loops. Tapir
                 also supports parallel optimizations such as loop
                 scheduling.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Article{Schmitt:2017:SCP,
  author =       "Felix Schmitt and Robert Dietrich and Guido
                 Juckeland",
  title =        "Scalable critical-path analysis and optimization
                 guidance for hybrid {MPI--CUDA} applications",
  journal =      j-IJHPCA,
  volume =       "31",
  number =       "6",
  pages =        "485--498",
  month =        nov,
  year =         "2017",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Sat Jan 6 10:31:59 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Sharma:2017:PDR,
  author =       "Prateek Sharma and David Irwin and Prashant Shenoy",
  title =        "Portfolio-driven Resource Management for Transient
                 Cloud Servers",
  journal =      j-POMACS,
  volume =       "1",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3084442",
  ISSN =         "2476-1249",
  ISSN-L =       "2476-1249",
  bibdate =      "Fri Jun 16 09:11:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pomacs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://dl.acm.org/citation.cfm?id=3084442",
  abstract =     "Cloud providers have begun to offer their surplus
                 capacity in the form of low-cost transient servers,
                 which can be revoked unilaterally at any time. While
                 the low cost of transient servers makes them attractive
                 for a wide range of applications, such as data
                 processing and scientific computing, failures due to
                 server revocation can severely degrade application
                 performance. Since different transient server types
                 offer different cost and availability tradeoffs, we
                 present the notion of server portfolios that is based
                 on financial portfolio modeling. Server portfolios
                 enable construction of an 'optimal' mix of servers to
                 meet an application's sensitivity to cost and
                 revocation risk. We implement model-driven portfolios
                 in a system called ExoSphere, and show how diverse
                 applications can use portfolios and
                 application-specific policies to gracefully handle
                 transient servers. We show that ExoSphere enables
                 widely-used parallel applications such as Spark, MPI,
                 and BOINC to be made transiency-aware with modest
                 effort. Our experiments show that allowing the
                 applications to use suitable transiency-aware policies,
                 ExoSphere is able to achieve 80\% cost savings when
                 compared to on-demand servers and greatly reduces
                 revocation risk compared to existing approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Proceedings of the ACM on Measurement and Analysis of
                 Computing Systems (POMACS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J1567",
}

@Article{Silla:2017:BRG,
  author =       "Federico Silla and Sergio Iserte and Carlos Rea{\~n}o
                 and Javier Prades",
  title =        "On the benefits of the remote {GPU} virtualization
                 mechanism: The {rCUDA} case",
  journal =      j-CCPE,
  volume =       "29",
  number =       "13",
  pages =        "",
  day =          "10",
  month =        jul,
  year =         "2017",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4072",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Jul 24 08:22:38 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Singh:2017:EER,
  author =       "Amit Kumar Singh and Alok Prakash and Karunakar Reddy
                 Basireddy and Geoff V. Merrett and Bashir M.
                 Al-Hashimi",
  title =        "Energy-Efficient Run-Time Mapping and Thread
                 Partitioning of Concurrent {OpenCL} Applications on
                 {CPU--GPU MPSoCs}",
  journal =      j-TECS,
  volume =       "16",
  number =       "5s",
  pages =        "147:1--147:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3126548",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Oct 17 18:16:33 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "Heterogeneous Multi-Processor Systems-on-Chips
                 (MPSoCs) containing CPU and GPU cores are typically
                 required to execute applications concurrently. However,
                 as will be shown in this paper, existing approaches are
                 not well suited for concurrent applications as they are
                 developed either by considering only a single
                 application or they do not exploit both CPU and GPU
                 cores at the same time. In this paper, we propose an
                 energy-efficient run-time mapping and thread
                 partitioning approach for executing concurrent OpenCL
                 applications on both CPU and GPU cores while satisfying
                 performance requirements. Depending upon the
                 performance requirements, for each concurrently
                 executing application, the mapping process finds the
                 appropriate number of CPU cores and operating
                 frequencies of CPU and GPU cores, and the partitioning
                 process identifies an efficient partitioning of the
                 applications' threads between CPU and GPU cores. We
                 validate the proposed approach experimentally on the
                 Odroid-XU3 hardware platform with various mixes of
                 applications from the Polybench benchmark suite.
                 Additionally, a case-study is performed with a
                 real-world application SLAMBench. Results show an
                 average energy saving of 32\% compared to existing
                 approaches while still satisfying the performance
                 requirements.",
  acknowledgement = ack-nhfb,
  articleno =    "147",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Sotomayor:2017:ACG,
  author =       "Rafael Sotomayor and Luis Miguel Sanchez and Javier
                 Garcia Blas and Javier Fernandez and J. Daniel Garcia",
  title =        "Automatic {CPU\slash GPU} Generation of
                 Multi-versioned {OpenCL} Kernels for {C++} Scientific
                 Applications",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "45",
  number =       "2",
  pages =        "262--282",
  month =        apr,
  year =         "2017",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-016-0425-6",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Mon Mar 13 15:25:22 MDT 2017",
  bibsource =    "http://link.springer.com/journal/10766/45/2;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s10766-016-0425-6",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Steele:2017:UBP,
  author =       "Guy L. {Steele, Jr.} and Jean-Baptiste Tristan",
  title =        "Using Butterfly-Patterned Partial Sums to Draw from
                 Discrete Distributions",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "341--355",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3018757",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We describe a SIMD technique for drawing values from
                 multiple discrete distributions, such as sampling from
                 the random variables of a mixture model, that avoids
                 computing a complete table of partial sums of the
                 relative probabilities. A table of alternate
                 (``butterfly-patterned'') form is faster to compute,
                 making better use of coalesced memory accesses; from
                 this table, complete partial sums are computed on the
                 fly during a binary search. Measurements using CUDA 7.5
                 on an NVIDIA Titan Black GPU show that this technique
                 makes an entire machine-learning application that uses
                 a Latent Dirichlet Allocation topic model with 1024
                 topics about 13\% faster (when using
                 single-precision floating-point data) or about 35\%
                 faster (when using double-precision floating-point
                 data) than doing a straightforward matrix transposition
                 after using coalesced accesses.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Article{Szo:2017:PET,
  author =       "M{\'a}t{\'e} Sz{\H{o}}ke and Tam{\'a}s Istv{\'a}n
                 J{\'o}zsa and {\'A}d{\'a}m Kolesz{\'a}r and Irene
                 Moulitsas and L{\'a}szl{\'o} K{\"o}n{\"o}zsy",
  title =        "Performance Evaluation of a Two-Dimensional Lattice
                 {Boltzmann} Solver Using {CUDA} and {PGAS} {UPC} Based
                 Parallelisation",
  journal =      j-TOMS,
  volume =       "44",
  number =       "1",
  pages =        "8:1--8:22",
  month =        jul,
  year =         "2017",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3085590",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Wed Oct 4 10:55:07 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/citation.cfm?id=3085590",
  abstract =     "The Unified Parallel C (UPC) language from the
                 Partitioned Global Address Space (PGAS) family unifies
                 the advantages of shared and local memory spaces and
                 offers a relatively straightforward code
                 parallelisation with the Central Processing Unit (CPU).
                 In contrast, the Compute Unified Device Architecture
                 (CUDA) development kit gives a tool to make use of the
                 Graphics Processing Unit (GPU). We provide a detailed
                 comparison between these novel techniques through the
                 parallelisation of a two-dimensional lattice Boltzmann
                 method based fluid flow solver. Our comparison between
                 the CUDA and UPC parallelisation takes into account the
                 required conceptual effort, the performance gain, and
                 the limitations of the approaches from the application
                 oriented developers' point of view. We demonstrated
                 that UPC led to competitive efficiency with the local
                 memory implementation. However, the performance of the
                 shared memory code fell behind our expectations, and we
                 concluded that the investigated UPC compilers could not
                 efficiently treat the shared memory space. The CUDA
                 implementation proved to be more complex compared to
                 the UPC approach mainly because of the complicated
                 memory structure of the graphics card which also makes
                 GPUs suitable for the parallelisation of the lattice
                 Boltzmann method.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Szoke:2017:PET,
  author =       "M{\'a}t{\'e} Sz{\H{o}}ke and Tam{\'a}s Istv{\'a}n
                 J{\'o}zsa and {\'A}d{\'a}m Kolesz{\'a}r and Irene
                 Moulitsas and L{\'a}szl{\'o} K{\"o}n{\"o}zsy",
  title =        "Performance Evaluation of a Two-Dimensional Lattice
                 {Boltzmann} Solver Using {CUDA} and {PGAS UPC} Based
                 Parallelisation",
  journal =      j-TOMS,
  volume =       "44",
  number =       "1",
  pages =        "8:1--8:22",
  month =        jul,
  year =         "2017",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3085590",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Fri Jul 14 16:39:28 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "The Unified Parallel C (UPC) language from the
                 Partitioned Global Address Space (PGAS) family unifies
                 the advantages of shared and local memory spaces and
                 offers a relatively straightforward code
                 parallelisation with the Central Processing Unit (CPU).
                 In contrast, the Compute Unified Device Architecture
                 (CUDA) development kit gives a tool to make use of the
                 Graphics Processing Unit (GPU). We provide a detailed
                 comparison between these novel techniques through the
                 parallelisation of a two-dimensional lattice Boltzmann
                 method based fluid flow solver. Our comparison between
                 the CUDA and UPC parallelisation takes into account the
                 required conceptual effort, the performance gain, and
                 the limitations of the approaches from the application
                 oriented developers' point of view. We demonstrated that
                 UPC led to competitive efficiency with the local memory
                 implementation. However, the performance of the shared
                 memory code fell behind our expectations, and we
                 concluded that the investigated UPC compilers could not
                 efficiently treat the shared memory space. The CUDA
                 implementation proved to be more complex compared to
                 the UPC approach mainly because of the complicated
                 memory structure of the graphics card which also makes
                 GPUs suitable for the parallelisation of the lattice
                 Boltzmann method.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Takafuji:2017:CCC,
  author =       "Daisuke Takafuji and Koji Nakano and Yasuaki Ito and
                 Jacir Bordim",
  title =        "{C2CU}: a {CUDA--C} program generator for bulk
                 execution of a sequential algorithm",
  journal =      j-CCPE,
  volume =       "29",
  number =       "17",
  pages =        "",
  day =          "10",
  month =        sep,
  year =         "2017",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4022",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Sep 4 17:02:00 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Taylor:2017:AOO,
  author =       "Ben Taylor and Vicent Sanz Marco and Zheng Wang",
  title =        "Adaptive optimization for {OpenCL} programs on
                 embedded heterogeneous systems",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "4",
  pages =        "11--20",
  month =        may,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3140582.3081040",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:15 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Heterogeneous multi-core architectures consisting of
                 CPUs and GPUs are commonplace in today's embedded
                 systems. These architectures offer potential for energy
                 efficient computing if the application task is mapped
                 to the right core. Realizing such potential is
                 challenging due to the complex and evolving nature of
                 hardware and applications. This paper presents an
                 automatic approach to map OpenCL kernels onto
                 heterogeneous multi-cores for a given optimization
                 criterion --- whether it is faster runtime, lower
                 energy consumption or a trade-off between them. This is
                 achieved by developing a machine learning based
                 approach to predict which processor to use to run the
                 OpenCL kernel and the host program, and at what
                 frequency the processor should operate. Instead of
                 hand-tuning a model for each optimization metric, we
                 use machine learning to develop a unified framework
                 that first automatically learns the optimization
                 heuristic for each metric off-line, then uses the
                 learned knowledge to schedule OpenCL kernels at runtime
                 based on code and runtime information of the program.
                 We apply our approach to a set of representative OpenCL
                 benchmarks and evaluate it on an ARM big.LITTLE mobile
                 platform. Our approach achieves over 93\% of the
                 performance delivered by a perfect predictor. We obtain,
                 on average, 1.2x, 1.6x, and 1.8x improvement
                 respectively for runtime, energy consumption and the
                 energy delay product when compared to a comparative
                 heterogeneous-aware OpenCL task mapping scheme.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "LCTES '17 conference proceedings.",
}

@Article{Utterback:2017:POR,
  author =       "Robert Utterback and Kunal Agrawal and I-Ting Angelina
                 Lee and Milind Kulkarni",
  title =        "Processor-Oblivious Record and Replay",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "145--161",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3018764",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Record-and-replay systems are useful tools for
                 debugging non-deterministic parallel programs by first
                 recording an execution and then replaying that
                 execution to produce the same access pattern. Existing
                 record-and-replay systems generally target thread-based
                 execution models, and record the behaviors and
                 interleavings of individual threads. Dynamic
                 multithreaded languages and libraries, such as the Cilk
                 family, OpenMP, TBB, etc., do not have a notion of
                 threads. Instead, these languages provide a
                 processor-oblivious model of programming, where
                 programs expose task-parallelism using high-level
                 constructs such as spawn/sync without regard to the
                 number of threads/cores available to run the program.
                 Thread-based record-and-replay would violate the
                 processor-oblivious nature of these programs, as they
                 incorporate the number of threads into the recorded
                 information, constraining the replayed execution to the
                 same number of threads. In this paper, we present a
                 processor-oblivious record-and-replay scheme for such
                 languages where record and replay can use different
                 number of processors and both are scheduled using work
                 stealing. We provide theoretical guarantees for our
                 record and replay scheme --- namely that record is
                 optimal for programs with one lock and replay is
                 near-optimal for all cases. In addition, we implemented
                 this scheme in the Cilk Plus runtime system and our
                 evaluation indicates that processor-obliviousness does
                 not cause substantial overheads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Book{vanderPas:2017:UON,
  author =       "Ruud van der Pas",
  title =        "Using {OpenMP} --- the next step: affinity,
                 accelerators, tasking, and {SIMD}",
  publisher =    pub-MIT,
  address =      pub-MIT:adr,
  pages =        "xxi + 365",
  year =         "2017",
  ISBN =         "0-262-53478-9 (paperback)",
  ISBN-13 =      "978-0-262-53478-9 (paperback)",
  LCCN =         "QA76.642 .P427 2017",
  bibdate =      "Sat Oct 5 07:54:47 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  series =       "Scientific and engineering computation",
  abstract =     "This book offers an up-to-date, practical tutorial on
                 advanced features in the widely used OpenMP parallel
                 programming model. Building on the previous volume,
                 Using OpenMP: Portable Shared Memory Parallel
                 Programming (MIT Press), this book goes beyond the
                 fundamentals to focus on what has been changed and
                 added to OpenMP since the 2.5 specifications. It
                 emphasizes four major and advanced areas: thread
                 affinity (keeping threads close to their data),
                 accelerators (special hardware to speed up certain
                 operations), tasking (to parallelize algorithms with a
                 less regular execution flow), and SIMD (hardware
                 assisted operations on vectors). As in the earlier
                 volume, the focus is on practical usage, with major new
                 features primarily introduced by example. Examples are
                 restricted to C and C++, but are straightforward enough
                 to be understood by Fortran programmers. After a brief
                 recap of OpenMP 2.5, the book reviews enhancements
                 introduced since 2.5. It then discusses in detail
                 tasking, a major functionality enhancement; Non-Uniform
                 Memory Access (NUMA) architectures, supported by
                 OpenMP; SIMD, or Single Instruction Multiple Data;
                 heterogeneous systems, a new parallel programming model
                 to offload computation to accelerators; and the
                 expected further development of OpenMP.",
  acknowledgement = ack-nhfb,
  subject =      "Parallel programming (Computer science); Application
                 program interfaces (Computer software); OpenMP
                 (Application program interface)",
  tableofcontents = "Intro \\
                 Contents \\
                 Series Foreword \\
                 Foreword \\
                 Preface \\
                 1 A Recap of OpenMP 2.5 \\
                 1.1 OpenMP Directives and Syntax \\
                 1.2 Creating a Parallel Program with OpenMP \\
                 1.2.1 The Parallel Region \\
                 1.2.2 The OpenMP Execution Model \\
                 1.2.3 The OpenMP Memory Model \\
                 1.3 The Worksharing Constructs \\
                 1.3.1 The Loop Construct \\
                 1.3.2 The Sections Construct \\
                 1.3.3 The Single Construct \\
                 1.3.4 The Fortran Workshare Construct \\
                 1.3.5 The Combined Worksharing Constructs \\
                 1.4 The Master Construct \\
                 1.5 Nested Parallelism \\
                 1.6 Synchronization Constructs \\
                 1.6.1 The Barrier Construct \\
                 1.6.2 The Critical Construct \\
                 1.6.3 The Atomic Construct \\
                 1.6.4 The Ordered Construct \\
                 1.7 The OpenMP 2.5 Environment Variables \\
                 1.8 The OpenMP 2.5 Runtime Functions \\
                 1.9 Internal Control Variables in OpenMP \\
                 1.10 Concluding Remarks \\
                 2 New Features in OpenMP \\
                 2.1 Enhancements to Existing Constructs \\
                 2.1.1 The Schedule Clause \\
                 2.1.2 The If Clause \\
                 2.1.3 The Collapse Clause \\
                 2.1.4 The Linear Clause \\
                 2.1.5 The Critical Construct \\
                 2.1.6 The Atomic Construct \\
                 2.2 New Environment Variables \\
                 2.3 New Runtime Functions \\
                 2.3.1 Runtime Functions for Thread Management, Thread
                 Scheduling, and Nested Parallelism \\
                 2.3.2 Runtime Functions for Tasking, Cancellation, and
                 Thread Affinity \\
                 2.3.3 Runtime Functions for Locking \\
                 2.3.4 Runtime Functions for Heterogeneous Systems \\
                 2.3.5 Usage Examples of the New Runtime Functions \\
                 2.4 New Functionality \\
                 2.4.1 Changed Ownership of Locks \\
                 2.4.2 Cancellation \\
                 2.4.3 User-Defined Reduction \\
                 2.4.4 The Doacross Loop \\
                 2.5 Concluding Remarks \\
                 3 Tasking \\
                 3.1 Hello Task \\
                 3.1.1 Parallelizing a Palindrome \\
                 3.1.2 Parallelizing a Sentence with a Palindrome \\
                 3.1.3 Closing Comments on the Palindrome Example \\
                 3.2 Using Tasks to Parallelize a Linked List \\
                 3.2.1 The Sequential Version of the Linked List Program
                 \\
                 3.2.2 The Parallel Version of the Linked List Program
                 \\
                 3.2.3 Closing Comments on the Linked List Example \\
                 3.3 Sorting Things Out with Tasks \\
                 3.3.1 The Sequential Quicksort Algorithm \\
                 3.3.2 The OpenMP Quicksort Algorithm \\
                 3.3.3 Fine-Tuning the OpenMP Quicksort Algorithm \\
                 3.3.4 Closing Comments on the OpenMP Quicksort
                 Algorithm \\
                 3.4 Overlapping I/O and Computations Using Tasks \\
                 3.4.1 Using Tasks and Task Dependences \\
                 3.4.2 Using the Taskloop Construct \\
                 3.4.3 Closing Comments on the Pipeline Example \\
                 3.5 The Data Environment with Tasks \\
                 3.6 What is a Task? \\
                 3.7 Task Creation, Synchronization, and Scheduling \\
                 3.8 The Taskloop Construct \\
                 3.9 Concluding Remarks \\
                 4 Thread Affinity \\
                 4.1 The Characteristics of a cc-NUMA Architecture \\
                 4.2 First Touch Data Placement \\
                 4.2.1 The Pros and Cons of First Touch Data Placement
                 \\
                 4.2.2 How to Exploit the First Touch Policy \\
                 4.3 The Need for Thread Affinity Support \\
                 4.4 The OpenMP Thread Affinity Philosophy \\
                 4.5 The OpenMP Places Concept \\
                 4.5.1 Defining OpenMP Places Using Sets with Numbers
                 \\
                 4.5.2 The OpenMP Place List \\
                 4.5.3 Defining OpenMP Places Using Abstract Names",
}

@Article{Vargas-Perez:2017:HMO,
  author =       "Sandino Vargas-Perez and Fahad Saeed",
  title =        "A Hybrid {MPI--OpenMP} Strategy to Speedup the
                 Compression of Big Next-Generation Sequencing
                 Datasets",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "28",
  number =       "10",
  pages =        "2760--2769",
  month =        oct,
  year =         "2017",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2017.2692782",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Oct 12 06:58:12 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2017/10/07895161-abs.html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Waidyasooriya:2017:OBF,
  author =       "Hasitha Muthumala Waidyasooriya and Yasuhiro Takei and
                 Shunsuke Tatsumi and Masanori Hariyama",
  title =        "{OpenCL}-Based {FPGA}-Platform for Stencil Computation
                 and Its Optimization Methodology",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "28",
  number =       "5",
  pages =        "1390--1402",
  month =        may,
  year =         "2017",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2016.2614981",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Jun 15 05:46:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2017/05/07582502-abs.html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Wang:2017:CEG,
  author =       {Siqi Wang and Guanwen Zhong and Tulika Mitra},
  title =        {{CGPredict}: Embedded {GPU} Performance Estimation
                 from Single-Threaded Applications},
  journal =      j-TECS,
  volume =       {16},
  number =       {5s},
  pages =        {146:1--146:??},
  month =        oct,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3126546},
  ISSN =         {1539-9087 (print), 1558-3465 (electronic)},
  ISSN-L =       {1539-9087},
  bibdate =      {Thu Oct 17 18:16:33 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tecs.bib},
  abstract =     {Heterogeneous multiprocessor system-on-chip
                 architectures are endowed with accelerators such as
                 embedded GPUs and FPGAs capable of general-purpose
                 computation. The application developers for such
                 platforms need to carefully choose the accelerator with
                 the maximum performance benefit. For a given
                 application, usually, the reference code is specified
                 in a high-level single-threaded programming language
                 such as C. The performance of an application kernel on
                 an accelerator is a complex interplay among the exposed
                 parallelism, the compiler, and the accelerator
                 architecture. Thus, determining the performance of a
                 kernel requires its redevelopment into each
                 accelerator-specific language, causing substantial
                 wastage of time and effort. To aid the developer in
                 this early design decision, we present an analytical
                 framework CGPredict to predict the performance of a
                 computational kernel on an embedded GPU architecture
                 from un-optimized, single-threaded C code. The
                 analytical approach provides insights on application
                 characteristics which suggest further
                 application-specific optimizations. The estimation
                 error is as low as 2.66\% (average 9\%) compared to the
                 performance of the same kernel written in native CUDA
                 code running on NVIDIA Kepler embedded GPU. This low
                 performance estimation error enables CGPredict to
                 provide an early design recommendation of the
                 accelerator starting from C code.},
  acknowledgement = ack-nhfb,
  articleno =    {146},
  fjournal =     {ACM Transactions on Embedded Computing Systems},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J840},
}

@Article{Weber:2017:MAL,
  author =       {Nicolas Weber and Michael Goesele},
  title =        {{MATOG}: Array Layout Auto-Tuning for {CUDA}},
  journal =      j-TACO,
  volume =       {14},
  number =       {3},
  pages =        {28:1--28:??},
  month =        sep,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3106341},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Wed Sep 6 17:12:05 MDT 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Optimal code performance is (besides correctness and
                 accuracy) the most important objective in compute
                 intensive applications. In many of these applications,
                 Graphic Processing Units (GPUs) are used because of
                 their high amount of compute power. However, caused by
                 their massively parallel architecture, the code has to
                 be specifically adjusted to the underlying hardware to
                 achieve optimal performance and therefore has to be
                 reoptimized for each new generation. In reality, this
                 is usually not the case as productive code is normally
                 at least several years old and nobody has the time to
                 continuously adjust existing code to new hardware. In
                 recent years more and more approaches have emerged that
                 automatically tune the performance of applications
                 toward the underlying hardware. In this article, we
                 present the MATOG auto-tuner and its concepts. It
                 abstracts the array memory access in CUDA applications
                 and automatically optimizes the code according to the
                 used GPUs. MATOG only requires few profiling runs to
                 analyze even complex applications, while achieving
                 significant speedups over non-optimized code,
                 independent of the used GPU generation and without the
                 need to manually tune the code.},
  acknowledgement = ack-nhfb,
  articleno =    {28},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Wickerson:2017:ACM,
  author =       {John Wickerson and Mark Batty and Tyler Sorensen and
                 George A. Constantinides},
  title =        {Automatically comparing memory consistency models},
  journal =      j-SIGPLAN,
  volume =       {52},
  number =       {1},
  pages =        {190--204},
  month =        jan,
  year =         {2017},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/3093333.3009838},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Sat Sep 16 10:18:14 MDT 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {A memory consistency model (MCM) is the part of a
                 programming language or computer architecture
                 specification that defines which values can legally be
                 read from shared memory locations. Because MCMs take
                 into account various optimisations employed by
                 architectures and compilers, they are often complex and
                 counterintuitive, which makes them challenging to
                 design and to understand. We identify four tasks
                 involved in designing and understanding MCMs:
                 generating conformance tests, distinguishing two MCMs,
                 checking compiler optimisations, and checking compiler
                 mappings. We show that all four tasks are instances of
                 a general constraint-satisfaction problem to which the
                 solution is either a program or a pair of programs.
                 Although this problem is intractable for automatic
                 solvers when phrased over programs directly, we show
                 how to solve analogous constraints over program
                 executions, and then construct programs that satisfy
                 the original constraints. Our technique, which is
                 implemented in the Alloy modelling framework, is
                 illustrated on several software- and architecture-level
                 MCMs, both axiomatically and operationally defined. We
                 automatically recreate several known results, often in
                 a simpler form, including: distinctions between
                 variants of the C11 MCM; a failure of the `SC-DRF
                 guarantee' in an early C11 draft; that x86 is
                 `multi-copy atomic' and Power is not; bugs in common
                 C11 compiler optimisations; and bugs in a compiler
                 mapping from OpenCL to AMD-style GPUs. We also use our
                 technique to develop and validate a new MCM for NVIDIA
                 GPUs that supports a natural mapping from OpenCL.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {POPL '17 conference proceedings.},
}

@Article{Winkler:2017:GSM,
  author =       {Daniel Winkler and Michael Meister and Massoud
                 Rezavand and Wolfgang Rauch},
  title =        {{gpuSPHASE} --- A shared memory caching implementation
                 for {$2$D} {SPH} using {CUDA}},
  journal =      j-COMP-PHYS-COMM,
  volume =       {213},
  number =       {??},
  pages =        {165--180},
  month =        apr,
  year =         {2017},
  CODEN =        {CPHCBZ},
  ISSN =         {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L =       {0010-4655},
  bibdate =      {Sat Feb 4 08:00:23 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0010465516303666},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00104655/},
}

@Article{Yam-Uicab:2017:FHT,
  author =       {R. Yam-Uicab and J. L. Lopez-Martinez and J. A.
                 Trejo-Sanchez and H. Hidalgo-Silva and S.
                 Gonzalez-Segura},
  title =        {A fast {Hough} Transform algorithm for straight lines
                 detection in an image using {GPU} parallel computing
                 with {CUDA-C}},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {73},
  number =       {11},
  pages =        {4823--4842},
  month =        nov,
  year =         {2017},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-017-2051-5},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Sat Jan 6 08:59:18 MST 2018},
  bibsource =    {http://link.springer.com/journal/11227/73/11;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
}

@Article{YarKhan:2017:PPN,
  author =       {Asim YarKhan and Jakub Kurzak and Piotr Luszczek and
                 Jack Dongarra},
  title =        {Porting the {PLASMA} Numerical Library to the {OpenMP}
                 Standard},
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       {45},
  number =       {3},
  pages =        {612--633},
  month =        jun,
  year =         {2017},
  CODEN =        {IJPPE5},
  DOI =          {https://doi.org/10.1007/s10766-016-0441-6},
  ISSN =         {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L =       {0885-7458},
  bibdate =      {Sat Jun 24 11:37:59 MDT 2017},
  bibsource =    {http://link.springer.com/journal/10766/45/3;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Parallel Programming},
  journal-URL =  {http://link.springer.com/journal/10766},
}

@Article{Yeh:2017:PFG,
  author =       "Tsung Tai Yeh and Amit Sabne and Putt Sakdhnagool and
                 Rudolf Eigenmann and Timothy G. Rogers",
  title =        "{Pagoda}: Fine-Grained {GPU} Resource Virtualization
                 for Narrow Tasks",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "221--234",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3018754",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Massively multithreaded GPUs achieve high throughput
                 by running thousands of threads in parallel. To fully
                 utilize the hardware, workloads spawn work to the GPU
                 in bulk by launching large tasks, where each task is a
                 kernel that contains thousands of threads that occupy
                 the entire GPU. GPUs face severe underutilization and
                 their performance benefits vanish if the tasks are
                 narrow, i.e., they contain {$<$} 500 threads.
                 Latency-sensitive applications in network, signal, and
                 image processing that generate a large number of tasks
                 with relatively small inputs are examples of such
                 limited parallelism. This paper presents Pagoda, a
                 runtime system that virtualizes GPU resources, using an
                 OS-like daemon kernel called MasterKernel. Tasks are
                 spawned from the CPU onto Pagoda as they become
                 available, and are scheduled by the MasterKernel at the
                 warp granularity. Experimental results demonstrate that
                 Pagoda achieves a geometric mean speedup of 5.70x over
                 PThreads running on a 20-core CPU, 1.51x over
                 CUDA-HyperQ, and 1.69x over GeMTC, the
                 state-of-the-art runtime GPU task scheduling system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Article{Young-S:2017:OGI,
  author =       {Luis E. Young-S. and Paulsamy Muruganandam and Sadhan
                 K. Adhikari and Vladimir Loncar and Dusan
                 Vudragovi{\'c} and Antun Balaz},
  title =        {{OpenMP} {GNU} and {Intel} {Fortran} programs for
                 solving the time-dependent {Gross--Pitaevskii}
                 equation},
  journal =      j-COMP-PHYS-COMM,
  volume =       {220},
  number =       {??},
  pages =        {503--506},
  month =        nov,
  year =         {2017},
  CODEN =        {CPHCBZ},
  ISSN =         {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L =       {0010-4655},
  bibdate =      {Fri Sep 15 11:56:42 MDT 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0010465517302321},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Zha:2017:IFM,
  author =       {Yue Zha and Jing Li},
  title =        {{IMEC}: A Fully Morphable In-Memory Computing Fabric
                 Enabled by Resistive Crossbar},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {16},
  number =       {2},
  pages =        {123--126},
  month =        jul # {\slash } # dec,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/LCA.2017.2672558},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Thu Jun 20 17:01:23 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {In this paper, we propose a fully morphable In-MEmory
                 Computing (IMEC) fabric to better implement the concept
                 of processing inside memory (PIM). Enabled by emerging
                 nonvolatile memory, i.e., RRAM and its monolithic 3D
                 integration, IMEC can be configured into one or a
                 combination of four distinct functions, (1) logic, (2)
                 ternary content addressable memory, (3) memory, and (4)
                 interconnect. Thus, IMEC exploits a continuum of PIM
                 capabilities across the whole spectrum, ranging from 0
                 percent (pure data storage) to 100 percent (pure
                 compute engine), or intermediate states in between.
                 IMEC can be modularly integrated into the DDRx memory
                 subsystem, communicating with processors by the
                 ordinary DRAM commands. Additionally, to reduce the
                 programming burden, we provide a complete framework to
                 compile applications written in high-level programming
                 language (e.g., OpenCL) onto IMEC. This framework also
                 enables code portability across different platforms for
                 heterogeneous computing. By using this framework,
                 several benchmarks are mapped onto IMEC for evaluating
                 its performance, energy and resource utilization. The
                 simulation results show that, IMEC reduces the energy
                 consumption by 99.6 percent, and achieves 644x speedup,
                 compared to a baseline CPU system. We further compare
                 IMEC with FPGA architecture, and demonstrate that the
                 performance improvement is not simply obtained by
                 replacing SRAM cells with denser RRAM cells.},
  acknowledgement = ack-nhfb,
  affiliation =  {Zha, Y (Reprint Author), Univ Wisconsin, Elect \& Comp
                 Engn Dept, Madison, WI 53706 USA. Zha, Yue; Li, Jing,
                 Univ Wisconsin, Elect \& Comp Engn Dept, Madison, WI
                 53706 USA.},
  author-email = {yzha3@wisc.edu jli587@wisc.edu},
  da =           {2019-06-20},
  doc-delivery-number = {FR2AX},
  eissn =        {1556-6064},
  fjournal =     {IEEE Computer Architecture Letters},
  journal-iso =  {IEEE Comput. Archit. Lett.},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {energy-efficiency computing; Non-volatile memory;
                 processing-in-memory; TCAM},
  keywords-plus = {ARCHITECTURE},
  number-of-cited-references = {20},
  research-areas = {Computer Science},
  times-cited =  {1},
  unique-id =    {Zha:2017:IFM},
  web-of-science-categories = {Computer Science, Hardware \&
                 Architecture},
}

@Article{Zhang:2017:DLN,
  author =       {Jie Zhang and Xiaoyi Lu and Dhabaleswar K. (DK)
                 Panda},
  title =        {Designing Locality and {NUMA} Aware {MPI} Runtime for
                 Nested Virtualization based {HPC} Cloud with {SR--IOV}
                 Enabled {InfiniBand}},
  journal =      j-SIGPLAN,
  volume =       {52},
  number =       {7},
  pages =        {187--200},
  month =        jul,
  year =         {2017},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/3140607.3050765},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Sat Sep 16 10:18:17 MDT 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {Hypervisor-based virtualization solutions reveal good
                 security and isolation, while container-based solutions
                 make applications and workloads more portable and
                 distributed in an effective, standardized and
                 repeatable way. Therefore, nested virtualization based
                 computing environments (e.g., container over virtual
                 machine), which inherit the capabilities from both
                 solutions, are becoming more and more attractive in
                 clouds (e.g., running Docker over Amazon EC2 VMs).
                 Recent studies have shown that running applications in
                 either VMs or containers still has significant
                 overhead, especially for I/O intensive workloads. This
                 motivates us to investigate whether the nested
                 virtualization based solution can be adopted to build
                 high-performance computing (HPC) clouds for running MPI
                 applications efficiently and where the bottlenecks lie.
                 To eliminate performance bottlenecks, we propose a
                 high-performance two-layer locality and NUMA aware MPI
                 library, which is able to dynamically detect
                 co-resident containers inside one VM as well as detect
                 co-resident VM inside one host at MPI runtime. Thus the
                 MPI processes across different containers and VMs can
                 communicate to each other by shared memory or Cross
                 Memory Attach (CMA) channels instead of network channel
                 if they are co-resident. We further propose an enhanced
                 NUMA aware hybrid design to utilize InfiniBand loopback
                 based channel to optimize large message transfer across
                 containers when they are running on different sockets.
                 Performance evaluations show that compared with the
                 performance of the state-of-art (1Layer) design, our
                 proposed enhance-hybrid design can bring up to 184\%,
                 81\% and 12\% benefit on point-to-point, collective
                 operations, and end applications. Compared with the
                 default performance, our enhanced-hybrid design
                 delivers up to 184\%, 85\% and 16\% performance
                 improvement.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {VEE '17 conference proceedings.},
}

@Article{Zhu:2017:OAP,
  author =       "Huming Zhu and Yanfei Wu and Pei Li and Peng Zhang and
                 Zhe Ji and Maoguo Gong",
  title =        "An {OpenCL}-accelerated parallel immunodominance clone
                 selection algorithm for feature selection",
  journal =      j-CCPE,
  volume =       "29",
  number =       "9",
  pages =        "??--??",
  day =          "10",
  month =        may,
  year =         "2017",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3838",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Jul 24 08:22:36 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Zouaoui:2017:CNG,
  author =       {Chakib Mustapha Anouar Zouaoui and Nasreddine Taleb},
  title =        {{CL\_ARRAY}: a new generic library of multidimensional
                 containers for {C++} compilers with extension for
                 {OpenCL} framework},
  journal =      j-COMP-LANGS-SYS-STRUCT,
  volume =       {50},
  number =       {??},
  pages =        {53--81},
  month =        dec,
  year =         {2017},
  CODEN =        {????},
  ISSN =         {1477-8424 (print), 1873-6866 (electronic)},
  ISSN-L =       {1477-8424},
  bibdate =      {Fri Sep 15 11:36:13 MDT 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/complngs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S147784241630135X},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Languages, Systems and Structures},
  journal-URL =  {http://www.sciencedirect.com/science/journal/14778424/},
}

@Article{AlKadi:2018:GPC,
  author =       {Muhammed {Al Kadi} and Benedikt Janssen and Jones Yudi
                 and Michael Huebner},
  title =        {General-Purpose Computing with Soft {GPUs} on
                 {FPGAs}},
  journal =      j-TRETS,
  volume =       {11},
  number =       {1},
  pages =        {5:1--5:??},
  month =        mar,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3173548},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:42:59 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/python.bib;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Using field-programmable gate arrays (FPGAs) as a
                 substrate to deploy soft graphics processing units
                 (GPUs) would enable offering the FPGA compute power in
                 a very flexible GPU-like tool flow.
                 Application-specific adaptations like selective
                 hardening of floating-point operations and instruction
                 set subsetting would mitigate the high area and power
                 demands of soft GPUs. This work explores the
                 capabilities and limitations of soft General Purpose
                 Computing on GPUs (GPGPU) for both fixed- and floating
                 point arithmetic. For this purpose, we have developed
                 FGPU: a configurable, scalable, and portable GPU
                 architecture designed especially for FPGAs. FGPU is
                 open-source and implemented entirely in RTL. It can be
                 programmed in OpenCL and controlled through a Python
                 API. This article introduces its hardware architecture
                 as well as its tool flow. We evaluated the proposed
                 GPGPU approach against multiple other solutions. In
                 comparison to homogeneous Multi-Processor
                 System-On-Chips (MPSoCs), we found that using a soft
                 GPU is a Pareto-optimal solution regarding throughput
                 per area and energy consumption. On average, FGPU has a
                 2.9$ \times $ better compute density and 11.2$ \times $
                 less energy consumption than a single MicroBlaze
                 processor when computing in IEEE-754 floating-point
                 format. An average speedup of about 4$ \times $ over
                 the ARM Cortex-A9 supported with the NEON vector
                 co-processor has been measured for fixed- or
                 floating-point benchmarks. In addition, the biggest
                 FGPU cores we could implement on a Xilinx Zynq-7000
                 System-On-Chip (SoC) can deliver similar performance to
                 equivalent implementations with High-Level Synthesis
                 (HLS).},
  acknowledgement = ack-nhfb,
  articleno =    {5},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Amer:2018:LCM,
  author =       {Abdelhalim Amer and Huiwei Lu and Pavan Balaji and
                 Milind Chabbi and Yanjie Wei and Jeff Hammond and
                 Satoshi Matsuoka},
  title =        {Lock Contention Management in Multithreaded {MPI}},
  journal =      j-TOPC,
  volume =       {5},
  number =       {3},
  pages =        {12:1--12:??},
  month =        jan,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3275443},
  ISSN =         {2329-4949 (print), 2329-4957 (electronic)},
  ISSN-L =       {2329-4949},
  bibdate =      {Wed Jan 23 16:12:26 MST 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3275443},
  abstract =     {In this article, we investigate contention management
                 in lock-based thread-safe MPI libraries. Specifically,
                 we make two assumptions: (1) locks are the only form of
                 synchronization when protecting communication paths;
                 and (2) contention occurs, and thus serialization is
                 unavoidable. Our work distinguishes between lock
                 acquisitions with respect to work being performed
                 inside a critical section; productive vs. unproductive.
                 Waiting for message reception without doing anything
                 else inside a critical section is an example of
                 unproductive lock acquisition. We show that the
                 high-throughput nature of modern scalable locking
                 protocols translates into better communication progress
                 for throughput-intensive MPI communication but
                 negatively impacts latency-sensitive communication
                 because of overzealous unproductive lock acquisition.
                 To reduce unproductive lock acquisitions, we devised a
                 method that promotes threads with productive work using
                 a generic two-level priority locking protocol. Our
                 results show that using a high-throughput protocol for
                 productive work and a fair protocol for less productive
                 code paths ensures the best tradeoff for fine-grained
                 communication, whereas a fair protocol is sufficient
                 for more coarse-grained communication. Although these
                 efforts have been rewarding, scalability degradation
                 remains significant. We discuss techniques that diverge
                 from the pure locking model and offer the potential to
                 further improve scalability.},
  acknowledgement = ack-nhfb,
  articleno =    {12},
  fjournal =     {ACM Transactions on Parallel Computing},
  journal-URL =  {http://dl.acm.org/citation.cfm?id=2632163},
}

@Article{Arif:2018:RBP,
  author =       {Mahwish Arif and Hans Vandierendonck},
  title =        {Reducing the burden of parallel loop schedulers for
                 many-core processors},
  journal =      j-SIGPLAN,
  volume =       {53},
  number =       {1},
  pages =        {383--384},
  month =        jan,
  year =         {2018},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/3200691.3178517},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Wed Oct 16 14:12:56 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {This work proposes a low-overhead half-barrier pattern
                 to schedule fine-grain parallel loops and considers its
                 integration in the Intel OpenMP and Cilkplus
                 schedulers. Experimental evaluation demonstrates that
                 the scheduling overhead of our techniques is 43\% lower
                 than Intel OpenMP and 12.1x lower than Cilk. We observe
                 22\% speedup on 48 threads, with a peak of 2.8x
                 speedup.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {PPoPP '18 proceedings.},
}

@Article{Aydin:2018:RTP,
  author =       "Semra Aydin and Refik Samet and Omer Faruk Bay",
  title =        "Real-time parallel image processing applications on
                 multicore {CPUs} with {OpenMP} and {GPGPU} with
                 {CUDA}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "74",
  number =       "6",
  pages =        "2255--2275",
  month =        jun,
  year =         "2018",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-017-2168-6",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Thu Oct 10 15:31:12 MDT 2019",
  bibsource =    "http://link.springer.com/journal/11227/74/6;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Azimi:2018:SVS,
  author =       "Reza Azimi and Tyler Fox and Wendy Gonzalez and
                 Sherief Reda",
  title =        "Scale-Out vs Scale-Up: A Study of {ARM}-based {SoCs}
                 on Server-Class Workloads",
  journal =      j-TOMPECS,
  volume =       "3",
  number =       "4",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3232162",
  ISSN =         "2376-3639",
  bibdate =      "Sat Sep 21 07:21:16 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tompecs.bib",
  URL =          "https://dl.acm.org/citation.cfm?id=3232162",
  abstract =     "ARM 64-bit processing has generated enthusiasm to
                 develop ARM-based servers that are targeted for both
                 data centers and supercomputers. In addition to the
                 server-class components and hardware advancements, the
                 ARM software environment has grown substantially over
                 the past decade. Major development ecosystems and
                 libraries have been ported and optimized to run on ARM,
                 making ARM suitable for server-class workloads. There
                 are two trends in available ARM SoCs: mobile-class ARM
                 SoCs that rely on the heterogeneous integration of a
                 mix of CPU cores, GPGPU streaming multiprocessors
                 (SMs), and other accelerators, and the server-class
                 SoCs that instead rely on integrating a larger number
                 of CPU cores with no GPGPU support and a number of IO
                 accelerators. For scaling the number of processing
                 cores, there are two different paradigms: mobile-class
                 SoCs that use scale-out architecture in the form of a
                 cluster of simpler systems connected over a network,
                 and server-class ARM SoCs that use the scale-up
                 solution and leverage symmetric multiprocessing to pack
                 a large number of cores on the chip. In this article,
                 we present ScaleSoC cluster, which is a scale-out
                 solution based on mobile class ARM SoCs. ScaleSoC
                 leverages fast network connectivity and GPGPU
                 acceleration to improve performance and energy
                 efficiency compared to previous ARM scale-out clusters.
                 We consider a wide range of modern server-class
                 parallel workloads to study both scaling paradigms,
                 including latency-sensitive transactional workloads,
                 MPI-based CPU and GPGPU-accelerated scientific
                 applications, and emerging artificial intelligence
                 workloads. We study the performance and energy
                 efficiency of ScaleSoC compared to server-class ARM
                 SoCs and discrete GPGPUs in depth. We quantify the
                 network overhead on the performance of ScaleSoC and
                 show that packing a large number of ARM cores on a
                 single chip does not necessarily guarantee better
                 performance, due to the fact that shared resources,
                 such as last-level cache, become performance
                 bottlenecks. We characterize the GPGPU accelerated
                 workloads and demonstrate that for applications that
                 can leverage the better CPU-GPGPU balance of the
                 ScaleSoC cluster, performance and energy efficiency
                 improve compared to discrete GPGPUs.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Modeling and Performance
                 Evaluation of Computing Systems (TOMPECS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J1525",
}

@Article{Bazow:2018:MPS,
  author =       "Dennis Bazow and Ulrich Heinz and Michael Strickland",
  title =        "Massively parallel simulations of relativistic fluid
                 dynamics on graphics processing units with {CUDA}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "225",
  number =       "??",
  pages =        "92--113",
  month =        apr,
  year =         "2018",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2017.01.015",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Wed Feb 28 14:39:27 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465517300279",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Belviranli:2018:JDA,
  author =       "Mehmet E. Belviranli and Seyong Lee and Jeffrey S.
                 Vetter and Laxmi N. Bhuyan",
  title =        "{Juggler}: a dependence-aware task-based execution
                 framework for {GPUs}",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "1",
  pages =        "54--67",
  month =        jan,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3200691.3178492",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:56 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Scientific applications with single instruction,
                 multiple data (SIMD) computations show considerable
                 performance improvements when run on today's graphics
                 processing units (GPUs). However, the existence of data
                 dependences across thread blocks may significantly
                 impact the speedup by requiring global synchronization
                 across multiprocessors (SMs) inside the GPU. To
                 efficiently run applications with interblock data
                 dependences, we need fine-granular task-based execution
                 models that will treat SMs inside a GPU as stand-alone
                 parallel processing units. Such a scheme will enable
                 faster execution by utilizing all internal computation
                 elements inside the GPU and eliminating unnecessary
                 waits during device-wide global barriers. In this
                 paper, we propose Juggler, a task-based execution
                 scheme for GPU workloads with data dependences. The
                 Juggler framework takes applications embedding OpenMP
                 4.5 tasks as input and executes them on the GPU via an
                 efficient in-device runtime, hence eliminating the need
                 for kernel-wide global synchronization. Juggler
                 requires no or little modification to the source code,
                 and once launched, the runtime entirely runs on the GPU
                 without relying on the host through the entire
                 execution. We have evaluated Juggler on an NVIDIA Tesla
                 P100 GPU and obtained up to 31\% performance
                 improvement against global barrier based
                 implementation, with minimal runtime overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '18 proceedings.",
}

@Article{Benedict:2018:SES,
  author =       "Shajulin Benedict",
  title =        "{SCALE-EA}: A Scalability Aware Performance Tuning
                 Framework for {OpenMP} Applications",
  journal =      j-SCPE,
  volume =       "19",
  number =       "1",
  pages =        "15--30",
  month =        "????",
  year =         "2018",
  CODEN =        "????",
  ISSN =         "1895-1767",
  ISSN-L =       "1895-1767",
  bibdate =      "Mon Jan 7 06:46:50 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/scpe.bib",
  URL =          "https://www.scpe.org/index.php/scpe/article/view/1390",
  acknowledgement = ack-nhfb,
  fjournal =     "Scalable Computing: Practice and Experience",
  journal-URL =  "http://www.scpe.org/",
}

@Article{Burtscher:2018:HQF,
  author =       "Martin Burtscher and Sindhu Devale and Sahar Azimi and
                 Jayadharini Jaiganesh and Evan Powers",
  title =        "A High-Quality and Fast Maximal Independent Set
                 Implementation for {GPUs}",
  journal =      j-TOPC,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291525",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Computing a maximal independent set is an important
                 step in many parallel graph algorithms. This article
                 introduces ECL-MIS, a maximal independent set
                 implementation that works well on GPUs. It includes key
                 optimizations to speed up computation, reduce the
                 memory footprint, and increase the set size. Its CUDA
                 implementation requires fewer than 30 kernel
                 statements, runs asynchronously, and produces a
                 deterministic result. It outperforms the maximal
                 independent set implementations of Pannotia, CUSP, and
                 IrGL on each of the 16 tested graphs of various types
                 and sizes. On a Titan X GPU, ECL-MIS is between 3.9 and
                 100 times faster (11.5 times, on average). ECL-MIS
                 running on the GPU is also faster than the parallel CPU
                 codes Ligra, Ligra+, and PBBS running on 20 Xeon cores,
                 which it outperforms by 4.1 times, on average. At the
                 same time, ECL-MIS produces maximal independent sets
                 that are up to 52\% larger (over 10\%, on average)
                 compared to these preexisting CPU and GPU
                 implementations. Whereas these codes produce maximal
                 independent sets that are, on average, about 15\%
                 smaller than the largest possible such sets, ECL-MIS
                 sets are less than 6\% smaller than the maximum
                 independent sets.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bylina:2018:EEO,
  author =       "Beata Bylina and Jaroslaw Bylina",
  title =        "An Experimental Evaluation of the {OpenMP} Thread
                 Mapping for {LU} Factorisation on {Xeon Phi}
                 Coprocessor and on Hybrid {CPU-MIC} Platform",
  journal =      j-SCPE,
  volume =       "19",
  number =       "3",
  pages =        "259--274",
  month =        "????",
  year =         "2018",
  CODEN =        "????",
  ISSN =         "1895-1767",
  ISSN-L =       "1895-1767",
  bibdate =      "Mon Jan 7 06:46:50 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/scpe.bib",
  URL =          "https://www.scpe.org/index.php/scpe/article/view/1373",
  acknowledgement = ack-nhfb,
  fjournal =     "Scalable Computing: Practice and Experience",
  journal-URL =  "http://www.scpe.org/",
}

@Article{Castello:2018:EIR,
  author =       "Adri{\'a}n Castell{\'o} and Antonio J. Pe{\~n}a and
                 Rafael Mayo and Judit Planas and Enrique S.
                 Quintana-Ort{\'{\i}} and Pavan Balaji",
  title =        "Exploring the interoperability of remote {GPGPU}
                 virtualization using {rCUDA} and directive-based
                 programming models",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "74",
  number =       "11",
  pages =        "5628--5642",
  month =        nov,
  year =         "2018",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-016-1791-y",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Thu Oct 10 15:31:09 MDT 2019",
  bibsource =    "http://link.springer.com/journal/11227/74/11;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Chen:2018:FOB,
  author =       "Cen Chen and Kenli Li and Aijia Ouyang and Keqin Li",
  title =        "{FlinkCL}: An {OpenCL}-Based In-Memory Computing
                 Architecture on Heterogeneous {CPU--GPU} Clusters for
                 Big Data",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "67",
  number =       "12",
  pages =        "1765--1779",
  month =        dec,
  year =         "2018",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2018.2839719",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Nov 8 07:18:03 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://ieeexplore.ieee.org/document/8362980/",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Clay:2018:GAP,
  author =       "M. P. Clay and D. Buaria and P. K. Yeung and T.
                 Gotoh",
  title =        "{GPU} acceleration of a petascale application for
                 turbulent mixing at high {Schmidt} number using {OpenMP
                 4.5}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "228",
  number =       "??",
  pages =        "100--114",
  month =        jul,
  year =         "2018",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2018.02.020",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Thu May 31 14:21:46 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465518300596",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Cowles:2018:ISB,
  author =       "Mary Kathryn Cowles and Stephen Bonett and Michael
                 Seedorff",
  title =        "Independent sampling for {Bayesian} normal conditional
                 autoregressive models with {OpenCL} acceleration",
  journal =      j-COMP-STAT,
  volume =       "33",
  number =       "1",
  pages =        "159--177",
  month =        mar,
  year =         "2018",
  CODEN =        "CSTAEB",
  DOI =          "https://doi.org/10.1007/s00180-017-0752-0",
  ISSN =         "0943-4062 (print), 1613-9658 (electronic)",
  ISSN-L =       "0943-4062",
  bibdate =      "Thu Jun 18 16:19:50 MDT 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compstat.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer.com/article/10.1007/s00180-017-0752-0",
  acknowledgement = ack-nhfb,
  ajournal =     "Comp. Stat.",
  fjournal =     "Computational Statistics",
  journal-URL =  "http://link.springer.com/journal/180",
}

@Article{Davina:2018:MCP,
  author =       "A. {Lamas Davi{\~n}a} and J. E. Roman",
  title =        "{MPI-CUDA} parallel linear solvers for
                 block-tridiagonal matrices in the context of {SLEPc}'s
                 eigensolvers",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "74",
  number =       "??",
  pages =        "118--135",
  month =        "????",
  year =         "2018",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2017.11.006",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Tue Apr 3 13:55:32 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819117301874",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Dieguez:2018:SLP,
  author =       "Adri{\'a}n P{\'e}rez Di{\'e}guez and Margarita Amor
                 and Jacobo Lobeiras and Ram{\'o}n Doallo",
  title =        "Solving Large Problem Sizes of Index-Digit Algorithms
                 on {GPU}: {FFT} and Tridiagonal System Solvers",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "67",
  number =       "1",
  pages =        "86--101",
  month =        jan,
  year =         "2018",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2017.2723879",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Dec 14 07:11:27 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://ieeexplore.ieee.org/document/7970194/",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
  keywords =     "Computer architecture; CUDA; CUSPARSE; FFT; GPU;
                 Graphics processing units; Instruction sets; Kernel;
                 medium problem sizes; Proposals; Signal processing
                 algorithms; Synchronization; tridiagonal systems;
                 tuning",
}

@Article{Eddelbuettel:2018:BRN,
  author =       "Dirk Eddelbuettel",
  title =        "Book Review: {Norman Matloff. \booktitle{Parallel
                 Computing for Data Science: With Examples in R, C++,
                 and CUDA}. Boca Raton: CRC Press}",
  journal =      j-BIOMETRICS,
  volume =       "74",
  number =       "2",
  pages =        "770--770",
  month =        jun,
  year =         "2018",
  CODEN =        "BIOMB6",
  DOI =          "https://doi.org/10.1111/biom.12896",
  ISSN =         "0006-341X (print), 1541-0420 (electronic)",
  ISSN-L =       "0006-341X",
  bibdate =      "Thu Jun 25 10:48:44 MDT 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/biometrics2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/s-plus.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Biometrics",
  fjournal =     "Biometrics",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1541-0420",
  onlinedate =   "26 June 2018",
}

@Article{Faraji:2018:DCG,
  author =       "Iman Faraji and Ahmad Afsahi",
  title =        "Design considerations for {GPU}-aware collective
                 communications in {MPI}",
  journal =      j-CCPE,
  volume =       "30",
  number =       "17",
  pages =        "e4667:1--e4667:??",
  day =          "10",
  month =        sep,
  year =         "2018",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4667",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Thu Mar 28 08:07:51 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "18 May 2018",
}

@Article{Ferreira:2018:CMM,
  author =       "Kurt B. Ferreira and Scott Levy and Kevin Pedretti and
                 Ryan E. Grant",
  title =        "Characterizing {MPI} matching via trace-based
                 simulation",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "77",
  number =       "??",
  pages =        "57--83",
  month =        sep,
  year =         "2018",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2018.05.005",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Jan 7 15:25:20 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819118301467",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Gallardo:2018:EMM,
  author =       "Esthela Gallardo and J{\'e}r{\^o}me Vienne and
                 Leonardo Fialho and Patricia Teller and James Browne",
  title =        "Employing {MPI\_T} in {MPI} Advisor to optimize
                 application performance",
  journal =      j-IJHPCA,
  volume =       "32",
  number =       "6",
  pages =        "882--896",
  day =          "1",
  month =        nov,
  year =         "2018",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342016684005",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Wed Oct 9 14:35:52 MDT 2019",
  bibsource =    "http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://journals.sagepub.com/doi/full/10.1177/1094342016684005",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "https://journals.sagepub.com/home/hpc",
}

@Article{Gerbessiotis:2018:SIS,
  author =       "Alexandros V. Gerbessiotis",
  title =        "A Study of Integer Sorting on Multicores",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "28",
  number =       "04",
  pages =        "??--??",
  month =        dec,
  year =         "2018",
  DOI =          "https://doi.org/10.1142/S0129626418500147",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Mon Mar 29 12:30:05 MDT 2021",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.worldscientific.com/doi/10.1142/S0129626418500147",
  abstract =     "Integer sorting on multicores and GPUs can be realized
                 by a variety of approaches that include variants of
                 distribution-based methods such as radix-sort,
                 comparison-oriented algorithms such as deterministic
                 regular sampling and random sampling parallel sorting,
                 and network-based algorithms such as Batcher's bitonic
                 sorting algorithm. In this work we present an
                 experimental study of integer sorting on multicore
                 processors. We have implemented serial and parallel
                 radix-sort for various radixes, deterministic regular
                 oversampling, and random oversampling parallel sorting,
                 including new variants of ours, and also some
                 previously little explored or unexplored variants of
                 bitonic-sort and odd-even transposition sort. The study
                 uses multithreading and multiprocessing parallel
                 programming libraries with the same C language code
                 working under Open MPI, MulticoreBSP, and BSPlib. We
                 first provide some general high-level observations on
                 the performance of these implementations. If we can
                 conclude anything is that accurate prediction of
                 performance by taking into consideration architecture
                 dependent features such as the structure and
                 characteristics of multiple memory hierarchies is
                 difficult and more often than not untenable. To some
                 degree this is affected by the overhead imposed by the
                 high-level library used in the programming effort.
                 Another objective is to model the performance of these
                 algorithms and their implementations under the MBSP
                 (Multi-memory BSP) model. Despite the limitations
                 mentioned above, we can still draw some reliable
                 conclusions and reason about the performance of these
                 implementations using the MBSP model, thus making MBSP
                 useful and usable.",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Gerstenberger:2018:EHS,
  author =       "Robert Gerstenberger and Maciej Besta and Torsten
                 Hoefler",
  title =        "Enabling highly scalable remote memory access
                 programming with {MPI-3} one sided",
  journal =      j-CACM,
  volume =       "61",
  number =       "10",
  pages =        "106--113",
  month =        oct,
  year =         "2018",
  CODEN =        "CACMA2",
  DOI =          "https://doi.org/10.1145/3264413",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Thu Sep 27 11:55:45 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/cacm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://cacm.acm.org/magazines/2018/10/231375/fulltext",
  abstract =     "Modern high-performance networks offer remote direct
                 memory access (RDMA) that exposes a process' virtual
                 address space to other processes in the network. The
                 Message Passing Interface (MPI) specification has
                 recently been extended with a programming interface
                 called MPI-3 Remote Memory Access (MPI-3 RMA) for
                 efficiently exploiting state-of-the-art RDMA features.
                 MPI-3 RMA enables a powerful programming model that
                 alleviates many message passing downsides. In this
                 work, we design and develop bufferless protocols that
                 demonstrate how to implement this interface and support
                 scaling to millions of cores with negligible memory
                 consumption while providing highest performance and
                 minimal overheads. To arm programmers, we provide a
                 spectrum of performance models for RMA functions that
                 enable rigorous mathematical analysis of application
                 performance and facilitate the development of codes
                 that solve given tasks within specified time and energy
                 budgets. We validate the usability of our library and
                 models with several application studies with up to half
                 a million processes. In a wider sense, our work
                 illustrates how to use RMA principles to accelerate
                 computation- and data-intensive codes.",
  acknowledgement = ack-nhfb,
  fjournal =     "Communications of the ACM",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J79",
}

@Article{Gianinazzi:2018:CAP,
  author =       "Lukas Gianinazzi and Pavel Kalvoda and Alessandro {De
                 Palma} and Maciej Besta and Torsten Hoefler",
  title =        "Communication-avoiding parallel minimum cuts and
                 connected components",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "1",
  pages =        "219--232",
  month =        jan,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3200691.3178504",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:56 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We present novel scalable parallel algorithms for
                 finding global minimum cuts and connected components,
                 which are important and fundamental problems in graph
                 processing. To take advantage of future massively
                 parallel architectures, our algorithms are
                 communication-avoiding: they reduce the costs of
                 communication across the network and the cache
                 hierarchy. The fundamental technique underlying our
                 work is the randomized sparsification of a graph:
                 removing a fraction of graph edges, deriving a solution
                 for such a sparsified graph, and using the result to
                 obtain a solution for the original input. We design and
                 implement sparsification with $ O(1) $ synchronization
                 steps. Our global minimum cut algorithm decreases
                 communication costs and computation compared to the
                 state-of-the-art, while our connected components
                 algorithm incurs few cache misses and synchronization
                 steps. We validate our approach by evaluating MPI
                 implementations of the algorithms on a petascale
                 supercomputer. We also provide an approximate variant
                 of the minimum cut algorithm and show that it
                 approximates the exact solutions well while using a
                 fraction of cores in a fraction of time.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '18 proceedings.",
}

@article{Goglin:2018:HTM,
  author          = {Brice Goglin and Emmanuel Jeannot and Farouk Mansouri and
                     Guillaume Mercier},
  title           = {Hardware topology management in {MPI} applications through
                     hierarchical communicators},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {76},
  number          = {??},
  pages           = {70--90},
  month           = aug,
  year            = {2018},
  coden           = {PACOEJ},
  doi             = {https://doi.org/10.1016/j.parco.2018.05.006},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  bibdate         = {Mon Jun 4 07:40:18 MDT 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167819118301480},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Gomez-Folgar:2018:MPA,
  author          = {F. Gomez-Folgar and G. Indalecio and N. Seoane and
                     T. F. Pena and A. J. Garcia-Loureiro},
  title           = {{MPI-Performance-Aware-Reallocation}: method to optimize
                     the mapping of processes applied to a cloud
                     infrastructure},
  journal         = j-COMPUTING,
  volume          = {100},
  number          = {2},
  pages           = {211--226},
  month           = feb,
  year            = {2018},
  coden           = {CMPTA2},
  doi             = {https://doi.org/10.1007/s00607-017-0573-6},
  issn            = {0010-485X (print), 1436-5057 (electronic)},
  issn-l          = {0010-485X},
  bibdate         = {Wed Nov 7 08:19:16 MST 2018},
  bibsource       = {http://link.springer.com/journal/607/100/2;
                     http://www.math.utah.edu/pub/tex/bib/computing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Computing},
  journal-url     = {http://link.springer.com/journal/607},
}

@article{Gonzalez-Dominguez:2018:MPC,
  author          = {Jorge Gonzalez-Dominguez and Maria J. Martin},
  title           = {{MPIGeneNet}: Parallel Calculation of Gene Co-Expression
                     Networks on Multicore Clusters},
  journal         = j-TCBB,
  volume          = {15},
  number          = {5},
  pages           = {1732--1737},
  month           = sep,
  year            = {2018},
  coden           = {ITCBCY},
  doi             = {https://doi.org/10.1109/TCBB.2017.2761340},
  issn            = {1545-5963 (print), 1557-9964 (electronic)},
  issn-l          = {1545-5963},
  bibdate         = {Thu Nov 8 06:18:46 MST 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/tcbb.bib},
  abstract        = {In this work, we present MPIGeneNet, a parallel tool
                     that applies Pearson's correlation and Random Matrix
                     Theory to construct gene co-expression networks. It is
                     based on the state-of-the-art sequential tool
                     RMTGeneNet, which provides networks with high
                     robustness and sensitivity at the expenses of
                     relatively long runtimes for large scale input
                     datasets. MPIGeneNet returns the same results as
                     RMTGeneNet but improves the memory management, reduces
                     the I/O cost, and accelerates the two most
                     computationally demanding steps of co-expression
                     network construction by exploiting the compute
                     capabilities of common multicore CPU clusters. Our
                     performance evaluation on two different systems using
                     three typical input datasets shows that MPIGeneNet is
                     significantly faster than RMTGeneNet. As an example,
                     our tool is up to 175.41 times faster on a cluster with
                     eight nodes, each one containing two 12-core Intel
                     Haswell processors. The source code of MPIGeneNet, as
                     well as a reference manual, are available at
                     https://sourceforge.net/projects/mpigenenet/.},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE/ACM Transactions on Computational Biology and
                     Bioinformatics},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J954},
}

@article{Gupta:2018:ALQ,
  author          = {Sourendu Gupta and Pushan Majumdar},
  title           = {Accelerating lattice {QCD} simulations with 2 flavors of
                     staggered fermions on multiple {GPUs} using {OpenACC}
                     --- a first attempt},
  journal         = j-COMP-PHYS-COMM,
  volume          = {228},
  number          = {??},
  pages           = {44--53},
  month           = jul,
  year            = {2018},
  coden           = {CPHCBZ},
  doi             = {https://doi.org/10.1016/j.cpc.2018.03.008},
  issn            = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l          = {0010-4655},
  bibdate         = {Thu May 31 14:21:46 MDT 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0010465518300808},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-url     = {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Halver:2018:FPM,
  author          = {Rene Halver and Wilhelm Homberg and Godehard Sutmann},
  title           = {Function portability of molecular dynamics on heterogeneous
                     parallel architectures with {OpenCL}},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {74},
  number          = {4},
  pages           = {1522--1533},
  month           = apr,
  year            = {2018},
  coden           = {JOSUED},
  doi             = {https://doi.org/10.1007/s11227-017-2232-2},
  issn            = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l          = {0920-8542},
  bibdate         = {Thu Oct 10 15:31:11 MDT 2019},
  bibsource       = {http://link.springer.com/journal/11227/74/4;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-url     = {http://link.springer.com/journal/11227},
}

@Article{Huang:2018:ACO,
  author =       "Kai Huang and Biao Hu and Long Chen and Alois Knoll
                 and Zhihua Wang",
  title =        "{ADAS} on {COTS} with {OpenCL}: A Case Study with Lane
                 Detection",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "67",
  number =       "4",
  pages =        "559--565",
  month =        apr,
  year =         "2018",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2017.2759203",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Mar 15 08:52:31 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://ieeexplore.ieee.org/document/8057795/",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@article{Imbernon:2018:ELS,
  author          = {Baldomero Imbern{\'o}n and Javier Prades and
                     Domingo Gim{\'e}nez and Jos{\'e} M. Cecilia and
                     Federico Silla},
  title           = {Enhancing large-scale docking simulation on heterogeneous
                     systems: An {MPI} vs {rCUDA} study},
  journal         = j-FUT-GEN-COMP-SYS,
  volume          = {79 (part 1)},
  number          = {??},
  pages           = {26--37},
  year            = {2018},
  coden           = {FGSEVI},
  doi             = {https://doi.org/10.1016/j.future.2017.08.050},
  issn            = {0167-739X (print), 1872-7115 (electronic)},
  issn-l          = {0167-739X},
  bibdate         = {Fri Nov 24 15:16:17 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {https://www.sciencedirect.com/science/article/pii/S0167739X17309974},
  acknowledgement = ack-nhfb,
  fjournal        = {Future Generation Computer Systems},
  journal-url     = {http://www.sciencedirect.com/science/journal/0167739X},
  keywords        = {Heterogeneous computing; HPC; Metaheuristics; rCUDA;
                     Virtual screening},
}

@article{Jambunathan:2018:COB,
  author          = {Revathi Jambunathan and Deborah A. Levin},
  title           = {{CHAOS}: an octree-based {PIC--DSMC} code for modeling of
                     electron kinetic properties in a plasma plume using
                     {MPI--CUDA} parallelization},
  journal         = j-J-COMPUT-PHYS,
  volume          = {373},
  number          = {??},
  pages           = {571--604},
  day             = {15},
  month           = nov,
  year            = {2018},
  coden           = {JCTPAH},
  doi             = {https://doi.org/10.1016/j.jcp.2018.07.005},
  issn            = {0021-9991 (print), 1090-2716 (electronic)},
  issn-l          = {0021-9991},
  bibdate         = {Thu Sep 20 17:02:49 MDT 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jcomputphys2015.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0021999118304601},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Computational Physics},
  journal-url     = {http://www.sciencedirect.com/science/journal/00219991},
}

@article{Kamburugamuve:2018:AML,
  author          = {Supun Kamburugamuve and Pulasthi Wickramasinghe and
                     Saliya Ekanayake and Geoffrey C. Fox},
  title           = {Anatomy of machine learning algorithm implementations in
                     {MPI}, {Spark}, and {Flink}},
  journal         = j-IJHPCA,
  volume          = {32},
  number          = {1},
  pages           = {61--73},
  month           = jan,
  year            = {2018},
  coden           = {IHPCFL},
  issn            = {1094-3420 (print), 1741-2846 (electronic)},
  issn-l          = {1094-3420},
  bibdate         = {Sat Jan 6 10:32:00 MST 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of High Performance Computing
                     Applications},
  journal-url     = {http://hpc.sagepub.com/content/by/year},
}

@Article{Kang:2018:PRS,
  author =       "Zhijiang Kang and Ze Deng and Wei Han and Dongmei
                 Zhang",
  title =        "Parallel Reservoir Simulation with {OpenACC} and
                 Domain Decomposition",
  journal =      j-ALGORITHMS-BASEL,
  volume =       "11",
  number =       "12",
  month =        dec,
  year =         "2018",
  CODEN =        "ALGOCH",
  DOI =          "https://doi.org/10.3390/a11120213",
  ISSN =         "1999-4893 (electronic)",
  ISSN-L =       "1999-4893",
  bibdate =      "Fri May 3 14:18:56 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/algorithms.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.mdpi.com/1999-4893/11/12/213",
  acknowledgement = ack-nhfb,
  articleno =    "213",
  fjournal =     "Algorithms (Basel)",
  journal-URL =  "https://www.mdpi.com/journal/algorithms",
  pagecount =    "??",
  pubdates =     "Received: 16 November 2018 / Revised: 5 December 2018
                 / Accepted: 14 December 2018 / Published: 18 December
                 2018",
}

@article{Kono:2018:EOW,
  author          = {Fumiya Kono and Naohito Nakasato and Kensaku Hayashi and
                     Alexander Vazhenin and Stanislav Sedukhin},
  title           = {Evaluations of {OpenCL-written} tsunami simulation on
                     {FPGA} and comparison with {GPU} implementation},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {74},
  number          = {6},
  pages           = {2747--2775},
  month           = jun,
  year            = {2018},
  coden           = {JOSUED},
  doi             = {https://doi.org/10.1007/s11227-018-2315-8},
  issn            = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l          = {0920-8542},
  bibdate         = {Thu Oct 10 15:31:12 MDT 2019},
  bibsource       = {http://link.springer.com/journal/11227/74/6;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-url     = {http://link.springer.com/journal/11227},
}

@article{Kotsifakou:2018:HHP,
  author          = {Maria Kotsifakou and Prakalp Srivastava and
                     Matthew D. Sinclair and Rakesh Komuravelli and
                     Vikram Adve and Sarita Adve},
  title           = {{HPVM}: heterogeneous parallel virtual machine},
  journal         = j-SIGPLAN,
  volume          = {53},
  number          = {1},
  pages           = {68--80},
  month           = jan,
  year            = {2018},
  coden           = {SINODQ},
  doi             = {https://doi.org/10.1145/3200691.3178493},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Wed Oct 16 14:12:56 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  abstract        = {We propose a parallel program representation for
                     heterogeneous systems, designed to enable performance
                     portability across a wide range of popular parallel
                     hardware, including GPUs, vector instruction sets,
                     multicore CPUs and potentially FPGAs. Our
                     representation, which we call HPVM, is a hierarchical
                     dataflow graph with shared memory and vector
                     instructions. HPVM supports three important
                     capabilities for programming heterogeneous systems: a
                     compiler intermediate representation (IR), a virtual
                     instruction set (ISA), and a basis for runtime
                     scheduling; previous systems focus on only one of these
                     capabilities. As a compiler IR, HPVM aims to enable
                     effective code generation and optimization for
                     heterogeneous systems. As a virtual ISA, it can be used
                     to ship executable programs, in order to achieve both
                     functional portability and performance portability
                     across such systems. At runtime, HPVM enables flexible
                     scheduling policies, both through the graph structure
                     and the ability to compile individual nodes in a
                     program to any of the target devices on a system. We
                     have implemented a prototype HPVM system, defining the
                     HPVM IR as an extension of the LLVM compiler IR,
                     compiler optimizations that operate directly on HPVM
                     graphs, and code generators that translate the virtual
                     ISA to NVIDIA GPUs, Intel's AVX vector units, and to
                     multicore X86-64 processors. Experimental results show
                     that HPVM optimizations achieve significant performance
                     improvements, HPVM translators achieve performance
                     competitive with manually developed OpenCL code for
                     both GPUs and vector hardware, and that runtime
                     scheduling policies can make use of both program and
                     runtime information to exploit the flexible compilation
                     capabilities. Overall, we conclude that the HPVM
                     representation is a promising basis for achieving
                     performance portability and for implementing
                     parallelizing compilers for heterogeneous parallel
                     systems.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {PPoPP '18 proceedings.},
}

@Article{Li:2018:CER,
  author =       "Xiangbo Li and Mohsen Amini Salehi and Magdy Bayoumi
                 and Nian-Feng Tzeng and Rajkumar Buyya",
  title =        "Cost-Efficient and Robust On-Demand Video Transcoding
                 Using Heterogeneous Cloud Services",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "29",
  number =       "3",
  pages =        "556--571",
  month =        mar,
  year =         "2018",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2017.2766069",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Feb 15 06:03:25 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://ieeexplore.ieee.org/document/8081853/",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
}

@Article{Li:2018:COM,
  author =       "Shigang Li and Yunquan Zhang and Torsten Hoefler",
  title =        "Cache-Oblivious {MPI} All-to-All Communications Based
                 on {Morton} Order",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "29",
  number =       "3",
  pages =        "542--555",
  month =        mar,
  year =         "2018",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2017.2768413",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Feb 15 06:03:25 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://ieeexplore.ieee.org/document/8091010/",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
}

@Article{Liang:2018:FMP,
  author =       "Yun Liang and Shuo Wang and Wei Zhang",
  title =        "{FlexCL}: A Model of Performance and Power for
                 {OpenCL} Workloads on {FPGAs}",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "67",
  number =       "12",
  pages =        "1750--1764",
  month =        dec,
  year =         "2018",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2018.2840686",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Nov 8 07:18:03 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://ieeexplore.ieee.org/document/8365849/",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@article{Lin:2018:CHM,
  author          = {Han Lin and Zhichao Su and Xiandong Meng and Xu Jin and
                     Zhong Wang and Wenting Han and Hong An and
                     Mengxian Chi and Zheng Wu},
  title           = {Combining {Hadoop} with {MPI} to Solve Metagenomics
                     Problems that are both Data- and Compute-intensive},
  journal         = j-INT-J-PARALLEL-PROG,
  volume          = {46},
  number          = {4},
  pages           = {762--775},
  month           = aug,
  year            = {2018},
  coden           = {IJPPE5},
  doi             = {https://doi.org/10.1007/s10766-017-0524-z},
  issn            = {0885-7458 (print), 1573-7640 (electronic)},
  issn-l          = {0885-7458},
  bibdate         = {Fri Oct 11 08:37:50 MDT 2019},
  bibsource       = {http://link.springer.com/journal/10766/46/4;
                     http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of Parallel Programming},
  journal-url     = {http://link.springer.com/journal/10766},
}

@inproceedings{Malakhov:2018:CMT,
  author          = {Anton Malakhov and David Liu and Anton Gorshkov and
                     Terry Wilmarth},
  editor          = {Fatih Akici and David Lippa and Dillon Niederhut and
                     M. Pacer},
  booktitle       = {Proceedings of the {17th Python in Science Conference,
                     Austin, TX, 9--15 July 2018}},
  title           = {Composable Multi-Threading and Multi-Processing for
                     Numeric Libraries},
  publisher       = {????},
  address         = {????},
  pages           = {15--21},
  year            = {2018},
  bibdate         = {Wed Aug 1 09:03:36 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/python.bib},
  url             = {http://conference.scipy.org/proceedings/scipy2018/anton_malakhov.html},
  abstract        = {Python is popular among scientific communities that
                     value its simplicity and power, especially as it comes
                     along with numeric libraries such as NumPy, SciPy,
                     Dask, and Numba. As CPU core counts keep increasing,
                     these modules can make use of many cores via
                     multi-threading for efficient multi-core parallelism.
                     However, threads can interfere with each other leading
                     to overhead and inefficiency if used together in a
                     single application on machines with a large number of
                     cores. This performance loss can be prevented if all
                     multi-threaded modules are coordinated. This paper
                     continues the work started in AMala16 by introducing
                     more approaches to coordination for both
                     multi-threading and multi-processing cases. In
                     particular, we investigate the use of static settings,
                     limiting the number of simultaneously active OpenMP
                     parallel regions, and optional parallelism with Intel
                     Threading Building Blocks (Intel TBB). We will show how
                     these approaches help to unlock additional performance
                     for numeric applications on multi-core systems.},
  acknowledgement = ack-nhfb,
  keywords        = {Dask; GIL; Joblib; Multi-core; Multi-processing;
                     Multi-threading; Nested Parallelism; NumPy; OpenMP;
                     Oversubscription; Parallel Computations; Python; SciPy;
                     TBB},
}

@article{Maleki:2018:AHP,
  author          = {Sepideh Maleki and Martin Burtscher},
  title           = {Automatic Hierarchical Parallelization of Linear
                     Recurrences},
  journal         = j-SIGPLAN,
  volume          = {53},
  number          = {2},
  pages           = {128--138},
  month           = feb,
  year            = {2018},
  coden           = {SINODQ},
  doi             = {https://doi.org/10.1145/3296957.3173168},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Wed Oct 16 14:12:56 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {Linear recurrences encompass many fundamental
                     computations including prefix sums and digital filters.
                     Later result values depend on earlier result values in
                     recurrences, making it a challenge to compute them in
                     parallel. We present a new work- and space-efficient
                     algorithm to compute linear recurrences that is
                     amenable to automatic parallelization and suitable for
                     hierarchical massively-parallel architectures such as
                     GPUs. We implemented our approach in a domain-specific
                     code generator that emits optimized CUDA code. Our
                     evaluation shows that, for standard prefix sums and
                     single-stage IIR filters, the generated code reaches
                     the throughput of memory copy for large inputs, which
                     cannot be surpassed. On higher-order prefix sums, it
                     performs nearly as well as the fastest handwritten code
                     from the literature. On tuple-based prefix sums and
                     digital filters, our automatically parallelized code
                     outperforms the fastest prior implementations.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {ASPLOS '18 proceedings.},
}

@article{Malinowski:2018:SIP,
  author          = {Artur Malinowski and Pawel Czarnul},
  title           = {A Solution to Image Processing with Parallel {MPI} {I/O}
                     and Distributed {NVRAM} Cache},
  journal         = j-SCPE,
  volume          = {19},
  number          = {1},
  pages           = {1--14},
  month           = {????},
  year            = {2018},
  coden           = {????},
  issn            = {1895-1767},
  issn-l          = {1895-1767},
  bibdate         = {Mon Jan 7 06:46:50 MST 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/scpe.bib},
  url             = {https://www.scpe.org/index.php/scpe/article/view/1389},
  acknowledgement = ack-nhfb,
  fjournal        = {Scalable Computing: Practice and Experience},
  journal-url     = {http://www.scpe.org/},
}

@article{Moll:2018:PCF,
  author          = {Simon Moll and Sebastian Hack},
  title           = {Partial control-flow linearization},
  journal         = j-SIGPLAN,
  volume          = {53},
  number          = {4},
  pages           = {543--556},
  month           = apr,
  year            = {2018},
  coden           = {SINODQ},
  doi             = {https://doi.org/10.1145/3296979.3192413},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Wed Oct 16 14:12:57 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {If-conversion is a fundamental technique for
                     vectorization. It accounts for the fact that in a SIMD
                     program, several targets of a branch might be executed
                     because of divergence. Especially for irregular
                     data-parallel workloads, it is crucial to avoid
                     if-converting non-divergent branches to increase SIMD
                     utilization. In this paper, we present partial
                     linearization, a simple and efficient if-conversion
                     algorithm that overcomes several limitations of
                     existing if-conversion techniques. In contrast to prior
                     work, it has provable guarantees on which non-divergent
                     branches are retained and will never duplicate code or
                     insert additional branches. We show how our algorithm
                     can be used in a classic loop vectorizer as well as to
                     implement data-parallel languages such as ISPC or
                     OpenCL. Furthermore, we implement prior vectorizer
                     optimizations on top of partial linearization in a more
                     general way. We evaluate the implementation of our
                     algorithm in LLVM on a range of irregular data
                     analytics kernels, a neutronics simulation benchmark
                     and NAB, a molecular dynamics benchmark from SPEC2017
                     on AVX2, AVX512, and ARM Advanced SIMD machines and
                     report speedups of up to 146 \% over ICC, GCC and Clang
                     O3.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {PLDI '18 proceedings.},
}

@article{Monteiro:2018:EGC,
  author          = {Felipe R. Monteiro and Erickson H. da S. Alves and
                     Isabela S. Silva and Hussama I. Ismail and
                     Lucas C. Cordeiro and Eddie B. de Lima Filho},
  title           = {{ESBMC-GPU}: a context-bounded model checking tool to
                     verify {CUDA} programs},
  journal         = j-SCI-COMPUT-PROGRAM,
  volume          = {152},
  number          = {??},
  pages           = {63--69},
  day             = {15},
  month           = jan,
  year            = {2018},
  coden           = {SCPGD4},
  issn            = {0167-6423 (print), 1872-7964 (electronic)},
  issn-l          = {0167-6423},
  bibdate         = {Sat Dec 2 17:23:38 MST 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/scicomputprogram.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167642317301934},
  acknowledgement = ack-nhfb,
  fjournal        = {Science of Computer Programming},
  journal-url     = {http://www.sciencedirect.com/science/journal/01676423},
}

@Article{Peng:2018:CDC,
  author =       "Yuanfeng Peng and Vinod Grover and Joseph Devietti",
  title =        "{CURD}: a dynamic {CUDA} race detector",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "4",
  pages =        "390--403",
  month =        apr,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3296979.3192368",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:57 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "As GPUs have become an integral part of nearly every
                 processor, GPU programming has become increasingly
                 popular. GPU programming requires a combination of
                 extreme levels of parallelism and low-level
                 programming, making it easy for concurrency bugs such
                 as data races to arise. These concurrency bugs can be
                 extremely subtle and difficult to debug due to the
                 massive numbers of threads running concurrently on a
                 modern GPU. While some tools exist to detect data races
                 in GPU programs, they are often prohibitively slow or
                 focused only on a small class of data races in shared
                 memory. Compared to prior work, our race detector,
                 CURD, can detect data races precisely on both shared
                 and global memory, selects an appropriate race
                 detection algorithm based on the synchronization used
                 in a program, and utilizes efficient compiler
                 instrumentation to reduce performance overheads. Across
                 53 benchmarks, we find that using CURD incurs an average
                 slowdown of just 2.88x over native execution. CURD
                 is 2.1x faster than Nvidia's CUDA-Racecheck race
                 detector, despite detecting a much broader class of
                 races. CURD finds 35 races across our benchmarks,
                 including bugs in established benchmark suites and in
                 sample programs from Nvidia.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '18 proceedings.",
}

@Article{Pessoa:2018:GAB,
  author =       "Tiago Carneiro Pessoa and Jan Gmys and Francisco Heron
                 de Carvalho J{\'u}nior and Nouredine Melab and Daniel
                 Tuyttens",
  title =        "{GPU}-accelerated backtracking using {CUDA Dynamic
                 Parallelism}",
  journal =      j-CCPE,
  volume =       "30",
  number =       "9",
  pages =        "e4374:1--e4374:??",
  day =          "10",
  month =        may,
  year =         "2018",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4374",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Aug 4 10:03:13 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.4374",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Pierro:2018:SFP,
  author          = {Vincenzo Pierro and Luigi Troiano and Elena Mejuto and
                     Giovanni Filatrella},
  title           = {Stochastic first passage time accelerated with
                     {CUDA}},
  journal         = j-J-COMPUT-PHYS,
  volume          = {361},
  number          = {??},
  pages           = {136--149},
  day             = {15},
  month           = may,
  year            = {2018},
  CODEN           = {JCTPAH},
  DOI             = {https://doi.org/10.1016/j.jcp.2018.01.039},
  ISSN            = {0021-9991 (print), 1090-2716 (electronic)},
  ISSN-L          = {0021-9991},
  bibdate         = {Wed Mar 21 16:14:42 MDT 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jcomputphys2015.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0021999118300494},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Computational Physics},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00219991},
}

@Article{Pinho:2018:CTM,
  author          = {Luis Miguel Pinho and Eduardo Qui{\~n}ones and Sara
                     Royuela},
  title           = {Combining the tasklet model with {OpenMP}},
  journal         = j-SIGADA-LETTERS,
  volume          = {38},
  number          = {1},
  pages           = {14--18},
  month           = jun,
  year            = {2018},
  CODEN           = {AALEE5},
  DOI             = {https://doi.org/10.1145/3241950.3241952},
  ISSN            = {0736-721X},
  bibdate         = {Sat Oct 19 17:57:55 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigada.bib},
  abstract        = {Previous workshops have discussed a proposal to
                     augment Ada with fine-grained parallelism, based on the
                     notion of tasklets, a lightweight parallel entity.
                     Recent works have shown the convergence of this model
                     with the OpenMP tasking model and have proposed their
                     coexistence. In this paper we provide a status of the
                     existent works, and describe how these models could be
                     combined.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGADA Ada Letters},
  journal-URL     = {http://portal.acm.org/citation.cfm?id=J32},
}

@Article{Poirier:2018:DAB,
  author          = {Carl Poirier and Benoit Gosselin and Paul Fortier},
  title           = {{DNA} Assembly with {de Bruijn} Graphs Using an {FPGA}
                     Platform},
  journal         = j-TCBB,
  volume          = {15},
  number          = {3},
  pages           = {1003--1009},
  month           = may,
  year            = {2018},
  CODEN           = {ITCBCY},
  DOI             = {https://doi.org/10.1109/TCBB.2017.2696522},
  ISSN            = {1545-5963 (print), 1557-9964 (electronic)},
  ISSN-L          = {1545-5963},
  bibdate         = {Sat Jun 30 09:34:37 MDT 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/tcbb.bib},
  abstract        = {This paper presents an FPGA implementation of a DNA
                     assembly algorithm, called Ray, initially developed to
                     run on parallel CPUs. The OpenCL language is used and
                     the focus is placed on modifying and optimizing the
                     original algorithm to better suit the new
                     parallelization tool and the radically different
                     hardware architecture. The results show that the
                     execution time is roughly one fourth that of the CPU
                     and factoring energy consumption yields a tenfold
                     savings.},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE/ACM Transactions on Computational Biology and
                     Bioinformatics},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J954},
}

@Article{Prabhu:2018:DRC,
  author          = {Tarun Prabhu and William Gropp},
  title           = {{DAME}: Runtime-compilation for data movement},
  journal         = j-IJHPCA,
  volume          = {32},
  number          = {5},
  pages           = {760--774},
  year            = {2018},
  CODEN           = {IHPCFL},
  DOI             = {https://doi.org/10.1177/1094342017695444},
  ISSN            = {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L          = {1094-3420},
  bibdate         = {Mon Nov 5 17:34:17 MST 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://journals.sagepub.com/doi/full/10.1177/1094342017695444},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of High Performance Computing
                     Applications},
  journal-URL     = {https://journals.sagepub.com/home/hpc},
  xxmonth         = sep,
}

@Article{Ramesh:2018:MPE,
  author =       "Srinivasan Ramesh and Aur{\`e}le Mah{\'e}o and Sameer
                 Shende and Allen D. Malony and Hari Subramoni and Amit
                 Ruhela and Dhabaleswar K. Panda",
  title =        "{MPI} performance engineering with the {MPI} tool
                 interface: the integration of {MVAPICH} and {TAU}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "77",
  number =       "??",
  pages =        "19--37",
  month =        sep,
  year =         "2018",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2018.05.003",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Jan 7 15:25:20 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819118301479",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Rasch:2018:MDH,
  author          = {Ari Rasch and Sergei Gorlatch},
  title           = {Multi-dimensional Homomorphisms and Their
                     Implementation in {OpenCL}},
  journal         = j-INT-J-PARALLEL-PROG,
  volume          = {46},
  number          = {1},
  pages           = {101--119},
  month           = feb,
  year            = {2018},
  CODEN           = {IJPPE5},
  DOI             = {https://doi.org/10.1007/s10766-017-0508-z},
  ISSN            = {0885-7458 (print), 1573-7640 (electronic)},
  ISSN-L          = {0885-7458},
  bibdate         = {Sun Feb 11 08:41:13 MST 2018},
  bibsource       = {http://link.springer.com/journal/10766/46/1;
                     http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of Parallel Programming},
  journal-URL     = {http://link.springer.com/journal/10766},
}

@Article{Riebler:2018:ACA,
  author =       "Heinrich Riebler and Gavin Vaz and Tobias Kenter and
                 Christian Plessl",
  title =        "Automated code acceleration targeting heterogeneous
                 {OpenCL} devices",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "1",
  pages =        "417--418",
  month =        jan,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3200691.3178534",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:56 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Accelerators can offer exceptional performance
                 advantages. However, programmers need to spend
                 considerable efforts on acceleration, without knowing
                 how sustainable the employed programming models,
                 languages and tools are. To tackle this challenge, we
                 propose and demonstrate a new runtime system called
                 HTrOP that is able to automatically generate and execute
                 OpenCL code from sequential CPU code. HTrOP transforms
                 suitable data-parallel loops into independent
                 OpenCL-typical work-items and handles concrete calls to
                 these devices through a mix of library components and
                 application-specific OpenCL host code. Computational
                 hotspots are identified and can be offloaded to
                 different resources (CPU, GPGPU and Xeon Phi). We
                 demonstrate the potential of HTrOP on a broad set of
                 applications and are able to improve the performance by
                 4.3X on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '18 proceedings.",
}

@Article{Rivas-Gomez:2018:MWS,
  author          = {Sergio Rivas-Gomez and Roberto Gioiosa and Ivy Bo Peng
                     and Gokcen Kestor and Sai Narasimhamurthy and Erwin
                     Laure and Stefano Markidis},
  title           = {{MPI} windows on storage for {HPC} applications},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {77},
  number          = {??},
  pages           = {38--56},
  month           = sep,
  year            = {2018},
  CODEN           = {PACOEJ},
  DOI             = {https://doi.org/10.1016/j.parco.2018.05.007},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Mon Jan 7 15:25:20 MST 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0167819118301571},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Rucci:2018:OOS,
  author          = {Enzo Rucci and Carlos Garcia and Guillermo Botella and
                     Armando E. {De Giusti} and Marcelo Naiouf and Manuel
                     Prieto-Matias},
  title           = {{OSWALD}: {OpenCL} {Smith--Waterman} on {Altera}'s
                     {FPGA} for Large Protein Databases},
  journal         = j-IJHPCA,
  volume          = {32},
  number          = {3},
  pages           = {337--350},
  month           = may,
  year            = {2018},
  CODEN           = {IHPCFL},
  ISSN            = {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L          = {1094-3420},
  bibdate         = {Mon Nov 5 17:34:16 MST 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of High Performance Computing
                     Applications},
  journal-URL     = {http://hpc.sagepub.com/content/by/year},
}

@Article{Schmitt:2018:RHG,
  author =       "Christian Schmitt and Moritz Schmid and Sebastian
                 Kuckuk and Harald K{\"o}stler and J{\"u}rgen Teich and
                 Frank Hannig",
  title =        "Reconfigurable Hardware Generation of Multigrid
                 Solvers with Conjugate Gradient Coarse-Grid Solution",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "28",
  number =       "04",
  pages =        "??--??",
  month =        dec,
  year =         "2018",
  DOI =          "https://doi.org/10.1142/S0129626418500160",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Mon Mar 29 12:30:05 MDT 2021",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.worldscientific.com/doi/10.1142/S0129626418500160",
  abstract =     "Not only in the field of high-performance computing
                 (HPC), field programmable gate arrays (FPGAs) are a
                 soaringly popular accelerator technology. However, they
                 use a completely different programming paradigm and
                 tool set compared to central processing units (CPUs) or
                 even graphics processing units (GPUs), adding extra
                 development steps and requiring special knowledge,
                 hindering widespread use in scientific computing. To
                 bridge this programmability gap, domain-specific
                 languages (DSLs) are a popular choice to generate
                 low-level implementations from an abstract algorithm
                 description. In this work, we demonstrate our approach
                 for the generation of numerical solver implementations
                 based on the multigrid method for FPGAs from the same
                 code base that is also used to generate code for CPUs
                 using a hybrid parallelization of MPI and OpenMP. Our
                 approach yields in a hardware design that can compute
                 up to 11 V-cycles per second with an input grid size of
                 4096 {\texttimes} 4096 and
                 solution on the coarsest using the conjugate gradient
                 (CG) method on a mid-range FPGA, beating vectorized,
                 multi-threaded execution on an Intel Xeon processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Si:2018:DAA,
  author =       "Min Si and Antonio J. Pe{\~n}a and Jeff Hammond and
                 Pavan Balaji and Masamichi Takagi and Yutaka Ishikawa",
  title =        "Dynamic Adaptable Asynchronous Progress Model for
                 {MPI} {RMA} Multiphase Applications",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "29",
  number =       "9",
  pages =        "1975--1989",
  month =        sep,
  year =         "2018",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2018.2815568",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Aug 9 10:52:00 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2018/09/08315136-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Snir:2018:FMT,
  author          = {Marc Snir},
  title           = {The future of {MPI}: technical perspective},
  journal         = j-CACM,
  volume          = {61},
  number          = {10},
  pages           = {105--105},
  month           = oct,
  year            = {2018},
  CODEN           = {CACMA2},
  DOI             = {https://doi.org/10.1145/3264415},
  ISSN            = {0001-0782 (print), 1557-7317 (electronic)},
  ISSN-L          = {0001-0782},
  bibdate         = {Thu Sep 27 11:55:45 MDT 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/cacm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {https://cacm.acm.org/magazines/2018/10/231376/fulltext},
  acknowledgement = ack-nhfb,
  fjournal        = {Communications of the ACM},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J79},
}

@Article{Sojka:2018:IEM,
  author =       "Radim Sojka and David Hor{\'a}k and V{\'a}clav Hapla
                 and Martin {\v{C}}erm{\'a}k",
  title =        "The impact of enabling multiple subdomains per {MPI}
                 process in the {TFETI} domain decomposition method",
  journal =      j-APPL-MATH-COMP,
  volume =       "319",
  number =       "??",
  pages =        "586--597",
  day =          "15",
  month =        feb,
  year =         "2018",
  CODEN =        "AMHCBQ",
  DOI =          "https://doi.org/10.1016/j.amc.2017.07.031",
  ISSN =         "0096-3003 (print), 1873-5649 (electronic)",
  ISSN-L =       "0096-3003",
  bibdate =      "Wed Nov 15 17:37:14 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/applmathcomput2015.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0096300317304927",
  acknowledgement = ack-nhfb,
  fjournal =     "Applied Mathematics and Computation",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00963003",
}

@Article{Sotiriou-Xanthopoulos:2018:OBV,
  author =       "Efstathios Sotiriou-Xanthopoulos and Leonard Masing
                 and Sotirios Xydis and Kostas Siozios and J{\"u}rgen
                 Becker and Dimitrios Soudris",
  title =        "{OpenCL}-based Virtual Prototyping and Simulation of
                 Many-Accelerator Architectures",
  journal =      j-TECS,
  volume =       "17",
  number =       "5",
  pages =        "86:1--86:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3242179",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Oct 17 18:16:41 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tecs.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3242179",
  abstract =     "Heterogeneous architectures featuring multiple
                 hardware accelerators have been proposed as a promising
                 solution for meeting the ever-increasing performance
                 and power requirements of embedded systems. However,
                 the existence of numerous design parameters may result
                 in different architectural schemes and thus in extra
                 design effort. To address this issue, OpenCL-based
                 frameworks have been recently utilized for FPGA
                 programming, to enable the portability of a source code
                 to multiple architectures. However, such OpenCL
                 frameworks focus on RTL design, thus not enabling rapid
                 prototyping and abstracted modeling of complex systems.
                 Virtual Prototyping aims to overcome this problem by
                 enabling the system modeling in higher abstraction
                 levels. This article combines the benefits of OpenCL
                 and Virtual Prototyping, by proposing an OpenCL-based
                 prototyping framework for data-parallel
                 many-accelerator systems, which (a) creates a SystemC
                 Virtual Platform from OpenCL, (b) provides a
                 co-simulation environment for the host and the Virtual
                 Platform, (c) offers memory and interconnection models
                 for parallel data processing, and (d) enables the
                 system evaluation with alternative real number
                 representations (e.g., fixed-point or 16-bit
                 floating-point).",
  acknowledgement = ack-nhfb,
  articleno =    "86",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Stpiczynski:2018:LBV,
  author          = {Przemys{\l}aw Stpiczy{\'n}ski},
  title           = {Language-based vectorization and parallelization using
                     intrinsics, {OpenMP}, {TBB} and {Cilk Plus}},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {74},
  number          = {4},
  pages           = {1461--1472},
  month           = apr,
  year            = {2018},
  CODEN           = {JOSUED},
  DOI             = {https://doi.org/10.1007/s11227-017-2231-3},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  bibdate         = {Thu Oct 10 15:31:11 MDT 2019},
  bibsource       = {http://link.springer.com/journal/11227/74/4;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://link.springer.com/content/pdf/10.1007/s11227-017-2231-3.pdf},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
}

@Article{Tagliavini:2018:UFG,
  author =       "Giuseppe Tagliavini and Daniele Cesarini and Andrea
                 Marongiu",
  title =        "Unleashing Fine-Grained Parallelism on Embedded
                 Many-Core Accelerators with Lightweight {OpenMP}
                 Tasking",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "29",
  number =       "9",
  pages =        "2150--2163",
  month =        sep,
  year =         "2018",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2018.2814602",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Aug 9 10:52:00 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2018/09/08314096-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Tellez-Velazquez:2018:CSI,
  author =       "Arturo T{\'e}llez-Vel{\'a}zquez and Ra{\'u}l
                 Cruz-Barbosa",
  title =        "A {CUDA}-streams inference machine for non-singleton
                 fuzzy systems",
  journal =      j-CCPE,
  volume =       "30",
  number =       "8",
  pages =        "e4382:1--e4382:??",
  day =          "25",
  month =        apr,
  year =         "2018",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4382",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Aug 4 10:03:13 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.4382",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Tong:2018:FCM,
  author          = {Zhou Tong and Scott Pakin and Michael Lang and Xin
                     Yuan},
  title           = {Fast classification of {MPI} applications using
                     {Lamport}'s logical clocks},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {120},
  number          = {??},
  pages           = {77--88},
  month           = oct,
  year            = {2018},
  CODEN           = {JPDCER},
  DOI             = {https://doi.org/10.1016/j.jpdc.2018.05.005},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  bibdate         = {Fri Aug 10 09:10:45 MDT 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S074373151830340X},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Tracy:2018:CMC,
  author          = {Fred Thomas Tracy and Thomas C. Oppe and Maureen K.
                     Corcoran},
  title           = {A comparison of {MPI} and co-array {FORTRAN} for large
                     finite element variably saturated flow simulations},
  journal         = j-SCPE,
  volume          = {19},
  number          = {4},
  pages           = {423--432},
  month           = {????},
  year            = {2018},
  CODEN           = {????},
  ISSN            = {1895-1767},
  ISSN-L          = {1895-1767},
  bibdate         = {Mon Jan 7 06:46:51 MST 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/scpe.bib},
  URL             = {https://www.scpe.org/index.php/scpe/article/view/1468},
  acknowledgement = ack-nhfb,
  fjournal        = {Scalable Computing: Practice and Experience},
  journal-URL     = {http://www.scpe.org/},
}

@Article{Valero-Lara:2018:CCC,
  author          = {Pedro Valero-Lara and Ivan Mart{\'\i}nez-P{\'e}rez and
                     Ra{\"u}l Sirvent and Xavier Martorell and Antonio J.
                     Pe{\~n}a},
  title           = {{cuThomasBatch} and {cuThomasVBatch}, {CUDA} routines
                     to compute batch of tridiagonal systems on {NVIDIA
                     GPUs}},
  journal         = j-CCPE,
  volume          = {30},
  number          = {24},
  pages           = {e4909:1--e4909:??},
  day             = {25},
  month           = dec,
  year            = {2018},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.4909},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Thu Mar 28 08:07:53 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Concurrency and Computation: Practice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {27 August 2018},
}

@Article{Villaverde:2018:PTI,
  author =       "Alejandro F. Villaverde and Kolja Becker and Julio R.
                 Banga",
  title =        "{PREMER}: a Tool to Infer Biological Networks",
  journal =      j-TCBB,
  volume =       "15",
  number =       "4",
  pages =        "1193--1202",
  month =        jul,
  year =         "2018",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2017.2758786",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Thu Nov 8 06:18:45 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/matlab.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/python.bib;
                 http://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  abstract =     "Inferring the structure of unknown cellular networks
                 is a main challenge in computational biology.
                 Data-driven approaches based on information theory can
                 determine the existence of interactions among network
                 nodes automatically. However, the elucidation of
                 certain features---such as distinguishing between
                 direct and indirect interactions or determining the
                 direction of a causal link---requires estimating
                 information-theoretic quantities in a multidimensional
                 space. This can be a computationally demanding task,
                 which acts as a bottleneck for the application of
                 elaborate algorithms to large-scale network inference
                 problems. The computational cost of such calculations
                 can be alleviated by the use of compiled programs and
                 parallelization. To this end, we have developed PREMER
                 (Parallel Reverse Engineering with Mutual information
                 \& Entropy Reduction), a software toolbox that can run
                 in parallel and sequential environments. It uses
                 information theoretic criteria to recover network
                 topology and determine the strength and causality of
                 interactions, and allows incorporating prior knowledge,
                 imputing missing data, and correcting outliers. PREMER
                 is a free, open source software tool that does not
                 require any commercial software. Its core algorithms
                 are programmed in FORTRAN 90 and implement OpenMP
                 directives. It has user interfaces in Python and
                 MATLAB/Octave, and runs on Windows, Linux, and OSX
                 (https://sites.google.com/site/premertoolbox/).",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J954",
}

@Article{Wolfe:2018:MLS,
  author          = {Noah Wolfe and Misbah Mubarak and Christopher D.
                     Carothers and Robert B. Ross and Philip H. Carns},
  title           = {Modeling Large-Scale Slim Fly Networks Using Parallel
                     Discrete-Event Simulation},
  journal         = j-TOMACS,
  volume          = {28},
  number          = {4},
  pages           = {29:1--29:??},
  month           = oct,
  year            = {2018},
  CODEN           = {ATMCEZ},
  DOI             = {https://doi.org/10.1145/3203406},
  ISSN            = {1049-3301 (print), 1558-1195 (electronic)},
  ISSN-L          = {1049-3301},
  bibdate         = {Mon Feb 4 19:19:05 MST 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/tomacs.bib},
  abstract        = {As supercomputers approach exascale performance, the
                     increased number of processors translates to an
                     increased demand on the underlying network
                     interconnect. The slim fly network topology, a new
                     low-diameter, low-latency, and low-cost interconnection
                     network, is gaining interest as one possible solution
                     for next-generation supercomputing interconnect
                     systems. In this article, we present a high-fidelity
                     slim fly packet-level model leveraging the Rensselaer
                     Optimistic Simulation System (ROSS) and Co-Design of
                     Exascale Storage (CODES) frameworks. We validate the
                     model with published work before scaling the network
                     size up to an unprecedented 1 million compute nodes and
                     confirming that the slim fly observes peak network
                     throughput at extreme scale. In addition to synthetic
                     workloads, we evaluate large-scale slim fly models with
                     real communication workloads from applications in the
                     Design Forward program with over 110,000 MPI processes.
                     We show strong scaling of the slim fly model on an
                     Intel cluster achieving a peak network packet transfer
                     rate of 2.3 million packets per second and processing
                     over 7 billion discrete events using 128 MPI tasks.
                     Enabled by the strong performance capabilities of the
                     model, we perform a detailed application trace and
                     routing protocol performance study. Through analysis of
                     metrics such as packet latency, hop count, and
                     congestion, we find that the slim fly network is able
                     to leverage simple minimal routing and achieve the same
                     performance as more complex adaptive routing for tested
                     DOE benchmark applications.},
  acknowledgement = ack-nhfb,
  articleno       = {29},
  fjournal        = {ACM Transactions on Modeling and Computer Simulation},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J781},
}

@Article{Wolfe:2018:ODM,
  author =       "Michael Wolfe and Seyong Lee and Jungwon Kim and
                 Xiaonan Tian and Rengan Xu and Barbara Chapman and
                 Sunita Chandrasekaran",
  title =        "The {OpenACC} data model: Preliminary study on its
                 major challenges and implementations",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "78",
  number =       "??",
  pages =        "15--27",
  month =        oct,
  year =         "2018",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2018.07.003",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Jan 7 15:25:20 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819118302175",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Yamazaki:2018:SIL,
  author =       "Ichitaro Yamazaki and Jakub Kurzak and Panruo Wu and
                 Mawussi Zounon and Jack Dongarra",
  title =        "Symmetric Indefinite Linear Solver Using {OpenMP} Task
                 on Multicore Architectures",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "29",
  number =       "8",
  pages =        "1879--1892",
  month =        aug,
  year =         "2018",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2018.2808964",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Jul 25 09:07:14 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2018/08/08301559-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Yviquel:2018:CPU,
  author =       "Herv{\'e} Yviquel and Lauro Cruz and Guido Araujo",
  title =        "Cluster Programming using the {OpenMP} Accelerator
                 Model",
  journal =      j-TACO,
  volume =       "15",
  number =       "3",
  pages =        "35:1--35:??",
  month =        oct,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3226112",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3226112",
  abstract =     "Computation offloading is a programming model in which
                 program fragments (e.g., hot loops) are annotated so
                 that their execution is performed in dedicated hardware
                 or accelerator devices. Although offloading has been
                 extensively used to move computation to GPUs, through
                 directive-based annotation standards like OpenMP,
                 offloading computation to very large computer clusters
                 can become a complex and cumbersome task. It typically
                 requires mixing programming models (e.g., OpenMP and
                 MPI) and languages (e.g., C/C++ and Scala), dealing
                 with various access control mechanisms from different
                 cloud providers (e.g., AWS and Azure), and integrating
                 all this into a single application. This article
                 introduces computer cluster nodes as simple OpenMP
                 offloading devices that can be used either from a local
                 computer or from the cluster head-node. It proposes a
                 methodology that transforms OpenMP directives to Spark
                 runtime calls with fully integrated communication
                 management, in a way that a cluster appears to the
                 programmer as yet another accelerator device.
                 Experiments using LLVM 3.8, OpenMP 4.5 on well known
                 cloud infrastructures (Microsoft Azure and Amazon EC2)
                 show the viability of the proposed approach, enable a
                 thorough analysis of its performance, and make a
                 comparison with an MPI implementation. The results show
                 that although data transfers can impose overheads,
                 cloud offloading from a local machine can still achieve
                 promising speedups for larger granularity: up to 115$
                 \times $ in 256 cores for the 2MM benchmark using 1GB
                 sparse matrices. In addition, the parallel
                 implementation of a complex and relevant scientific
                 application reveals a 80$ \times $ speedup on a 320
                 core machine when executed directly from the headnode
                 of the cluster.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zha:2018:LSM,
  author =       "Yue Zha and Jing Li",
  title =        "{Liquid Silicon-Monona}: a Reconfigurable
                 Memory-Oriented Computing Fabric with Scalable
                 Multi-Context Support",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "2",
  pages =        "214--228",
  month =        feb,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3296957.3173167",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:56 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "With the recent trend of promoting Field-Programmable
                 Gate Arrays (FPGAs) to first-class citizens in
                 accelerating compute-intensive applications in
                 networking, cloud services and artificial intelligence,
                 FPGAs face two major challenges in sustaining
                 competitive advantages in performance and energy
                 efficiency for diverse cloud workloads: (1) limited
                 configuration capability for supporting light-weight
                 computations/on-chip data storage to accelerate
                 emerging search-/data-intensive applications. (2) lack
                 of architectural support to hide reconfiguration
                 overhead for assisting virtualization in a cloud
                 computing environment. In this paper, we propose a
                 reconfigurable memory-oriented computing fabric, namely
                 Liquid Silicon-Monona (L-Si), enabled by emerging
                 nonvolatile memory technology i.e. RRAM, to address
                 these two challenges. Specifically, L-Si addresses the
                 first challenge by virtue of a new architecture
                 comprising a 2D array of physically identical but
                 functionally-configurable building blocks. It, for the
                 first time, extends the configuration capabilities of
                 existing FPGAs from computation to the whole spectrum
                 ranging from computation to data storage. It allows
                 users to better customize hardware by flexibly
                 partitioning hardware resources between computation and
                 memory, greatly benefiting emerging search- and
                 data-intensive applications. To address the second
                 challenge, L-Si provides scalable multi-context
                 architectural support to minimize reconfiguration
                 overhead for assisting virtualization. In addition, we
                 provide compiler support to facilitate the programming
                 of applications written in high-level programming
                 languages (e.g. OpenCL) and frameworks (e.g.
                 TensorFlow, MapReduce) while fully exploiting the
                 unique architectural capability of L-Si. Our evaluation
                 results show L-Si achieves 99.6\% area reduction, 1.43$
                 \times $ throughput improvement and 94.0\% power
                 reduction on search-intensive benchmarks, as compared
                 with the FPGA baseline. For neural network benchmarks,
                 on average, L-Si achieves 52.3$ \times $ speedup,
                 113.9$ \times $ energy reduction and 81\% area
                 reduction over the FPGA baseline. In addition, the
                 multi-context architecture of L-Si reduces the context
                 switching time to $ \sim $10ns, compared with an
                 off-the-shelf FPGA ($ \sim $100ms), greatly facilitating
                 virtualization.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '18 proceedings.",
}

@Article{Zhang:2018:IRP,
  author =       "Xuechen Zhang and Song Jiang and Alseny Diallo and Lei
                 Wang",
  title =        "{IR+}: Removing parallel {I/O} interference of {MPI}
                 programs via data replication over heterogeneous
                 storage devices",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "76",
  number =       "??",
  pages =        "91--105",
  month =        aug,
  year =         "2018",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2018.01.004",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Jun 4 07:40:18 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819118300140",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Adam:2019:CRA,
  author =       "Julien Adam and Maxime Kermarquer and Jean-Baptiste
                 Besnard and Leonardo Bautista-Gomez and Marc
                 P{\'e}rache and Patrick Carribault and Julien Jaeger
                 and Allen D. Malony and Sameer Shende",
  title =        "Checkpoint\slash restart approaches for a thread-based
                 {MPI} runtime",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "85",
  number =       "??",
  pages =        "204--219",
  month =        jul,
  year =         "2019",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2019.02.006",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Oct 14 16:20:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819118303247",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Al-Shorman:2019:UPP,
  author =       "Mohammad Y. Al-Shorman and Majd M. Al-Kofahi",
  title =        "Ultrasonic pulse propagation simulation using {OpenCL}
                 for environment mapping and discovery",
  journal =      j-IJHPCA,
  volume =       "33",
  number =       "5",
  pages =        "1019--1029",
  day =          "1",
  month =        sep,
  year =         "2019",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342019846290",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Wed Oct 9 14:35:54 MDT 2019",
  bibsource =    "http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://journals.sagepub.com/doi/full/10.1177/1094342019846290",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "https://journals.sagepub.com/home/hpc",
}

@Article{Awan:2019:OLM,
  author =       "Ammar Ahmad Awan and Karthik Vadambacheri Manian and
                 Ching-Hsiang Chu and Hari Subramoni and Dhabaleswar K.
                 Panda",
  title =        "Optimized large-message broadcast for deep learning
                 workloads: {MPI}, {MPI + NCCL}, or {NCCL2}?",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "85",
  number =       "??",
  pages =        "141--152",
  month =        jul,
  year =         "2019",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Oct 14 16:20:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819118303284",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Badia:2019:ASP,
  author =       "Jose M. Bad{\'{\i}}a and Jose A. Belloch and Maximo
                 Cobos and Francisco D. Igual and Enrique S.
                 Quintana-Ort{\'{\i}}",
  title =        "Accelerating the {SRP--PHAT} algorithm on multi- and
                 many-core platforms using {OpenCL}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "75",
  number =       "3",
  pages =        "1284--1297",
  month =        mar,
  year =         "2019",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-018-2422-6",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Thu Oct 10 15:31:17 MDT 2019",
  bibsource =    "http://link.springer.com/journal/11227/75/3;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Balaji:2019:SIM,
  author =       "Pavan Balaji and Marc Casas",
  title =        "Special issue on the Message Passing Interface",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "86",
  number =       "??",
  pages =        "14--15",
  month =        aug,
  year =         "2019",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Oct 14 16:20:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S016781911930095X",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Boschetti:2019:MOD,
  author =       "Marco Antonio Boschetti and Vittorio Maniezzo and
                 Francesco Strappaveccia",
  title =        "Membership overlay design optimization with resource
                 constraints (accelerated on {GPU})",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "133",
  number =       "??",
  pages =        "286--296",
  month =        nov,
  year =         "2019",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Sep 13 10:25:21 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731518304908",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Brown:2019:LMR,
  author =       "Nick Brown and Michael Bareford and Mich{\`e}le
                 Weiland",
  title =        "Leveraging {MPI} {RMA} to optimize halo-swapping
                 communications in {MONC} on {Cray} machines",
  journal =      j-CCPE,
  volume =       "31",
  number =       "16",
  pages =        "e5008:1--e5008:??",
  day =          "25",
  month =        aug,
  year =         "2019",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.5008",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Oct 12 11:00:04 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/super.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "25 September 2018",
}

@Article{Budiardja:2019:TGO,
  author =       "Reuben D. Budiardja and Christian Y. Cardall",
  title =        "Targeting {GPUs} with {OpenMP} directives on {Summit}:
                 a simple and effective {Fortran} experience",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "88",
  number =       "??",
  pages =        "Article 102544",
  month =        "????",
  year =         "2019",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Oct 14 16:20:02 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819119301358",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Cadenelli:2019:CUO,
  author =       "Nicola Cadenelli and Zoran Jak{\v{s}}i{\'c} and
                 Jord{\`a} Polo and David Carrera",
  title =        "Considerations in using {OpenCL} on {GPUs} and {FPGAs}
                 for throughput-oriented genomics workloads",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "94",
  number =       "??",
  pages =        "148--159",
  month =        may,
  year =         "2019",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Mon Oct 14 16:09:56 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167739X18314183",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Candel:2019:EMC,
  author =       "F. Candel and A. Valero and S. Petit and J.
                 Sahuquillo",
  title =        "Efficient Management of Cache Accesses to Boost
                 {GPGPU} Memory Subsystem Performance",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "68",
  number =       "10",
  pages =        "1442--1454",
  month =        oct,
  year =         "2019",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2019.2907591",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Sep 12 13:33:25 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
  keywords =     "Analytical models; Energy consumption; GPU; Graphics
                 processing units; Instruction sets; memory hierarchy;
                 Memory management; miss management; Proposals",
}

@Article{Chen:2019:STG,
  author =       "Yong Chen and Weijia Shang",
  title =        "Supernode transformation on {GPGPUs}",
  journal =      j-INT-J-PAR-EMER-DIST-SYS,
  volume =       "34",
  number =       "2",
  pages =        "181--202",
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1080/17445760.2017.1296147",
  ISSN =         "1744-5760 (print), 1744-5779 (electronic)",
  ISSN-L =       "1744-5760",
  bibdate =      "Tue Sep 10 15:30:02 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/intjparemerdistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.tandfonline.com/toc/gpaa20/34/2",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel, Emergent and
                 Distributed Systems: IJPEDS",
  journal-URL =  "http://www.tandfonline.com/loi/gpaa20",
  onlinedate =   "06 Apr 2018",
}

@Article{Chikin:2019:MAA,
  author =       "Artem Chikin and Taylor Lloyd and Jos{\'e} Nelson
                 Amaral and Ettore Tiotto and Muhammad Usman",
  title =        "Memory-access-aware Safety and Profitability Analysis
                 for Transformation of Accelerator-bound {OpenMP}
                 Loops",
  journal =      j-TACO,
  volume =       "16",
  number =       "3",
  pages =        "30:1--30:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3333060",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Iteration Point Difference Analysis is a new static
                 analysis framework that can be used to determine the
                 memory coalescing characteristics of parallel loops
                 that target GPU offloading and to ascertain safety and
                 profitability of loop transformations with the goal of
                 improving their memory access characteristics. This
                 analysis can propagate definitions through control
                 flow, works for non-affine expressions, and is capable
                 of analyzing expressions that reference conditionally
                 defined values. This analysis framework enables safe
                 and profitable loop transformations. Experimental
                 results demonstrate potential for dramatic performance
                 improvements. GPU kernel execution time across the
                 Polybench suite is improved by up to $ 25.5 \times $ on
                 an Nvidia P100 with benchmark overall improvement of up
                 to $ 3.2 \times $. An opportunity detected in a SPEC
                 ACCEL benchmark yields kernel speedup of $ 86.5 \times
                 $ with a benchmark improvement of $ 3.3 \times $. This
                 work also demonstrates how architecture-aware compilers
                 improve code portability and reduce programmer
                 effort.",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ciglaric:2019:OLP,
  author =       "Tadej Ciglari{\v{c}} and Rok {\v{C}}e{\v{s}}novar and
                 Erik {\v{S}}trumbelj",
  title =        "An {OpenCL} library for parallel random number
                 generators",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "75",
  number =       "7",
  pages =        "3866--3881",
  month =        jul,
  year =         "2019",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-019-02756-2",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Thu Oct 10 15:31:20 MDT 2019",
  bibsource =    "http://link.springer.com/journal/11227/75/7;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Clauser:2019:FFO,
  author =       "C. F. Clauser and R. Farengo and H. E. Ferrari",
  title =        "{FOCUS}: a full-orbit {CUDA} solver for particle
                 simulations in magnetized plasmas",
  journal =      j-COMP-PHYS-COMM,
  volume =       "234",
  number =       "??",
  pages =        "126--136",
  month =        jan,
  year =         "2019",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2018.07.018",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Tue Oct 16 18:11:50 MDT 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465518302753",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Coronado-Barrientos:2019:ANF,
  author =       "E. Coronado-Barrientos and G. Indalecio and A.
                 Garc{\'\i}a-Loureiro",
  title =        "{AXC}: a new format to perform the {SpMV} oriented to
                 {Intel Xeon Phi} architecture in {OpenCL}",
  journal =      j-CCPE,
  volume =       "31",
  number =       "1",
  pages =        "e4864:1--e4864:??",
  day =          "10",
  month =        jan,
  year =         "2019",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4864",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Thu Mar 28 08:07:54 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "31 July 2018",
}

@Article{Crivellini:2019:OPS,
  author =       "Andrea Crivellini and Matteo Franciolini",
  title =        "{OpenMP} Parallelization Strategies for a
                 Discontinuous {Galerkin} Solver",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "47",
  number =       "5--6",
  pages =        "838--873",
  month =        dec,
  year =         "2019",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-018-0589-3",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Jul 25 06:58:52 MDT 2020",
  bibsource =    "http://link.springer.com/journal/10766/47/5;
                 http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://link.springer.com/article/10.1007/s10766-018-0589-3",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Daberdaku:2019:ACT,
  author =       "Sebastian Daberdaku",
  title =        "Accelerating the computation of triangulated molecular
                 surfaces with {OpenMP}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "75",
  number =       "7",
  pages =        "3426--3470",
  month =        jul,
  year =         "2019",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-019-02803-y",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Thu Oct 10 15:31:20 MDT 2019",
  bibsource =    "http://link.springer.com/journal/11227/75/7;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Dalcin:2019:FPM,
  author =       "Lisandro Dalcin and Mikael Mortensen and David E.
                 Keyes",
  title =        "Fast parallel multidimensional {FFT} using advanced
                 {MPI}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "128",
  number =       "??",
  pages =        "137--150",
  month =        jun,
  year =         "2019",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1016/j.jpdc.2019.02.006",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Mon May 20 18:06:40 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S074373151830306X",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Deng:2019:CBV,
  author =       "Y. Deng and T. Li and Y. Luo and X. Zhao",
  title =        "{CUDA}-Based Volume Rendering and Inspection for
                 Time-Varying Ultrasonic Testing Datasets",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "21",
  number =       "5",
  pages =        "76--86",
  month =        sep # "\slash " # oct,
  year =         "2019",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2018.2875319",
  ISSN =         "1521-9615 (print), 1558-366X (electronic)",
  ISSN-L =       "1521-9615",
  bibdate =      "Mon Aug 19 06:40:58 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "See corrections \cite{Deng:2020:CCB}.",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
  keywords =     "Acoustics; Data visualization; Image color analysis;
                 Real-time systems; Rendering (computer graphics);
                 Three-dimensional displays; Transfer functions",
}

@Article{Denis:2019:SPT,
  author =       {Alexandre Denis and Julien Jaeger and
                 Emmanuel Jeannot and Marc P{\'e}rache and
                 Hugo Taboada},
  title =        {Study on progress threads placement and
                 dedicated cores for overlapping {MPI}
                 nonblocking collectives on manycore processor},
  journal =      j-IJHPCA,
  volume =       {33},
  number =       {6},
  pages =        {1240--1254},
  day =          {1},
  month =        nov,
  year =         {2019},
  CODEN =        {IHPCFL},
  DOI =          {https://doi.org/10.1177/1094342019860184},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Wed Oct 9 14:35:54 MDT 2019},
  bibsource =    {http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {https://journals.sagepub.com/doi/full/10.1177/1094342019860184},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance
                 Computing Applications},
  journal-URL =  {https://journals.sagepub.com/home/hpc},
}

@Article{Deveci:2019:GMT,
  author =       {M. Deveci and K. D. Devine and K. Pedretti and
                 M. A. Taylor and S. Rajamanickam and
                 {\"U}. V. {\c{C}}ataly{\"u}rek},
  title =        {Geometric Mapping of Tasks to Processors on
                 Parallel Computers with Mesh or Torus Networks},
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       {30},
  number =       {9},
  pages =        {2018--2032},
  month =        sep,
  year =         {2019},
  CODEN =        {ITDSEO},
  DOI =          {https://doi.org/10.1109/TPDS.2019.2900043},
  ISSN =         {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L =       {1045-9219},
  bibdate =      {Fri Aug 30 06:09:58 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Transactions on Parallel and
                 Distributed Systems},
  journal-URL =  {http://www.computer.org/portal/web/csdl/transactions/tpds},
  keywords =     {algorithmic optimizations; application program
                 interfaces; Bandwidth; communication interdependence;
                 contiguous allocation; contiguous block; Cray XK7;
                 E3SM/HOMME; finite difference methods; geometric
                 mapping; geometric partitioning; geometric partitioning
                 algorithm; geometric proximity; IBM BlueGene/Q; jagged
                 partitioning; load balancing; Measurement; mesh
                 networks; message passing; MiniGhost default mapping;
                 MPI tasks; multiprocessing systems; Network topology;
                 noncontiguous allocations; optimisation; parallel
                 applications; parallel computers; parallel machines;
                 Partitioning algorithms; processors; Program
                 processors; recursive bisection; resource allocation;
                 Resource management; sparse allocation; sparse node
                 allocation; spatial partitioning; structured finite
                 difference mini-application; Task analysis; Task
                 mapping; torus networks},
}

@Article{Diaz:2019:AOO,
  author =       {Jose Monsalve Diaz and Kyle Friedline and
                 Swaroop Pophale and Oscar Hernandez and
                 David E. Bernholdt and Sunita Chandrasekaran},
  title =        {Analysis of {OpenMP 4.5} Offloading in Implementations:
                 Correctness and Overhead},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {89},
  number =       {??},
  pages =        {Article 102546},
  month =        nov,
  year =         {2019},
  CODEN =        {PACOEJ},
  DOI =          {https://doi.org/10.1016/j.parco.2019.102546},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Mon Mar 29 11:35:58 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167819119301371},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Dieguez:2019:TPR,
  author =       {Adri{\'a}n P. Di{\'e}guez and
                 Margarita Amor and Ram{\'o}n Doallo},
  title =        {Tree Partitioning Reduction: A New Parallel
                 Partition Method for Solving Tridiagonal Systems},
  journal =      j-TOMS,
  volume =       {45},
  number =       {3},
  pages =        {31:1--31:26},
  month =        aug,
  year =         {2019},
  CODEN =        {ACMSCU},
  DOI =          {https://doi.org/10.1145/3328731},
  ISSN =         {0098-3500 (print), 1557-7295 (electronic)},
  ISSN-L =       {0098-3500},
  bibdate =      {Tue Sep 3 17:49:22 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib},
  URL =          {https://dl.acm.org/citation.cfm?id=3328731},
  abstract =     {Solving tridiagonal linear-equation systems is a
                 fundamental computing kernel in a wide range of
                 scientific and engineering applications, and its
                 computation can be modeled with parallel algorithms.
                 These parallel solvers are typically designed to
                 compute problems whose data fit in a common
                 shared-memory space where all the cores taking part in
                 the computation have access. However, when the problem
                 size is large, data cannot be entirely stored in the
                 common shared-memory space, and a high number of
                 high-latency communications are performed. One
                 alternative is to partition the problem among different
                 memory spaces. At this point, conventional parallel
                 algorithms do not facilitate the partition of
                 computation in independent tiles, since each reduction
                 depends on equations that may be in different tiles.
                 This article proposes an algorithm based on a tree
                 reduction, called the Tree Partitioning Reduction (TPR)
                 method, which partitions the problem into independent
                 slices that can be partially computed in parallel
                 within different common shared-memory spaces. The TPR
                 method can be implemented for any parallel and
                 distributed programming paradigm. Furthermore, in this
                 work, TPR is efficiently implemented for CUDA GPUs to
                 solve large size problems, providing highly competitive
                 performance results with respect to existing packages,
                 being, on average, 22.03$ \times $ faster than
                 CUSPARSE.},
  acknowledgement = ack-nhfb,
  articleno =    {31},
  fjournal =     {ACM Transactions on Mathematical Software (TOMS)},
  journal-URL =  {http://dl.acm.org/pub.cfm?id=J782},
}

@Article{Diep:2019:TSS,
  author =       {Thanh-Dang Diep and Kien Trung Pham and
                 Karl F{\"u}rlinger and Nam Thoai},
  title =        {A time-stamping system to detect memory
                 consistency errors in {MPI} one-sided applications},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {86},
  number =       {??},
  pages =        {36--44},
  month =        aug,
  year =         {2019},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Mon Oct 14 16:20:01 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167819118303235},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Dongarra:2019:PPL,
  author =       "Jack Dongarra and Mark Gates and Azzam Haidar and
                 Jakub Kurzak and Piotr Luszczek and Panruo Wu and
                 Ichitaro Yamazaki and Asim Yarkhan and Maksims
                 Abalenkovs and Negin Bagherpour and Sven Hammarling and
                 Jakub {\v{S}}{\'\i}stek and David Stevens and Mawussi
                 Zounon and Samuel D. Relton",
  title =        "{PLASMA}: Parallel Linear Algebra Software for
                 Multicore Using {OpenMP}",
  journal =      j-TOMS,
  volume =       "45",
  number =       "2",
  pages =        "16:1--16:35",
  month =        apr,
  year =         "2019",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3264491",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon May 6 18:23:42 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/citation.cfm?id=3264491",
  abstract =     "The recent version of the Parallel Linear Algebra
                 Software for Multicore Architectures (PLASMA) library
                 is based on tasks with dependencies from the OpenMP
                 standard. The main functionality of the library is
                 presented. Extensive benchmarks are targeted on three
                 recent multicore and manycore architectures, namely, an
                 Intel Xeon, Intel Xeon Phi, and IBM POWER 8
                 processors.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Doulis:2019:CMP,
  author =       {Georgios Doulis and J{\"o}rg Frauendiener and
                 Chris Stevens and Ben Whale},
  title =        {{COFFEE} --- an {MPI}-parallelized {Python}
                 package for the numerical evolution of
                 differential equations},
  journal =      j-SOFTWAREX,
  volume =       {10},
  number =       {??},
  pages =        {Article 100283},
  month =        jul # "\slash " # dec,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1016/j.softx.2019.100283},
  ISSN =         {2352-7110},
  ISSN-L =       {2352-7110},
  bibdate =      {Fri Apr 9 16:04:36 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/python.bib;
                 http://www.math.utah.edu/pub/tex/bib/softwarex.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S2352711019300950},
  acknowledgement = ack-nhfb,
  fjournal =     {SoftwareX},
  journal-URL =  {https://www.sciencedirect.com/journal/softwarex/issues},
}

@Article{Edmonds:2019:HAS,
  author =       "Mark Edmonds and Tanvir Atahary and Scott Douglass and
                 Tarek Taha",
  title =        "Hardware Accelerated Semantic Declarative Memory
                 Systems through {CUDA} and {MapReduce}",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "30",
  number =       "3",
  pages =        "601--614",
  month =        mar,
  year =         "2019",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2018.2866848",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Feb 14 06:19:14 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.computer.org/csdl/trans/td/2019/03/08444694-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Faict:2019:MGI,
  author =       "Thomas Faict and Erik H. D'Hollander and Bart
                 Goossens",
  title =        "Mapping a Guided Image Filter on the {HARP}
                 Reconfigurable Architecture Using {OpenCL}",
  journal =      j-ALGORITHMS-BASEL,
  volume =       "12",
  number =       "8",
  month =        aug,
  year =         "2019",
  CODEN =        "ALGOCH",
  DOI =          "https://doi.org/10.3390/a12080149",
  ISSN =         "1999-4893 (electronic)",
  ISSN-L =       "1999-4893",
  bibdate =      "Thu May 28 08:40:45 MDT 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/algorithms.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.mdpi.com/1999-4893/12/8/149",
  acknowledgement = ack-nhfb,
  articleno =    "149",
  fjournal =     "Algorithms (Basel)",
  journal-URL =  "https://www.mdpi.com/",
  pagecount =    "??",
}

@Article{Fan:2019:BPA,
  author =       {Xing Fan and Oliver Sinnen and Nasser Giacaman},
  title =        {Balancing parallelization and asynchronization
                 in event-driven programs with {OpenMP}},
  journal =      j-CCPE,
  volume =       {31},
  number =       {4},
  pages =        {e4959:1--e4959:??},
  day =          {25},
  month =        feb,
  year =         {2019},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.4959},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Thu Mar 28 08:07:55 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {21 September 2018},
}

@Article{Fan:2019:SAO,
  author =       "Xing Fan and Oliver Sinnen and Nasser Giacaman",
  title =        "Supporting asynchronization in {OpenMP} for
                 event-driven programming",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "82",
  number =       "??",
  pages =        "57--74",
  month =        feb,
  year =         "2019",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2018.03.008",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Tue Mar 12 06:47:09 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819118300838",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Figueiredo:2019:MOP,
  author =       {Marco Antonio C. de {Figueiredo, Jr.} and
                 Edans F. de Oliveira Sandes and
                 Genaina N. Rodrigues and George L. M. Teodoro and
                 Alba Cristina M. A. de Melo},
  title =        {{MASA-OpenCL}: Parallel pruned comparison of
                 long {DNA} sequences with {OpenCL}},
  journal =      j-CCPE,
  volume =       {31},
  number =       {11},
  pages =        {e5039:1--e5039:??},
  day =          {10},
  month =        jun,
  year =         {2019},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.5039},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Sat Oct 12 11:00:02 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {21 October 2018},
}

@Article{Fujita:2019:EIM,
  author =       {Hajime Fujita and Chongxiao Cao and
                 Sayantan Sur and Charles Archer and
                 Erik Paulson and Maria Garzaran},
  title =        {Efficient implementation of {MPI-3} {RMA}
                 over {openFabrics} interfaces},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {87},
  number =       {??},
  pages =        {1--10},
  month =        sep,
  year =         {2019},
  CODEN =        {PACOEJ},
  DOI =          {https://doi.org/10.1016/j.parco.2019.04.008},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Mon Oct 14 16:20:02 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167819118303843},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Gittens:2019:AAS,
  author =       {Alex Gittens and Kai Rothauge and Shusen Wang and
                 Michael W. Mahoney and Jey Kottalam and
                 Lisa Gerhardt and Prabhat and
                 Michael Ringenburg and Kristyn Maschhoff},
  title =        {{Alchemist}: an {Apache Spark}
                 $ \leftrightarrow $ {MPI} interface},
  journal =      j-CCPE,
  volume =       {31},
  number =       {16},
  pages =        {e5026:1--e5026:??},
  day =          {25},
  month =        aug,
  year =         {2019},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.5026},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Sat Oct 12 11:00:04 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {12 November 2018},
}

@Article{Gloster:2019:CBP,
  author =       "Andrew Gloster and Lennon {{\'O} N{\'a}raigh} and
                 Khang Ee Pang",
  title =        "{cuPentBatch} --- a batched pentadiagonal solver for
                 {NVIDIA} {GPUs}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "241",
  number =       "??",
  pages =        "113--121",
  month =        aug,
  year =         "2019",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2019.03.016",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Tue May 14 10:01:33 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465519300979",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Gloster:2019:CCF,
  author =       {Andrew Gloster and Lennon {{\'O} N{\'a}raigh}},
  title =        {{cuSten} --- {CUDA} finite difference and
                 stencil library},
  journal =      j-SOFTWAREX,
  volume =       {10},
  number =       {??},
  pages =        {Article 100337},
  month =        jul # "\slash " # dec,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1016/j.softx.2019.100337},
  ISSN =         {2352-7110},
  ISSN-L =       {2352-7110},
  bibdate =      {Fri Apr 9 16:04:36 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/softwarex.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S2352711019300561},
  acknowledgement = ack-nhfb,
  fjournal =     {SoftwareX},
  journal-URL =  {https://www.sciencedirect.com/journal/softwarex/issues},
}

@Article{Gropp:2019:GEI,
  author =       {William Gropp and Rajeev Thakur},
  title =        {{Guest Editor}'s introduction: Special issue on
                 best papers from {EuroMPI\slash USA 2017}},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {84},
  number =       {??},
  pages =        {62--62},
  month =        may,
  year =         {2019},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Mon Oct 14 16:20:01 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167819119300560},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Gropp:2019:UNS,
  author =       {William D. Gropp},
  title =        {Using node and socket information to implement
                 {MPI} {Cartesian} topologies},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {85},
  number =       {??},
  pages =        {98--108},
  month =        jul,
  year =         {2019},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Mon Oct 14 16:20:01 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167819118303156},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Gueunet:2019:TBA,
  author =       {C. Gueunet and P. Fortin and J. Jomier and J. Tierny},
  title =        {Task-Based Augmented Contour Trees with
                 {Fibonacci} Heaps},
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       {30},
  number =       {8},
  pages =        {1889--1905},
  month =        aug,
  year =         {2019},
  CODEN =        {ITDSEO},
  DOI =          {https://doi.org/10.1109/TPDS.2019.2898436},
  ISSN =         {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L =       {1045-9219},
  bibdate =      {Fri Aug 30 06:09:58 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/fibquart.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Transactions on Parallel and
                 Distributed Systems},
  journal-URL =  {http://www.computer.org/portal/web/csdl/transactions/tpds},
  keywords =     {computation procedure; contour tree based
                 applications; Data analysis; data segmentation
                 applications; data structures; Data structures; data
                 visualisation; Data visualization; fast shared memory;
                 Fibonacci heaps; independent local tasks; intermediate
                 data structures; join split trees; multi-core
                 architecture; multi-threading; multicore computation;
                 OpenMP task runtime; parallel algorithm; parallel
                 algorithms; Parallel algorithms; parallel thanks;
                 Runtime; Scientific visualization; Task analysis; task
                 parallelism; task-based augmented contour trees;
                 topological data analysis; tree algorithm; trees
                 (mathematics)},
}

@Article{Hajihassani:2019:FAI,
  author =       {O. Hajihassani and S. K. Monfared and
                 S. H. Khasteh and S. Gorgin},
  title =        {Fast {AES} Implementation: A High-Throughput
                 Bitsliced Approach},
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       {30},
  number =       {10},
  pages =        {2211--2222},
  month =        oct,
  year =         {2019},
  CODEN =        {ITDSEO},
  DOI =          {https://doi.org/10.1109/TPDS.2019.2911278},
  ISSN =         {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L =       {1045-9219},
  bibdate =      {Thu Dec 19 09:20:35 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Transactions on Parallel and
                 Distributed Systems},
  journal-URL =  {http://www.computer.org/portal/web/csdl/transactions/tpds},
  keywords =     {AES; byte-wise operations; computing process;
                 cryptography; CTR; CUDA; CUDA-enabled GPU; Data models;
                 data representation; data representation scheme; ECB;
                 Encryption; encryption throughput; fast AES
                 implementation; GPU; Graphics processing units;
                 high-performance; high-throughput bitsliced AES
                 implementation; high-throughput bitsliced approach;
                 logic circuits; parallel architectures; parallelization
                 capability; parallelization unit; S-box logic circuit;
                 ShiftRows; Standards; substitute bytes stage; Table
                 lookup; Throughput},
}

@Article{Hermanns:2019:MEI,
  author =       {Marc-Andr{\'e} Hermanns and Nathan T. Hjelm and
                 Michael Knobloch and Kathryn Mohror and
                 Martin Schulz},
  title =        {The {MPI\_T} events interface: an early evaluation
                 and overview of the interface},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {85},
  number =       {??},
  pages =        {119--130},
  month =        jul,
  year =         {2019},
  CODEN =        {PACOEJ},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Mon Oct 14 16:20:01 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167819118303314},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Holmes:2019:PPE,
  author =       {Daniel J. Holmes and Bradley Morgan and
                 Anthony Skjellum and Purushotham V. Bangalore and
                 Srinivas Sridharan},
  title =        {Planning for performance: Enhancing achievable
                 performance for {MPI} through persistent
                 collective operations},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {81},
  number =       {??},
  pages =        {32--57},
  month =        jan,
  year =         {2019},
  CODEN =        {PACOEJ},
  DOI =          {https://doi.org/10.1016/j.parco.2018.08.001},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Mon Jan 7 15:25:21 MST 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167819118302412},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Huckelheim:2019:RMA,
  author =       {Jan H{\"u}ckelheim and Paul Hovland and
                 Michelle Mills Strout and
                 Jens-Dominik M{\"u}ller},
  title =        {Reverse-mode algorithmic differentiation of
                 an {OpenMP}-parallel compressible flow solver},
  journal =      j-IJHPCA,
  volume =       {33},
  number =       {1},
  pages =        {140--154},
  day =          {1},
  month =        jan,
  year =         {2019},
  CODEN =        {IHPCFL},
  DOI =          {https://doi.org/10.1177/1094342017712060},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Wed Oct 9 14:35:53 MDT 2019},
  bibsource =    {http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {https://journals.sagepub.com/doi/full/10.1177/1094342017712060},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance
                 Computing Applications},
  journal-URL =  {https://journals.sagepub.com/home/hpc},
}

@Article{Izadpanah:2019:PAP,
  author =       "Ramin Izadpanah and Benjamin A. Allan and Damian
                 Dechev and Jim Brandt",
  title =        "Production Application Performance Data Streaming for
                 System Monitoring",
  journal =      j-TOMPECS,
  volume =       "4",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3319498",
  ISSN =         "2376-3639 (print), 2376-3647 (electronic)",
  ISSN-L =       "2376-3639",
  bibdate =      "Sat Sep 21 07:21:17 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tompecs.bib",
  URL =          "https://dl.acm.org/citation.cfm?id=3319498",
  abstract =     "In this article, we present an approach to streaming
                 collection of application performance data. Practical
                 application performance tuning and troubleshooting in
                 production high-performance computing (HPC)
                 environments requires an understanding of how
                 applications interact with the platform, including (but
                 not limited to) parallel programming libraries such as
                 Message Passing Interface (MPI). Several profiling and
                 tracing tools exist that collect heavy runtime data
                 traces either in memory (released only at application
                 exit) or on a file system (imposing an I/O load that
                 may interfere with the performance being measured).
                 Although these approaches are beneficial in development
                 stages and post-run analysis, a systemwide and
                 low-overhead method is required to monitor deployed
                 applications continuously. This method must be able to
                 collect information at both the application and system
                 levels to yield a complete performance picture. In our
                 approach, an application profiler collects application
                 event counters. A sampler uses an efficient
                 inter-process communication method to periodically
                 extract the application counters and stream them into
                 an infrastructure for performance data collection. We
                 implement a tool-set based on our approach and
                 integrate it with the Lightweight Distributed Metric
                 Service (LDMS) system, a monitoring system used on
                 large-scale computational platforms. LDMS provides the
                 infrastructure to create and gather streams of
                 performance data in a low overhead manner. We
                 demonstrate our approach using applications implemented
                 with MPI, as it is one of the most common standards for
                 the development of large-scale scientific applications.
                 We utilize our tool-set to study the impact of our
                 approach on an open source HPC application, Nalu. Our
                 tool-set enables us to efficiently identify patterns in
                 the behavior of the application without source-level
                 knowledge. We leverage LDMS to collect system-level
                 performance data and explore the correlation between
                 the system and application events. Also, we demonstrate
                 how our tool-set can help detect anomalies with a low
                 latency. We run tests on two different architectures: a
                 system enabled with Intel Xeon Phi and another system
                 equipped with Intel Xeon processor. Our overhead study
                 shows our method imposes at most 0.5\% CPU usage
                 overhead on the application in realistic deployment
                 scenarios.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Modeling and Performance
                 Evaluation of Computing Systems (TOMPECS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J1525",
}

@article{Kallenborn:2019:MPC,
  author = {Felix Kallenborn and Christian Hundt and Sebastian
    B{\"o}ser and Bertil Schmidt},
  title = {Massively parallel computation of atmospheric neutrino
    oscillations on {CUDA}-enabled accelerators},
  journal = j-COMP-PHYS-COMM,
  volume = {234},
  number = {??},
  pages = {235--244},
  month = jan,
  year = {2019},
  coden = {CPHCBZ},
  doi = {https://doi.org/10.1016/j.cpc.2018.07.022},
  issn = {0010-4655 (print), 1879-2944 (electronic)},
  issn-l = {0010-4655},
  bibdate = {Tue Oct 16 18:11:50 MDT 2018},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0010465518302790},
  acknowledgement = ack-nhfb,
  fjournal = {Computer Physics Communications},
  journal-url = {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Kang:2019:SAM,
  author = {Qiao Kang and Jesper Larsson Tr{\"a}ff and Reda
    Al-Bahrani and Ankit Agrawal and Alok Choudhary and
    Wei-keng Liao},
  title = {Scalable Algorithms for {MPI} Intergroup {Allgather}
    and {Allgatherv}},
  journal = j-PARALLEL-COMPUTING,
  volume = {85},
  number = {??},
  pages = {220--230},
  month = jul,
  year = {2019},
  coden = {PACOEJ},
  doi = {https://doi.org/10.1016/j.parco.2019.04.015},
  issn = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l = {0167-8191},
  bibdate = {Mon Oct 14 16:20:01 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S016781911830320X},
  acknowledgement = ack-nhfb,
  fjournal = {Parallel Computing},
  journal-url = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Knap:2019:PEU,
  author = {Marcin Knap and Pawe{\l} Czarnul},
  title = {Performance evaluation of Unified Memory with
    prefetching and oversubscription for selected parallel
    {CUDA} applications on {NVIDIA} {Pascal} and {Volta}
    {GPUs}},
  journal = j-J-SUPERCOMPUTING,
  volume = {75},
  number = {11},
  pages = {7625--7645},
  month = nov,
  year = {2019},
  coden = {JOSUED},
  doi = {https://doi.org/10.1007/s11227-019-02966-8},
  issn = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l = {0920-8542},
  bibdate = {Sat Jul 25 07:17:51 MDT 2020},
  bibsource = {http://link.springer.com/journal/11227/75/11;
    http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/content/pdf/10.1007/s11227-019-02966-8.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {The Journal of Supercomputing},
  journal-url = {http://link.springer.com/journal/11227},
}

@article{Knight:2019:TES,
  author = {Louise Knight and Polona Stefanic and Matej Cigale and
    Andrew C. Jones and Ian Taylor},
  title = {Towards extending the {SWITCH} platform for
    time-critical, cloud-based {CUDA} applications: Job
    scheduling parameters influencing performance},
  journal = j-FUT-GEN-COMP-SYS,
  volume = {100},
  number = {??},
  pages = {542--556},
  month = nov,
  year = {2019},
  coden = {FGSEVI},
  doi = {https://doi.org/10.1016/j.future.2019.05.039},
  issn = {0167-739X (print), 1872-7115 (electronic)},
  issn-l = {0167-739X},
  bibdate = {Mon Feb 10 12:55:01 MST 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X18311014},
  acknowledgement = ack-nhfb,
  fjournal = {Future Generation Computer Systems},
  journal-url = {http://www.sciencedirect.com/science/journal/0167739X},
}

@article{Kronbichler:2019:FMF,
  author = {Martin Kronbichler and Katharina Kormann},
  title = {Fast Matrix-Free Evaluation of Discontinuous
    {Galerkin} Finite Element Operators},
  journal = j-TOMS,
  volume = {45},
  number = {3},
  pages = {29:1--29:40},
  month = aug,
  year = {2019},
  coden = {ACMSCU},
  doi = {https://doi.org/10.1145/3325864},
  issn = {0098-3500 (print), 1557-7295 (electronic)},
  issn-l = {0098-3500},
  bibdate = {Tue Sep 3 17:49:22 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/toms.bib},
  url = {https://dl.acm.org/citation.cfm?id=3325864},
  abstract = {We present an algorithmic framework for matrix-free
    evaluation of discontinuous Galerkin finite element
    operators. It relies on fast quadrature with sum
    factorization on quadrilateral and hexahedral meshes,
    targeting general weak forms of linear and nonlinear
    partial differential equations. Different algorithms
    and data structures are compared in an in-depth
    performance analysis. The implementations of the local
    integrals are optimized by vectorization over several
    cells and faces and an even-odd decomposition of the
    one-dimensional interpolations. Up to 60\% of the
    arithmetic peak on Intel Haswell, Broadwell, and
    Knights Landing processors is reached when running from
    caches and up to 40\% of peak when also considering the
    access to vectors from main memory. On 2$ \times $14
    Broadwell cores, the throughput is up to 2.2 billion
    unknowns per second for the 3D Laplacian and up to 4
    billion unknowns per second for the 3D advection on
    affine geometries, close to a simple copy operation at
    4.7 billion unknowns per second. Our experiments show
    that MPI ghost exchange has a considerable impact on
    performance and we present strategies to mitigate this
    effect. Finally, various options for evaluating
    geometry terms and their performance are discussed. Our
    implementations are publicly available through the
    deal.II finite element library.},
  acknowledgement = ack-nhfb,
  articleno = {29},
  fjournal = {ACM Transactions on Mathematical Software (TOMS)},
  journal-url = {http://dl.acm.org/pub.cfm?id=J782},
}

@Article{Kumar:2019:FOP,
  author =       "Ramavarmaraja Kishor Kumar and Vladimir Lon{\v{c}}ar
                 and Paulsamy Muruganandam and Sadhan K. Adhikari and
                 Antun Bala{\v{z}}",
  title =        "{C} and {Fortran} {OpenMP} programs for rotating
                 {Bose--Einstein} condensates",
  journal =      j-COMP-PHYS-COMM,
  volume =       "240",
  number =       "??",
  pages =        "74--82",
  month =        jul,
  year =         "2019",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2019.03.004",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Fri Jun 14 08:12:51 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465519300827",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@techreport{Laguna:2019:GPD,
  author = {Ignacio Laguna and Paul C. Wood and Ranvijay Singh and
    Saurabh Bagchi},
  title = {{GPUMixer}: Performance-Driven Floating-Point Tuning
    for {GPU} Scientific Applications},
  type = {Report},
  institution = {Lawrence Livermore National Laboratory},
  address = {Livermore CA 94550, USA},
  year = {2019},
  bibdate = {Tue Aug 06 05:54:23 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/fparith.bib;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://lagunaresearch.org/docs/isc-2019.pdf;
    https://www.hpcwire.com/2019/08/05/llnl-purdue-researchers-harness-gpu-mixed-precision-for-accuracy-performance-tradeoff/},
  abstract = {We present GPUMixer, a tool to perform mixed-precision
    floating-point tuning on scientific GPU applications.
    While precision tuning techniques are available, they
    are designed for serial programs and are
    accuracy-driven, i.e., they consider configurations
    that satisfy accuracy constraints, but these
    configurations may degrade performance. GPUMixer, in
    contrast, presents a performance-driven approach for
    tuning. We introduce a novel static analysis that finds
    Fast Imprecise Sets (FISets), sets of operations on low
    precision that minimize type conversions, which often
    yield performance speedups. To estimate the relative
    error introduced by GPU mixed-precision, we propose
    shadow computations analysis for GPUs, the first of
    this class for multi-threaded applications. GPUMixer
    obtains performance improvements of up to 46.4\% of the
    ideal speedup in comparison to only 20.7\% found by
    state-of-the-art methods.},
  acknowledgement = ack-nhfb,
  remark = {Best paper award at the 33rd ISC High Performance
    conference held June 16--20, 2019.},
}

@article{Levy:2019:USE,
  author = {Scott Levy and Kurt B. Ferreira and Whit Schonbein and
    Ryan E. Grant and Matthew G. F. Dosanjh},
  title = {Using simulation to examine the effect of {MPI}
    message matching costs on application performance},
  journal = j-PARALLEL-COMPUTING,
  volume = {84},
  number = {??},
  pages = {63--74},
  month = may,
  year = {2019},
  coden = {PACOEJ},
  issn = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l = {0167-8191},
  bibdate = {Mon Oct 14 16:20:01 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0167819118303272},
  acknowledgement = ack-nhfb,
  fjournal = {Parallel Computing},
  journal-url = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Li:2019:TBH,
  author = {Bing Li and Mengjie Mao and Xiaoxiao Liu and Tao Liu
    and Zihao Liu and Wujie Wen and Yiran Chen and Hai
    (Helen) Li},
  title = {Thread Batching for High-performance Energy-efficient
    {GPU} Memory Design},
  journal = j-JETC,
  volume = {15},
  number = {4},
  pages = {39:1--39:??},
  month = dec,
  year = {2019},
  coden = {????},
  doi = {https://doi.org/10.1145/3330152},
  issn = {1550-4832},
  bibdate = {Tue Dec 17 07:50:24 MST 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jetc.bib;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3330152},
  abstract = {Massive multi-threading in GPU imposes tremendous
    pressure on memory subsystems. Due to rapid growth in
    thread-level parallelism of GPU and slowly improved
    peak memory bandwidth, memory becomes a bottleneck of
    GPU's performance and energy efficiency. In this
    article, we propose an integrated architectural scheme
    to optimize the memory accesses and therefore boost the
    performance and energy efficiency of GPU. First, we
    propose a thread batch enabled memory partitioning
    (TEMP) to improve GPU memory access parallelism. In
    particular, TEMP groups multiple thread blocks that
    share the same set of pages into a thread batch and
    applies a page coloring mechanism to bound each stream
    multiprocessor (SM) to the dedicated memory banks.
    After that, TEMP dispatches the thread batch to an SM
    to ensure high-parallel memory-access streaming from
    the different thread blocks. Second, a thread
    batch-aware scheduling (TBAS) scheme is introduced to
    improve the GPU memory access locality and to reduce
    the contention on memory controllers and
    interconnection networks. Experimental results show
    that the integration of TEMP and TBAS can achieve up to
    10.3\% performance improvement and 11.3\% DRAM energy
    reduction across diverse GPU applications. We also
    evaluate the performance interference of the mixed
    CPU+GPU workloads when they are run on a heterogeneous
    system that employs our proposed schemes. Our results
    show that a simple solution can effectively ensure the
    efficient execution of both GPU and CPU applications.},
  acknowledgement = ack-nhfb,
  articleno = {39},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Lima:2019:PEA,
  author = {Jo{\~a}o Vicente Ferreira Lima and Issam Ra{\"\i}s and
    Laurent Lef{\`e}vre and Thierry Gautier},
  title = {Performance and energy analysis of {OpenMP} runtime
    systems with dense linear algebra algorithms},
  journal = j-IJHPCA,
  volume = {33},
  number = {3},
  pages = {431--443},
  day = {1},
  month = may,
  year = {2019},
  coden = {IHPCFL},
  doi = {https://doi.org/10.1177/1094342018792079},
  issn = {1094-3420 (print), 1741-2846 (electronic)},
  issn-l = {1094-3420},
  bibdate = {Wed Oct 9 14:35:53 MDT 2019},
  bibsource = {http://hpc.sagepub.com/;
    http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {https://journals.sagepub.com/doi/full/10.1177/1094342018792079},
  acknowledgement = ack-nhfb,
  fjournal = {International Journal of High Performance Computing
    Applications},
  journal-url = {https://journals.sagepub.com/home/hpc},
}

@Article{Liu:2019:MML,
  author =       "Qixiao Liu and Zhifeng Chen and Zhibin Yu",
  title =        "{MiC}: Multi-level Characterization and Optimization
                 of {GPGPU} Kernels",
  journal =      j-JETC,
  volume =       "15",
  number =       "3",
  pages =        "25:1--25:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3304108",
  ISSN =         "1550-4832",
  bibdate =      "Fri Nov 29 16:06:01 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jetc.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3304108",
  abstract =     "Graphics processing units (GPUs) have enjoyed
                 increasing popularity in recent years, which benefits
                 from, for example, general-purpose GPU (GPGPU) for
                 parallel programs and new computing paradigms, such as
                 the Internet of Things (IoT). GPUs hold great potential
                 in providing effective solutions for big data analytics
                 while the demands for processing large quantities of
                 data in real time are also increasing. However, the
                 pervasive presence of GPUs on mobile devices presents
                 great challenges for GPGPU, mainly because GPGPU
                 integrates a large amount of processor arrays and
                 concurrent executing threads (up to hundreds of
                 thousands). In particular, the root causes of
                 performance loss in a GPGPU program can not be revealed
                 in detail by current approaches. In this article, we
                 propose MiC (Multi-level Characterization), a framework
                 that comprehensively characterizes GPGPU kernels at the
                 instruction, Basic Block (BBL), and thread levels.
                 Specifically, we devise Instruction Vectors (IV) and
                 Basic Blocks Vectors (BBV), a Thread Similarity Matrix
                 (TSM), and a Divergence Flow Statistics Graph (DFSG) to
                 profile information in each level. We use MiC to
                 provide insights into GPGPU kernels through the
                 characterizations of 34 kernels from popular GPGPU
                 benchmark suites such as Compute Unified Device
                 Architecture (CUDA) Software Development Kit (SDK),
                 Rodinia, and Parboil. In comparison with Central
                 Processing Unit (CPU) workloads, we conclude the key
                 findings as follows: (1) There are comparable
                 Instruction-Level Parallelism (ILP); (2) The BBL count
                 is significantly smaller than CPU workloads---only 22.8
                 on average; (3) The dynamic instruction count per
                 thread varies from dozens to tens of thousands and it
                 is extremely small compared to CPU benchmarks; (4) The
                 Pareto principle (also called 90/10 rule) does not
                 apply to GPGPU kernels while it pervasively exists in
                 CPU programs; (5) The loop patterns are dramatically
                 different from those in CPU workloads; (6) The branch
                 ratio is lower than that of CPU programs but higher
                 than pure GPU workloads. In addition, we have also
                 shown how TSM and DFSG are used to characterize the
                 branch divergence in a visual way, to enable the
                 analysis of thread behavior in GPGPU programs. In
                 addition, we show an optimization case for a GPGPU
                 kernel from the bottleneck identified through its
                 characterization result, which improves 16.8\%
                 performance.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Lopes:2019:FBD,
  author = {Paulo A. C. Lopes and Satyendra Singh Yadav and
    Aleksandar Ilic and Sarat Kumar Patra},
  title = {Fast block distributed {CUDA} implementation of the
    {Hungarian} algorithm},
  journal = j-J-PAR-DIST-COMP,
  volume = {130},
  number = {??},
  pages = {50--62},
  month = aug,
  year = {2019},
  coden = {JPDCER},
  doi = {https://doi.org/10.1016/j.jpdc.2019.03.014},
  issn = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l = {0743-7315},
  bibdate = {Mon May 20 18:06:40 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0743731519302254},
  acknowledgement = ack-nhfb,
  fjournal = {Journal of Parallel and Distributed Computing},
  journal-url = {http://www.sciencedirect.com/science/journal/07437315},
}

@article{Lopez-Gomez:2019:ESP,
  author = {Javier L{\'o}pez-G{\'o}mez and Javier Fern{\'a}ndez
    Mu{\~n}oz and David del Rio Astorga and Manuel F. Dolz
    and J. Daniel Garcia},
  title = {Exploring stream parallel patterns in distributed
    {MPI} environments},
  journal = j-PARALLEL-COMPUTING,
  volume = {84},
  number = {??},
  pages = {24--36},
  month = may,
  year = {2019},
  coden = {PACOEJ},
  issn = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l = {0167-8191},
  bibdate = {Mon Oct 14 16:20:01 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0167819118303442},
  acknowledgement = ack-nhfb,
  fjournal = {Parallel Computing},
  journal-url = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Lorenzon:2019:ASO,
  author = {A. F. Lorenzon and C. C. {de Oliveira} and J. D. Souza
    and A. C. S. Beck},
  title = {{Aurora}: Seamless Optimization of {OpenMP}
    Applications},
  journal = j-IEEE-TRANS-PAR-DIST-SYS,
  volume = {30},
  number = {5},
  pages = {1007--1021},
  month = may,
  year = {2019},
  coden = {ITDSEO},
  doi = {https://doi.org/10.1109/TPDS.2018.2872992},
  issn = {1045-9219 (print), 1558-2183 (electronic)},
  issn-l = {1045-9219},
  bibdate = {Fri Aug 30 06:09:58 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Parallel and Distributed
    Systems},
  journal-url = {http://www.computer.org/portal/web/csdl/transactions/tpds},
  keywords = {application program interfaces; Aurora; code
    transformation; efficiency 91.0 percent; Energy-Delay
    Product; Feedback-Driven Threading; Hardware;
    Instruction sets; Ion radiation effects; Magnetosphere;
    message passing; Message systems; Microarchitecture;
    multicore processors; multiprocessing systems; OpenMP;
    OpenMP applications; OpenMP feature; OpenMP framework;
    optimization; parallel applications; parallel loop
    region; parallel processing; Runtime; runtime
    environments; seamless optimization; software
    developers; standard OpenMP execution; thread-level
    parallelism; Thread-level parallelism exploitation},
}

@article{Losada:2019:LRR,
  author = {Nuria Losada and George Bosilca and Aur{\'e}lien
    Bouteiller and Patricia Gonz{\'a}lez and Mar{\'\i}a J.
    Mart{\'\i}n},
  title = {Local rollback for resilient {MPI} applications with
    application-level checkpointing and message logging},
  journal = j-FUT-GEN-COMP-SYS,
  volume = {91},
  number = {??},
  pages = {450--464},
  month = feb,
  year = {2019},
  coden = {FGSEVI},
  issn = {0167-739X (print), 1872-7115 (electronic)},
  issn-l = {0167-739X},
  bibdate = {Tue Feb 5 08:15:51 MST 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {https://www.sciencedirect.com/science/article/pii/S0167739X18303443},
  acknowledgement = ack-nhfb,
  fjournal = {Future Generation Computer Systems},
  journal-url = {http://www.sciencedirect.com/science/journal/0167739X},
}

@article{Lu:2019:PMM,
  author = {Gangzhao Lu and Weizhe Zhang and Hui He and Laurence
    T. Yang},
  title = {Performance modeling for {MPI} applications with low
    overhead fine-grained profiling},
  journal = j-FUT-GEN-COMP-SYS,
  volume = {90},
  number = {??},
  pages = {317--326},
  month = jan,
  year = {2019},
  coden = {FGSEVI},
  doi = {https://doi.org/10.1016/j.future.2018.08.018},
  issn = {0167-739X (print), 1872-7115 (electronic)},
  issn-l = {0167-739X},
  bibdate = {Tue Sep 18 14:07:59 MDT 2018},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X18308252},
  acknowledgement = ack-nhfb,
  fjournal = {Future Generation Computer Systems},
  journal-url = {http://www.sciencedirect.com/science/journal/0167739X},
}

@article{Mercan:2019:CCH,
  author = {H. Mercan and C. Yilmaz and K. Kaya},
  title = {{CHiP}: A Configurable Hybrid Parallel Covering Array
    Constructor},
  journal = j-IEEE-TRANS-SOFTW-ENG,
  volume = {45},
  number = {12},
  pages = {1270--1291},
  month = dec,
  year = {2019},
  coden = {IESEDJ},
  doi = {https://doi.org/10.1109/TSE.2018.2837759},
  issn = {0098-5589 (print), 1939-3520 (electronic)},
  issn-l = {0098-5589},
  bibdate = {Thu Dec 12 06:35:49 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Software Engineering},
  journal-url = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32},
  keywords = {Benchmark testing; constraint satisfaction problem;
    Covering arrays; CUDA; Graphics processing units;
    graphics processing units; metaheuristic search;
    parallel computing; Parallel processing; Scalability;
    Simulated annealing; Upper bound},
}

@article{Mironov:2019:EMO,
  author = {Vladimir Mironov and Alexander Moskovsky and Michael
    D'Mello and Yuri Alexeev},
  title = {An efficient {MPI\slash OpenMP} parallelization of the
    {Hartree--Fock--Roothaan} method for the first
    generation of {Intel{\reg} Xeon Phi{\TM}} processor
    architecture},
  journal = j-IJHPCA,
  volume = {33},
  number = {1},
  pages = {212--224},
  day = {1},
  month = jan,
  year = {2019},
  coden = {IHPCFL},
  doi = {https://doi.org/10.1177/1094342017732628},
  issn = {1094-3420 (print), 1741-2846 (electronic)},
  issn-l = {1094-3420},
  bibdate = {Wed Oct 9 14:35:53 MDT 2019},
  bibsource = {http://hpc.sagepub.com/;
    http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {https://journals.sagepub.com/doi/full/10.1177/1094342017732628},
  acknowledgement = ack-nhfb,
  fjournal = {International Journal of High Performance Computing
    Applications},
  journal-url = {https://journals.sagepub.com/home/hpc},
}

@article{Oh:2019:HPT,
  author = {S. Oh and N. Park and J. Jang and L. Sael and U.
    Kang},
  title = {High-Performance {Tucker} Factorization on
    Heterogeneous Platforms},
  journal = j-IEEE-TRANS-PAR-DIST-SYS,
  volume = {30},
  number = {10},
  pages = {2237--2248},
  month = oct,
  year = {2019},
  coden = {ITDSEO},
  doi = {https://doi.org/10.1109/TPDS.2019.2908639},
  issn = {1045-9219 (print), 1558-2183 (electronic)},
  issn-l = {1045-9219},
  bibdate = {Thu Dec 19 09:20:35 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Parallel and Distributed
    Systems},
  journal-url = {http://www.computer.org/portal/web/csdl/transactions/tpds},
  keywords = {Computer science; factor matrices; GPGPU; graph
    theory; Graphics processing units; GTA scales;
    GTA-FULL; GTA-PART; heterogeneous computing;
    Heterogeneous networks; heterogeneous platforms;
    high-performance Tucker factorization; large-scale
    multidimensional data; matrix decomposition; Memory
    management; memory requirements; Motion pictures;
    OpenCL; row-wise update rule; Scalability; Tensor
    analysis; tensor factorization algorithms; tucker
    factorization},
}

@article{Ortega:2019:CAC,
  author = {G. Ortega and E. M. T. Hendrix and I. Garc{\'\i}a},
  title = {A {CUDA} approach to compute perishable inventory
    control policies using value iteration},
  journal = j-J-SUPERCOMPUTING,
  volume = {75},
  number = {3},
  pages = {1580--1593},
  month = mar,
  year = {2019},
  coden = {JOSUED},
  doi = {https://doi.org/10.1007/s11227-018-2692-z},
  issn = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l = {0920-8542},
  bibdate = {Thu Oct 10 15:31:17 MDT 2019},
  bibsource = {http://link.springer.com/journal/11227/75/3;
    http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://link.springer.com/content/pdf/10.1007/s11227-018-2692-z.pdf},
  acknowledgement = ack-nhfb,
  fjournal = {The Journal of Supercomputing},
  journal-url = {http://link.springer.com/journal/11227},
}

@article{Otero:2019:OAA,
  author = {Evelyn Otero and Jing Gong and Misun Min and Paul
    Fischer and Philipp Schlatter and Erwin Laure},
  title = {{OpenACC} acceleration for the {$ P_N $--$ P_{N - 2}
    $} algorithm in {Nek5000}},
  journal = j-J-PAR-DIST-COMP,
  volume = {132},
  number = {??},
  pages = {69--78},
  month = oct,
  year = {2019},
  coden = {JPDCER},
  doi = {https://doi.org/10.1016/j.jpdc.2019.05.010},
  issn = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l = {0743-7315},
  bibdate = {Fri Sep 13 10:25:20 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url = {http://www.sciencedirect.com/science/article/pii/S0743731518305549},
  acknowledgement = ack-nhfb,
  fjournal = {Journal of Parallel and Distributed Computing},
  journal-url = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Park:2019:DBO,
  author = {Sanghyun Park and Taeweon Suh},
  title = {{DQN}-based {OpenCL} workload partition for
    performance optimization},
  journal = j-J-SUPERCOMPUTING,
  volume = {75},
  number = {8},
  pages = {4875--4893},
  month = aug,
  year = {2019},
  CODEN = {JOSUED},
  DOI = {https://doi.org/10.1007/s11227-019-02766-0},
  ISSN = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L = {0920-8542},
  bibdate = {Thu Oct 10 15:31:21 MDT 2019},
  bibsource = {http://link.springer.com/journal/11227/75/8;
    http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {The Journal of Supercomputing},
  journal-URL = {http://link.springer.com/journal/11227},
}

@Article{Perez:2019:ATO,
  author = {B. P{\'e}rez and E. Stafford and J. L. Bosque and R.
    Beivide and S. Mateo and X. Teruel and X. Martorell and
    E. Ayguad{\'e}},
  title = {Auto-tuned {OpenCL} kernel co-execution in {OmpSs} for
    heterogeneous systems},
  journal = j-J-PAR-DIST-COMP,
  volume = {125},
  number = {??},
  pages = {45--57},
  month = mar,
  year = {2019},
  CODEN = {JPDCER},
  DOI = {https://doi.org/10.1016/j.jpdc.2018.11.001},
  ISSN = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L = {0743-7315},
  bibdate = {Mon Jan 7 07:58:40 MST 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0743731518308189},
  acknowledgement = ack-nhfb,
  fjournal = {Journal of Parallel and Distributed Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Pikle:2019:AFE,
  author = {Nileshchandra K. Pikle and Shailesh R. Sathe and
    Arvind Y. Vyavahare},
  title = {Accelerating the finite element analysis of
    functionally graded materials using fixed-grid strategy
    on {CUDA}-enabled {GPUs}},
  journal = j-CCPE,
  volume = {31},
  number = {17},
  pages = {e5207:1--e5207:??},
  day = {10},
  month = sep,
  year = {2019},
  CODEN = {CCPEBO},
  DOI = {https://doi.org/10.1002/cpe.5207},
  ISSN = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L = {1532-0626},
  bibdate = {Sat Oct 12 11:00:05 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Concurrency and Computation: Practice and Experience},
  journal-URL = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate = {03 April 2019},
}

@Article{Pirkelbauer:2019:BTF,
  author = {Peter Pirkelbauer and Amalee Wilson and Christina
    Peterson and Damian Dechev},
  title = {{Blaze-Tasks}: a Framework for Computing Parallel
    Reductions over Tasks},
  journal = j-TACO,
  volume = {15},
  number = {4},
  pages = {66:1--66:??},
  month = jan,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3293448},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Tue Jan 8 17:20:00 MST 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Compared to threads, tasks are a more fine-grained
    alternative. The task parallel programming model offers
    benefits in terms of better performance portability and
    better load-balancing for problems that exhibit
    nonuniform workloads. A common scenario of task
    parallel programming is that a task is recursively
    decomposed into smaller sub-tasks. Depending on the
    problem domain, the number of created sub-tasks may be
    nonuniform, thereby creating potential for significant
    load imbalances in the system. Dynamic load-balancing
    mechanisms will distribute the tasks across available
    threads. The final result of a computation may be
    modeled as a reduction over the results of all
    sub-tasks. This article describes a simple, yet
    effective prototype framework, Blaze-Tasks, for task
    scheduling and task reductions on shared memory
    architectures. The framework has been designed with
    lock-free techniques and generic programming principles
    in mind. Blaze-Tasks is implemented entirely in C++17
    and is thus portable. To load-balance the computation,
    Blaze-Tasks uses task stealing. To manage contention on
    a task pool, the number of lock-free attempts to steal
    a task depends on the distance between thief and pool
    owner and the estimated number of tasks in a victim's
    pool. This article evaluates the Blaze framework on
    Intel and IBM dual-socket systems using nine benchmarks
    and compares its performance with other task parallel
    frameworks. While Cilk outperforms Blaze on Intel on
    most benchmarks, the evaluation shows that Blaze is
    competitive with OpenMP and other library-based
    implementations. On IBM, the experiments show that
    Blaze outperforms other approaches on most
    benchmarks.},
  acknowledgement = ack-nhfb,
  articleno = {66},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Prades:2019:GJM,
  author =       "Javier Prades and Federico Silla",
  title =        "{GPU}-Job Migration: The {rCUDA} Case",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "30",
  number =       "12",
  pages =        "2718--2729",
  month =        dec,
  year =         "2019",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2924433",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Dec 19 09:20:35 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/portal/web/csdl/transactions/tpds",
  keywords =     "CUDA; GPU; Graphics processing units; Middleware;
                 migration; Proposals; rCUDA; Resource management;
                 Virtual machining; virtualization; Virtualization",
}

@Article{Reano:2019:APP,
  author = {Carlos Rea{\~n}o and Javier Prades and Federico
    Silla},
  title = {Analyzing the performance\slash power tradeoff of the
    {rCUDA} middleware for future exascale systems},
  journal = j-J-PAR-DIST-COMP,
  volume = {132},
  number = {??},
  pages = {344--362},
  month = oct,
  year = {2019},
  CODEN = {JPDCER},
  ISSN = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L = {0743-7315},
  bibdate = {Fri Sep 13 10:25:20 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0743731519303491},
  acknowledgement = ack-nhfb,
  fjournal = {Journal of Parallel and Distributed Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Reano:2019:SIN,
  author = {Carlos Rea{\~n}o and Federico Silla},
  title = {On the support of inter-node {P2P} {GPU} memory copies
    in {rCUDA}},
  journal = j-J-PAR-DIST-COMP,
  volume = {127},
  number = {??},
  pages = {28--43},
  month = may,
  year = {2019},
  CODEN = {JPDCER},
  DOI = {https://doi.org/10.1016/j.jpdc.2018.12.011},
  ISSN = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L = {0743-7315},
  bibdate = {Thu Mar 14 15:55:59 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0743731519300255},
  acknowledgement = ack-nhfb,
  fjournal = {Journal of Parallel and Distributed Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Riebler:2019:TAH,
  author = {Heinrich Riebler and Gavin Vaz and Tobias Kenter and
    Christian Plessl},
  title = {Transparent Acceleration for Heterogeneous Platforms
    With Compilation to {OpenCL}},
  journal = j-TACO,
  volume = {16},
  number = {2},
  pages = {14:1--14:??},
  month = may,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3319423},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Jul 26 14:25:54 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Multi-accelerator platforms combine CPUs and different
    accelerator architectures within a single compute node.
    Such systems are capable of processing parallel
    workloads very efficiently while being more energy
    efficient than regular systems consisting of CPUs only.
    However, the architectures of such systems are diverse,
    forcing developers to port applications to each
    accelerator using different programming languages,
    models, tools, and compilers. Developers not only
    require domain-specific knowledge but also need to
    understand the low-level accelerator details, leading
    to an increase in the design effort and costs. To
    tackle this challenge, we propose a compilation
    approach and a practical realization called HTrOP that
    is completely transparent to the user. HTrOP is able to
    automatically analyze a sequential CPU application,
    detect computational hotspots, and generate parallel
    OpenCL host and kernel code. The potential of HTrOP is
    demonstrated by offloading hotspots to different
    OpenCL-enabled resources (currently the CPU, the
    general-purpose GPU, and the manycore Intel Xeon Phi)
    for a broad set of benchmark applications. We present
    an in-depth evaluation of our approach in terms of
    performance gains and energy savings, taking into
    account all static and dynamic overheads. We are able
    to achieve speedups and energy savings of up to two
    orders of magnitude, if an application has sufficient
    computational intensity, when compared to a natively
    compiled application.},
  acknowledgement = ack-nhfb,
  articleno = {14},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Roth:2019:AOC,
  author = {{\'A}goston R{\'o}th},
  title = {Algorithm 992: An {OpenGL}- and {C++}-based Function
    Library for Curve and Surface Modeling in a Large Class
    of Extended {Chebyshev} Spaces},
  journal = j-TOMS,
  volume = {45},
  number = {1},
  pages = {13:1--13:32},
  month = mar,
  year = {2019},
  CODEN = {ACMSCU},
  DOI = {https://doi.org/10.1145/3284979},
  ISSN = {0098-3500 (print), 1557-7295 (electronic)},
  ISSN-L = {0098-3500},
  bibdate = {Mon May 6 18:23:42 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/toms.bib},
  URL = {https://dl.acm.org/citation.cfm?id=3284979},
  abstract = {We propose a platform-independent multi-threaded
    function library that provides data structures to
    generate, differentiate, and render both the ordinary
    basis and the normalized B-basis of a user-specified
    extended Chebyshev (EC) space that comprises the
    constants and can be identified with the solution space
    of a constant-coefficient homogeneous linear
    differential equation defined on a sufficiently small
    interval. Using the obtained normalized B-bases, our
    library can also generate, (partially) differentiate,
    modify, and visualize a large family of so-called
    B-curves and tensor product B-surfaces. Moreover, the
    library also implements methods that can be used to
    perform dimension elevation, to subdivide B-curves and
    B-surfaces by means of de Casteljau-like B-algorithms,
    and to generate basis transformations for the
    B-representation of arbitrary integral curves and
    surfaces that are described in traditional parametric
    form by means of the ordinary bases of the underlying
    EC spaces. Independently of the algebraic, exponential,
    trigonometric, or mixed type of the applied EC space,
    the proposed library is numerically stable and
    efficient up to a reasonable dimension number and may
    be useful for academics and engineers in the fields of
    Approximation Theory, Computer Aided Geometric Design,
    Computer Graphics, and Isogeometric and Numerical
    Analysis.},
  acknowledgement = ack-nhfb,
  articleno = {13},
  fjournal = {ACM Transactions on Mathematical Software (TOMS)},
  journal-URL = {http://dl.acm.org/pub.cfm?id=J782},
}

@Article{Ruhela:2019:EDM,
  author =       "Amit Ruhela and Hari Subramoni and Sourav Chakraborty
                 and Mohammadreza Bayatpour and Pouya Kousha and
                 Dhabaleswar K. Panda",
  title =        "Efficient design for {MPI} asynchronous progress
                 without dedicated resources",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "85",
  number =       "??",
  pages =        "13--26",
  month =        jul,
  year =         "2019",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2019.03.003",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Oct 14 16:20:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819118303302",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  remark =       "The published byline gives the final author as
                 Dhabaleswar K. (DK) Panda; the parenthesized nickname
                 is omitted from the author field because BibTeX would
                 parse it as an additional given name.",
}

@Article{Sala:2019:IBN,
  author = {Kevin Sala and Xavier Teruel and Josep M. Perez and
    Antonio J. Pe{\~n}a and Vicen{\c{c}} Beltran and Jesus
    Labarta},
  title = {Integrating blocking and non-blocking {MPI} primitives
    with task-based programming models},
  journal = j-PARALLEL-COMPUTING,
  volume = {85},
  number = {??},
  pages = {153--166},
  month = jul,
  year = {2019},
  CODEN = {PACOEJ},
  DOI = {https://doi.org/10.1016/j.parco.2018.12.008},
  ISSN = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L = {0167-8191},
  bibdate = {Mon Oct 14 16:20:01 MDT 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0167819118303326},
  acknowledgement = ack-nhfb,
  fjournal = {Parallel Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Schardl:2019:TER,
  author =       "Tao B. Schardl and William S. Moses and Charles E.
                 Leiserson",
  title =        "{Tapir}: Embedding Recursive Fork-join Parallelism
                 into {LLVM}'s Intermediate Representation",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365655",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 27 16:13:12 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3365655",
  abstract =     "Tapir (pronounced TAY-per) is a compiler intermediate
                 representation (IR) that embeds recursive fork-join
                 parallelism, as supported by task-parallel programming
                 platforms such as Cilk and OpenMP, into a mainstream
                 compiler's IR. Mainstream compilers typically treat
                 parallel linguistic constructs as syntactic sugar for
                 function calls into a parallel runtime. These calls
                 prevent the compiler from performing optimizations on
                 and across parallel control constructs. Remedying this
                 situation has generally been thought to require an
                 extensive reworking of compiler analyses and code
                 transformations to handle parallel semantics. Tapir
                 leverages the ``serial-projection property,'' which is
                 commonly satisfied by task-parallel programs, to handle
                 the semantics of these programs without an extensive
                 rework of the compiler. For recursive fork-join
                 programs that satisfy the serial-projection property,
                 Tapir enables effective compiler optimization of
                 parallel programs with only minor changes to existing
                 compiler analyses and code transformations. Tapir uses
                 the serial-projection property to order logically
                 parallel fine-grained tasks in the program's
                 control-flow graph. This ordered representation of
                 parallel tasks allows the compiler to optimize parallel
                 codes effectively with only minor modifications. For
                 example, to implement Tapir/LLVM, a prototype of Tapir
                 in the LLVM compiler, we added or modified less than
                 3,000 lines of LLVM's half-million-line core middle-end
                 functionality. These changes sufficed to enable LLVM's
                 existing compiler optimizations for serial
                 code---including loop-invariant-code motion,
                 common-subexpression elimination, and tail-recursion
                 elimination---to work with parallel control constructs
                 such as parallel loops and Cilk's Cilk\_Spawn keyword.
                 Tapir also supports parallel optimizations, such as
                 loop scheduling, which restructure the parallel control
                 flow of the program. By making use of existing LLVM
                 optimizations and new parallel optimizations,
                 Tapir/LLVM can optimize recursive fork-join programs
                 more effectively than traditional compilation methods.
                 On a suite of 35 Cilk application benchmarks,
                 Tapir/LLVM produces more efficient executables for 30
                 benchmarks, with faster 18-core running times for 26 of
                 them, compared to a nearly identical compiler that
                 compiles parallel linguistic constructs the traditional
                 way.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Searles:2019:MOA,
  author = {Robert Searles and Sunita Chandrasekaran and Wayne
    Joubert and Oscar Hernandez},
  title = {{MPI + OpenACC}: Accelerating radiation transport
    mini-application, minisweep, on heterogeneous systems},
  journal = j-COMP-PHYS-COMM,
  volume = {236},
  number = {??},
  pages = {176--187},
  month = mar,
  year = {2019},
  CODEN = {CPHCBZ},
  DOI = {https://doi.org/10.1016/j.cpc.2018.10.007},
  ISSN = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L = {0010-4655},
  bibdate = {Mon Jan 28 16:49:58 MST 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0010465518303552},
  acknowledgement = ack-nhfb,
  fjournal = {Computer Physics Communications},
  journal-URL = {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Sharif:2019:APC,
  author = {Hashim Sharif and Prakalp Srivastava and Muhammad
    Huzaifa and Maria Kotsifakou and Keyur Joshi and Yasmin
    Sarita and Nathan Zhao and Vikram S. Adve and Sasa
    Misailovic and Sarita Adve},
  title = {{ApproxHPVM}: a portable compiler {IR} for
    accuracy-aware optimizations},
  journal = j-PACMPL,
  volume = {3},
  number = {OOPSLA},
  pages = {186:1--186:30},
  month = oct,
  year = {2019},
  DOI = {https://doi.org/10.1145/3360612},
  bibdate = {Fri Aug 7 19:22:30 MDT 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pacmpl.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {https://dl.acm.org/doi/abs/10.1145/3360612},
  abstract = {We propose ApproxHPVM, a compiler IR and system
    designed to enable accuracy-aware performance and
    energy tuning on heterogeneous systems with multiple
    compute units and approximation methods. ApproxHPVM
    automatically translates end-to-end
    application-portability across heterogeneous hardware
    platforms and enables future capabilities like
    accuracy-aware dynamic scheduling and design space
    exploration.\par

    ApproxHPVM incorporates three main components: (a) a
    compiler IR with hardware-agnostic approximation
    metrics, (b) a hardware-agnostic accuracy-tuning phase
    to identify error-tolerant computations, and (c) an
    accuracy-aware hardware scheduler that maps
    error-tolerant computations to approximate hardware
    components. As ApproxHPVM does not incorporate any
    hardware-specific knowledge as part of the IR, it can
    serve as a portable virtual ISA that can be shipped to
    all kinds of hardware platforms.\par

    We evaluate our framework on nine benchmarks from the
    deep learning domain and five image processing
    benchmarks. Our results show that our framework can
    offload chunks of approximable computations to
    special-purpose accelerators that provide significant
    gains in performance and energy, while staying within
    user-specified application-level quality metrics with
    high probability. Across the 14 benchmarks, we observe
    from $1$--$ 9 \times $ performance speedups and $
    1.1$--$ 11.3 \times $ energy reduction for very small
    reductions in accuracy.},
  acknowledgement = ack-nhfb,
  articleno = {186},
  fjournal = {Proceedings of the ACM on Programming Languages},
  journal-URL = {https://pacmpl.acm.org/},
}

@Article{Shea:2019:HSD,
  author = {Colin Shea and Tinoosh Mohsenin},
  title = {Heterogeneous Scheduling of Deep Neural Networks for
    Low-power Real-time Designs},
  journal = j-JETC,
  volume = {15},
  number = {4},
  pages = {36:1--36:??},
  month = dec,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3358699},
  ISSN = {1550-4832},
  bibdate = {Tue Dec 17 07:50:24 MST 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jetc.bib;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {https://dl.acm.org/ft_gateway.cfm?id=3358699},
  abstract = {Deep neural networks have become the readiest answer
    to a range of application challenges including image
    recognition, stock analysis, natural language
    processing, and biomedical applications such as seizure
    detection. All while outperforming prior leading
    solutions that relied heavily on hand-engineered
    techniques. However, deployment of these neural
    networks often requires high-computational and
    memory-intensive solutions. These requirements make it
    challenging to deploy Deep Neural Networks (DNNs) in
    embedded, real-time low-power applications where
    classic architectures, GPUs and CPUs, still impose
    significant power burden. Systems-on-Chip (SoC) with
    Field-programmable Gate Arrays (FPGAs) can be used to
    improve performance and allow more fine-grain control
    of resources than CPUs or GPUs, but it is difficult to
    find the optimal balance between hardware and software
    to improve DNN efficiency. In the current research
    literature there have been few proposed solutions to
    address optimizing hardware and software deployments of
    DNNs in embedded low-power systems. To address the
    computation resource restriction and low-power needs
    for deploying these networks, we describe and implement
    a domain-specific metric model for optimizing task
    deployment on differing platforms, hardware and
    software. Next, we propose a DNN hardware accelerator
    called Scalable Low-power Accelerator for real-time
    deep neural Networks (SCALENet) that includes
    multithreaded software workers. Finally, we propose a
    heterogeneous aware scheduler that uses the
    DNN-specific metric models and the SCALENet accelerator
    to allocate a task to a resource based on solving a
    numerical cost for a series of domain objectives. To
    demonstrate the applicability of our contribution, we
    deploy nine modern deep network architectures, each
    containing a different number of parameters within the
    context of two different neural network applications:
    image processing and biomedical seizure detection.
    Utilizing the metric modeling techniques integrated
    into the heterogeneous aware scheduler and the SCALENet
    accelerator, we demonstrate the ability to meet
    computational requirements, adapt to multiple
    architectures, and lower power by providing an
    optimized task to resource allocation. Our
    heterogeneous aware scheduler improves power saving by
    decreasing power consumption by 10\% of the total
    system power, does not affect the accuracy of the
    networks, and still meets the real-time deadlines. We
    demonstrate the ability to achieve parity with or
    exceed the energy efficiency of NVIDIA GPUs when
    evaluated against Jetson TK1 with embedded GPU SoC and
    with a 4$ \times $ power savings in a power envelope of
    2.0W. When compared to existing FPGA-based
    accelerators, SCALENet's accelerator and heterogeneous
    aware scheduler achieves a 4$ \times $ improvement in
    energy efficiency.},
  acknowledgement = ack-nhfb,
  articleno = {36},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Shekofteh:2019:MSG,
  author = {S.-Kazem Shekofteh and Hamid Noori and Mahmoud
    Naghibzadeh and Hadi Sadoghi Yazdi and Holger
    Fr{\"o}ning},
  title = {Metric Selection for {GPU} Kernel Classification},
  journal = j-TACO,
  volume = {15},
  number = {4},
  pages = {68:1--68:??},
  month = jan,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3295690},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Tue Jan 8 17:20:00 MST 2019},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Graphics Processing Units (GPUs) are vastly used for
    running massively parallel programs. GPU kernels
    exhibit different behavior at runtime and can usually
    be classified in a simple form as either
    ``compute-bound'' or ``memory-bound.'' Recent GPUs are
    capable of concurrently running multiple kernels, which
    raises the question of how to most appropriately
    schedule kernels to achieve higher performance. In
    particular, co-scheduling of compute-bound and
    memory-bound kernels seems promising. However, its
    benefits as well as drawbacks must be determined along
    with which kernels should be selected for a concurrent
    execution. Classifying kernels can be performed online
    by instrumentation based on performance counters. This
    work conducts a thorough analysis of the metrics
    collected from various benchmarks from Rodinia and CUDA
    SDK. The goal is to find the minimum number of
    effective metrics that enables online classification of
    kernels with a low overhead. This study employs a
    wrapper-based feature selection method based on the
    Fisher feature selection criterion. The results of
    experiments show that to classify kernels with a high
    accuracy, only three and five metrics are sufficient on
    a Kepler and a Pascal GPU, respectively. The proposed
    method is then utilized for a runtime scheduler. The
    results show an average speedup of 1.18$ \times $ and
    1.1$ \times $ compared with a serial and a random
    scheduler, respectively.},
  acknowledgement = ack-nhfb,
  articleno = {68},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Shterenlikht:2019:MVF,
  author          = {Anton Shterenlikht and Luis Cebamanos},
  title           = {{MPI} vs {Fortran} coarrays beyond 100k cores: {$3$D}
                     cellular automata},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {84},
  number          = {??},
  pages           = {37--49},
  month           = may,
  year            = {2019},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Mon Oct 14 16:20:01 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                     http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0167819118303181},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Simmendinger:2019:ISG,
  author          = {Christian Simmendinger and Roman Iakymchuk and Luis
                     Cebamanos and Dana Akhmetova and Valeria Bartsch and
                     Tiberiu Rotaru and Mirko Rahn and Erwin Laure and
                     Stefano Markidis},
  title           = {Interoperability strategies for {GASPI} and {MPI} in
                     large-scale scientific applications},
  journal         = j-IJHPCA,
  volume          = {33},
  number          = {3},
  pages           = {554--568},
  day             = {1},
  month           = may,
  year            = {2019},
  CODEN           = {IHPCFL},
  DOI             = {https://doi.org/10.1177/1094342018808359},
  ISSN            = {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L          = {1094-3420},
  bibdate         = {Wed Oct 9 14:35:53 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {https://journals.sagepub.com/doi/full/10.1177/1094342018808359},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of High Performance Computing
                     Applications},
  journal-URL     = {https://journals.sagepub.com/home/hpc},
}

%%% ParaCells: a cell-centered GPU simulation architecture for NVIDIA
%%% CUDA (IEEE/ACM TCBB 16(3), May 2019).
@Article{Song:2019:PGA,
  author =       "You Song and Siyu Yang and Jinzhi Lei",
  title =        "{ParaCells}: a {GPU} Architecture for Cell-Centered
                 Models in Computational Biology",
  journal =      j-TCBB,
  volume =       "16",
  number =       "3",
  pages =        "994--1006",
  month =        may,
  year =         "2019",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2018.2814570",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Fri Aug 23 11:22:19 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  abstract =     "In computational biology, the hierarchy of biological
                 systems requires the development of flexible and
                 powerful computational tools. Graphics processing unit
                 (GPU) architecture has been a suitable device for
                 parallel computing in simulating multi-cellular
                 systems. However, in modeling complex biological
                 systems, scientists often face two tasks, mathematical
                 formulation and skillful programming. In particular,
                 specific programming skills are needed for GPU
                 programming. Therefore, the development of an
                 easy-to-use computational architecture, which utilizes
                 GPU for parallel computing and provides intuitive
                 interfaces for simple implementation, is needed so that
                 general scientists can perform GPU simulations without
                 knowing much about the GPU architecture. Here, we
                 introduce ParaCells, a cell-centered GPU simulation
                 architecture for NVIDIA compute unified device
                 architecture (CUDA). ParaCells was designed as a
                 versatile architecture that connects the user logic in
                 C++ with NVIDIA CUDA runtime and is specific to the
                 modeling of multi-cellular systems. An advantage of
                 ParaCells is its object-oriented model declaration,
                 which allows it to be widely applied to many biological
                 systems through the combination of basic biological
                 concepts. We test ParaCells with two applications. Both
                 applications are significantly faster when compared
                 with sequential as well as parallel OpenMP and OpenACC
                 implementations. Moreover, the simulation programs
                 based on ParaCells are cleaner and more readable than
                 other versions.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J954",
}

%%% ACM TOMS Algorithm 997: the pySDC Python framework for spectral
%%% deferred corrections and PFASST (TOMS 45(3), August 2019).
@Article{Speck:2019:APP,
  author =       "Robert Speck",
  title =        "{Algorithm 997}: {pySDC}---Prototyping Spectral Deferred
                 Corrections",
  journal =      j-TOMS,
  volume =       "45",
  number =       "3",
  pages =        "35:1--35:23",
  month =        aug,
  year =         "2019",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3310410",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Tue Sep 3 17:49:22 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/citation.cfm?id=3310410",
  abstract =     "In this article, we present the Python framework pySDC
                 for solving collocation problems with spectral deferred
                 correction (SDC) methods and their time-parallel
                 variant PFASST, the parallel full approximation scheme
                 in space and time. pySDC features many implementations
                 of SDC and PFASST, from simple implicit timestepping to
                 high-order implicit-explicit or multi-implicit
                 splitting and multilevel SDCs. The software package
                 comes with many different, preimplemented examples and
                 has seven tutorials to help new users with their first
                 steps. Time parallelism is implemented either in an
                 emulated way for debugging and prototyping or using MPI
                 for benchmarking. The code is fully documented and
                 tested using continuous integration, including most
                 results of previous publications. Here, we describe the
                 structure of the code by taking two different
                 perspectives: those of the user and those of the
                 developer. The first sheds light on the front-end, the
                 examples, and the tutorials, and the second is used to
                 describe the underlying implementation and the data
                 structures. We show three different examples to
                 highlight various aspects of the implementation, the
                 capabilities, and the usage of pySDC. In addition,
                 couplings to the FEniCS framework and PETSc, the latter
                 including spatial parallelism with MPI, are
                 described.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@article{St-Onge:2019:ESS,
  author          = {Guillaume St-Onge and Jean-Gabriel Young and Laurent
                     H{\'e}bert-Dufresne and Louis J. Dub{\'e}},
  title           = {Efficient sampling of spreading processes on complex
                     networks using a composition and rejection algorithm},
  journal         = j-COMP-PHYS-COMM,
  volume          = {240},
  number          = {??},
  pages           = {30--37},
  month           = jul,
  year            = {2019},
  CODEN           = {CPHCBZ},
  DOI             = {https://doi.org/10.1016/j.cpc.2019.02.008},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  bibdate         = {Fri Jun 14 08:12:51 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0010465519300608},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Sultana:2019:FRB,
  author          = {Nawrin Sultana and Martin R{\"u}fenacht and Anthony
                     Skjellum and Ignacio Laguna and Kathryn Mohror},
  title           = {Failure recovery for bulk synchronous applications
                     with {MPI} stages},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {84},
  number          = {??},
  pages           = {1--14},
  month           = may,
  year            = {2019},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Mon Oct 14 16:20:01 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0167819118303260},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Tang:2019:MNT,
  author          = {Yibin Tang and Ying Wang and Huawei Li and Xiaowei
                     Li},
  title           = {{MV-Net}: Toward Real-Time Deep Learning on Mobile
                     {GPGPU} Systems},
  journal         = j-JETC,
  volume          = {15},
  number          = {4},
  pages           = {35:1--35:??},
  month           = dec,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3358696},
  ISSN            = {1550-4832},
  bibdate         = {Tue Dec 17 07:50:24 MST 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jetc.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3358696},
  abstract        = {Recently the development of deep learning has been
                     propelling the sheer growth of vision and speech
                     applications on lightweight embedded and mobile
                     systems. However, the limitation of computation
                     resource and power delivery capability in embedded
                     platforms is recognized as a significant bottleneck
                     that prevents the systems from providing real-time deep
                     learning ability, since the inference of deep
                     convolutional neural networks (CNNs) and recurrent
                     neural networks (RNNs) involves large quantities of
                     weights and operations. Particularly, how to provide
                     quality-of-services (QoS)-guaranteed neural network
                     inference ability in the multitask execution
                     environment of multicore SoCs is even more complicated
                     due to the existence of resource contention. In this
                     article, we present a novel deep neural network
                     architecture, MV-Net, which provides performance
                     elasticity and contention-aware self-scheduling ability
                     for QoS enhancement in mobile computing systems. When
                     the constraints of QoS, output accuracy, and resource
                     contention status of the system change, MV-Net can
                     dynamically reconfigure the corresponding neural
                     network propagation paths and thus achieves an
                     effective tradeoff between neural network computational
                     complexity and prediction accuracy via approximate
                     computing. The experimental results show that (1)
                     MV-Net significantly improves the performance
                     flexibility of current CNN models and makes it possible
                     to provide always-guaranteed QoS in a multitask
                     environment, and (2) it satisfies the
                     quality-of-results (QoR) requirement, outperforming the
                     baseline implementation significantly, and improves the
                     system energy efficiency at the same time.},
  acknowledgement = ack-nhfb,
  articleno       = {35},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Tang:2019:QDL,
  author          = {Xulong Tang and Ashutosh Pattnaik and Onur Kayiran and
                     Adwait Jog and Mahmut Taylan Kandemir and Chita Das},
  title           = {Quantifying Data Locality in Dynamic Parallelism in
                     {GPUs}},
  journal         = j-SIGMETRICS,
  volume          = {47},
  number          = {1},
  pages           = {25--26},
  month           = dec,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3376930.3376947},
  ISSN            = {0163-5999 (print), 1557-9484 (electronic)},
  ISSN-L          = {0163-5999},
  bibdate         = {Mon Jan 27 06:15:26 MST 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigmetrics.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3376930.3376947},
  abstract        = {Dynamic parallelism (DP) is a new feature of emerging
                     GPUs that allows new kernels to be generated and
                     scheduled from the device-side (GPU) without the
                     host-side (CPU) intervention. To efficiently support
                     DP, one of the major challenges is to saturate the
                     \ldots{}},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGMETRICS Performance Evaluation Review},
  journal-URL     = {https://dl.acm.org/loi/sigmetrics},
}

@article{Teijeiro:2019:OPS,
  author          = {Carlos Teijeiro and Thomas Hammerschmidt and Ralf
                     Drautz and Godehard Sutmann},
  title           = {Optimized parallel simulations of analytic bond-order
                     potentials on hybrid shared\slash distributed memory
                     with {MPI} and {OpenMP}},
  journal         = j-IJHPCA,
  volume          = {33},
  number          = {2},
  pages           = {227--241},
  day             = {1},
  month           = mar,
  year            = {2019},
  CODEN           = {IHPCFL},
  DOI             = {https://doi.org/10.1177/1094342017727060},
  ISSN            = {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L          = {1094-3420},
  bibdate         = {Wed Oct 9 14:35:53 MDT 2019},
  bibsource       = {http://hpc.sagepub.com/;
                     http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {https://journals.sagepub.com/doi/full/10.1177/1094342017727060},
  acknowledgement = ack-nhfb,
  fjournal        = {International Journal of High Performance Computing
                     Applications},
  journal-URL     = {https://journals.sagepub.com/home/hpc},
}

@article{Teunissen:2019:GML,
  author          = {J. Teunissen and R. Keppens},
  title           = {A geometric multigrid library for quadtree\slash
                     octree {AMR} grids coupled to {MPI-AMRVAC}},
  journal         = j-COMP-PHYS-COMM,
  volume          = {245},
  number          = {??},
  pages           = {Article 106866},
  month           = dec,
  year            = {2019},
  CODEN           = {CPHCBZ},
  DOI             = {https://doi.org/10.1016/j.cpc.2019.106866},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  bibdate         = {Tue Oct 29 11:44:58 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S001046551930253X},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Tian:2019:GAB,
  author          = {Tian Tian and Dunwei Gong and Fei-Ching Kuo and Huai
                     Liu},
  title           = {Genetic algorithm based test data generation for {MPI}
                     parallel programs with blocking communication},
  journal         = j-J-SYST-SOFTW,
  volume          = {155},
  number          = {??},
  pages           = {130--144},
  month           = sep,
  year            = {2019},
  CODEN           = {JSSODM},
  ISSN            = {0164-1212 (print), 1873-1228 (electronic)},
  ISSN-L          = {0164-1212},
  bibdate         = {Wed Oct 16 06:54:20 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jsystsoftw.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0164121219300810},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Systems and Software},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01641212},
}

@article{Tu:2019:AOS,
  author          = {Chia-Heng Tu and Te-Sheng Lin},
  title           = {Augmenting Operating Systems with {OpenCL}
                     Accelerators},
  journal         = j-TODAES,
  volume          = {24},
  number          = {3},
  pages           = {30:1--30:29},
  month           = jun,
  year            = {2019},
  CODEN           = {ATASFO},
  DOI             = {https://doi.org/10.1145/3315569},
  ISSN            = {1084-4309 (print), 1557-7309 (electronic)},
  ISSN-L          = {1084-4309},
  bibdate         = {Thu Jan 30 09:00:30 MST 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/todaes.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3315569},
  abstract        = {Heterogeneous computing leverages more than one kind
                     of processors to boost the performance of user-space
                     applications with the heterogeneous programming
                     languages, e.g., OpenCL. While some works have been
                     done to accelerate the computations required by
                     \ldots{}},
  acknowledgement = ack-nhfb,
  articleno       = {30},
  fjournal        = {ACM Transactions on Design Automation of Electronic
                     Systems},
  journal-URL     = {https://dl.acm.org/loi/todaes},
}

%%% Processor-oblivious record-and-replay for dynamic multithreaded
%%% languages, implemented in the Cilk Plus runtime (TOPC 6(4),
%%% December 2019).
@Article{Utterback:2019:POR,
  author =       "Robert Utterback and Kunal Agrawal and I-Ting Angelina
                 Lee and Milind Kulkarni",
  title =        "Processor-Oblivious Record and Replay",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "20:1--20:??",
  month =        dec,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365659",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 27 16:13:12 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3365659",
  abstract =     "Record-and-replay systems are useful tools for
                 debugging non-deterministic parallel programs by first
                 recording an execution and then replaying that
                 execution to produce the same access pattern. Existing
                 record-and-replay systems generally target thread-based
                 execution models, and record the behaviors and
                 interleavings of individual threads. Dynamic
                 multithreaded languages and libraries, such as the Cilk
                 family, OpenMP, TBB, and the like, do not have a notion
                 of threads. Instead, these languages provide a
                 processor-oblivious model of programming, where
                 programs expose task parallelism using high-level
                 constructs such as spawn/sync without regard to the
                 number of threads/cores available to run the program.
                 Thread-based record-and-replay would violate the
                 processor-oblivious nature of these programs, as they
                 incorporate the number of threads into the recorded
                 information, constraining the replayed execution to the
                 same number of threads. In this article, we present a
                 processor-oblivious record-and-replay scheme for
                 dynamic multithreaded languages where record and replay
                 can use different number of processors and both are
                 scheduled using work stealing. We provide theoretical
                 guarantees for our record and replay scheme---namely that
                 record is optimal for programs with one lock and replay
                 is near-optimal for all cases. In addition, we
                 implemented this scheme in the Cilk Plus runtime system
                 and our evaluation indicates that
                 processor-obliviousness does not cause substantial
                 overheads.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@article{Valero-Lara:2019:MTS,
  author          = {Pedro Valero-Lara and Ra{\"u}l Sirvent and Antonio J.
                     Pe{\~n}a and Jes{\'u}s Labarta},
  title           = {{MPI + OpenMP} tasking scalability for
                     multi-morphology simulations of the human brain},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {84},
  number          = {??},
  pages           = {50--61},
  month           = may,
  year            = {2019},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Mon Oct 14 16:20:01 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S016781911830317X},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Vasilache:2019:NAL,
  author          = {Nicolas Vasilache and Oleksandr Zinenko and Theodoros
                     Theodoridis and Priya Goyal and Zachary Devito and
                     William S. Moses and Sven Verdoolaege and Andrew Adams
                     and Albert Cohen},
  title           = {The Next 700 Accelerated Layers: From Mathematical
                     Expressions of Network Computation Graphs to
                     Accelerated {GPU} Kernels, Automatically},
  journal         = j-TACO,
  volume          = {16},
  number          = {4},
  pages           = {38:1--38:??},
  month           = oct,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3355606},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Sat Oct 12 15:31:26 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Deep learning frameworks automate the deployment,
                     distribution, synchronization, memory allocation, and
                     hardware acceleration of models represented as graphs
                     of computational operators. These operators wrap
                     high-performance libraries such as cuDNN or NNPACK.
                     When the computation does not match any predefined
                     library call, custom operators must be implemented,
                     often at high engineering cost and performance penalty,
                     limiting the pace of innovation. To address this
                     productivity gap, we propose and evaluate: (1) a
                     domain-specific language with a tensor notation close
                     to the mathematics of deep learning; (2) a Just-In-Time
                     optimizing compiler based on the polyhedral framework;
                     (3) carefully coordinated linear optimization and
                     evolutionary algorithms to synthesize high-performance
                     CUDA kernels; (4) the transparent integration of our
                     flow into PyTorch and Caffe2, providing the fully
                     automatic synthesis of high-performance GPU kernels
                     from simple tensor algebra. The performance is
                     comparable to, and often exceeds the performance of,
                     highly tuned libraries.},
  acknowledgement = ack-nhfb,
  articleno       = {38},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Vitali:2019:EOO,
  author          = {Emanuele Vitali and Davide Gadioli and Gianluca
                     Palermo and Andrea Beccari and Carlo Cavazzoni and
                     Cristina Silvano},
  title           = {Exploiting {OpenMP} and {OpenACC} to accelerate a
                     geometric approach to molecular docking in
                     heterogeneous {HPC} nodes},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {75},
  number          = {7},
  pages           = {3374--3396},
  month           = jul,
  year            = {2019},
  CODEN           = {JOSUED},
  DOI             = {https://doi.org/10.1007/s11227-019-02875-w},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  bibdate         = {Thu Oct 10 15:31:20 MDT 2019},
  bibsource       = {http://link.springer.com/journal/11227/75/7;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
}

@article{Vu:2019:FMT,
  author          = {V. A. Vu and G. Tan},
  title           = {A Framework for Mesoscopic Traffic Simulation in
                     {GPU}},
  journal         = j-IEEE-TRANS-PAR-DIST-SYS,
  volume          = {30},
  number          = {8},
  pages           = {1691--1703},
  month           = aug,
  year            = {2019},
  CODEN           = {ITDSEO},
  DOI             = {https://doi.org/10.1109/TPDS.2019.2896636},
  ISSN            = {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L          = {1045-9219},
  bibdate         = {Fri Aug 30 06:09:58 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Transactions on Parallel and Distributed
                     Systems},
  journal-URL     = {http://www.computer.org/portal/web/csdl/transactions/tpds},
  keywords        = {Computational modeling; Data models; data parallelism;
                     data structures; demand and supply components; GPU; GPU
                     threads; graphics processing units; Graphics processing
                     units; high-performance computing; innovative data
                     structure; Load modeling; Loading; mesoscopic traffic
                     simulation; Microscopy; optimisation; optimization;
                     road traffic; simulation algorithm; simulation flow;
                     traffic engineering computing; traffic management
                     support capabilities; traffic network; Vehicles},
}

@article{Waidyasooriya:2019:OBD,
  author          = {Hasitha Muthumala Waidyasooriya and Masanori Hariyama
                     and Masamichi J. Miyama and Masayuki Ohzeki},
  title           = {{OpenCL}-based design of an {FPGA} accelerator for
                     quantum annealing simulation},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {75},
  number          = {8},
  pages           = {5019--5039},
  month           = aug,
  year            = {2019},
  CODEN           = {JOSUED},
  DOI             = {https://doi.org/10.1007/s11227-019-02778-w},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  bibdate         = {Thu Oct 10 15:31:21 MDT 2019},
  bibsource       = {http://link.springer.com/journal/11227/75/8;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
}

@article{Wang:2019:FBA,
  author          = {Haomiao Wang and Prabu Thiagaraj and Oliver Sinnen},
  title           = {{FPGA}-based Acceleration of {FT} Convolution for
                     Pulsar Search Using {OpenCL}},
  journal         = j-TRETS,
  volume          = {11},
  number          = {4},
  pages           = {24:1--24:??},
  month           = jan,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3268933},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Oct 19 17:43:01 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3268933},
  abstract        = {The Square Kilometre Array (SKA) project will be the
                     world's largest radio telescope array. With its large
                     number of antennas, the number of signals that need to
                     be processed is dramatic. One important element of the
                     SKA's Central Signal Processor package is pulsar
                     search. This article focuses on the FPGA-based
                     acceleration of the Frequency-Domain Acceleration
                     Search module, which is a part of SKA pulsar search
                     engine. In this module, the frequency-domain input
                     signals have to be processed by 85 Finite Impulse
                     response (FIR) filters within a short period of
                     limitation and for thousands of input arrays. Because
                     of the large scale of the input length and FIR filter
                     size, even high-end FPGA devices cannot parallelise the
                     task completely. We start by investigating both
                     time-domain FIR filter (TDFIR) and frequency-domain FIR
                     filter (FDFIR) to tackle this task. We applied the
                     overlap-add algorithm to split the coefficient array of
                     TDFIR and the overlap-save algorithm to split the input
                     signals of FDFIR. To achieve fast prototyping design,
                     we employed OpenCL, which is a high-level FPGA
                     development technique. The performance and power
                     consumption are evaluated using multiple FPGA devices
                     simultaneously and compared with GPU results, which is
                     achieved by porting FPGA-based OpenCL kernels. The
                     experimental evaluation shows that the FDFIR solution
                     is very competitive in terms of performance, with a
                     clear energy consumption advantage over the GPU
                     solution.},
  acknowledgement = ack-nhfb,
  articleno       = {24},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Wang:2019:MEM,
  author =       {L. Wang and M. Jahre and A. Adileh and Z. Wang and L.
                 Eeckhout},
  title =        {Modeling Emerging Memory-Divergent {GPU}
                 Applications},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {18},
  number =       {2},
  pages =        {95--98},
  month =        jul,
  year =         {2019},
  DOI =          {https://doi.org/10.1109/LCA.2019.2923618},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Oct 1 10:18:16 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  abstract =     {Analytical performance models yield valuable
                 architectural insight without incurring the excessive
                 runtime overheads of simulation. In this work, we study
                 contemporary GPU applications and find that the key
                 performance-related behavior of such applications is
                 distinct from traditional GPU applications. The key
                 issue is that these GPU applications are
                 memory-intensive and have poor spatial locality, which
                 implies that the loads of different threads commonly
                 access different cache blocks. Such memory-divergent
                 applications quickly exhaust the number of misses the
                 L1 cache can process concurrently, and thereby cripple
                 the GPU's ability to use Memory-Level Parallelism (MLP)
                 and Thread-Level Parallelism (TLP) to hide memory
                 latencies. Our Memory Divergence Model (MDM) is able to
                 accurately represent this behavior and thereby reduces
                 average performance prediction error by $ 14 \times $
                 compared to the state-of-the-art GPUMech approach
                 across our memory-divergent applications.},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {Analytical models; analytical performance models;
                 Analytical performance prediction; average performance
                 prediction error; cache blocks; cache storage;
                 Computational modeling; contemporary GPU applications;
                 GPU; graphics processing units; Graphics processing
                 units; Instruction sets; key performance-related
                 behavior; L1 cache; Mathematical model; memory
                 architecture; memory divergence model; memory
                 latencies; memory-divergent applications;
                 memory-divergent GPU applications; memory-intensive;
                 memory-level parallelism; multi-threading;
                 multiprocessing systems; Predictive models; Random
                 access memory; thread-level parallelism; traditional
                 GPU applications; valuable architectural insight},
}

@Article{Warren:2019:CBG,
  author =       {Craig Warren and Antonios Giannopoulos and Alan Gray
                 and Iraklis Giannakis and Alan Patterson and Laura
                 Wetter and Andre Hamrah},
  title =        {A {CUDA}-based {GPU} engine for {gprMax}: Open source
                 {FDTD} electromagnetic simulation software},
  journal =      j-COMP-PHYS-COMM,
  volume =       {237},
  number =       {??},
  pages =        {208--218},
  month =        apr,
  year =         {2019},
  CODEN =        {CPHCBZ},
  DOI =          {https://doi.org/10.1016/j.cpc.2018.11.007},
  ISSN =         {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L =       {0010-4655},
  bibdate =      {Wed Feb 6 15:16:58 MST 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0010465518303990},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Wende:2019:OVT,
  author =       {Florian Wende and Martijn Marsman and Jeongnim Kim and
                 Fedor Vasilev and Zhengji Zhao and Thomas Steinke},
  title =        {{OpenMP} in {VASP}: Threading and {SIMD}},
  journal =      j-IJQC,
  volume =       {119},
  number =       {12},
  pages =        {e25851:1--e25851:??},
  day =          {15},
  month =        jun,
  year =         {2019},
  CODEN =        {IJQCB2},
  DOI =          {https://doi.org/10.1002/qua.25851},
  ISSN =         {0020-7608 (print), 1097-461X (electronic)},
  ISSN-L =       {0020-7608},
  bibdate =      {Wed Oct 9 06:14:07 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ijqc2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of Quantum Chemistry},
  journal-URL =  {http://www.interscience.wiley.com/jpages/0020-7608/},
  onlinedate =   {19 December 2018},
}

@Article{Winkler:2019:GSM,
  author =       {Daniel Winkler and Massoud Rezavand and Michael
                 Meister and Wolfgang Rauch},
  title =        {{gpuSPHASE} --- a shared memory caching implementation
                 for {$2$D} {SPH} using {CUDA} (new version
                 announcement)},
  journal =      j-COMP-PHYS-COMM,
  volume =       {235},
  number =       {??},
  pages =        {514--516},
  month =        feb,
  year =         {2019},
  CODEN =        {CPHCBZ},
  DOI =          {https://doi.org/10.1016/j.cpc.2018.08.016},
  ISSN =         {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L =       {0010-4655},
  bibdate =      {Sat Nov 24 07:45:46 MST 2018},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0010465518303126},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Wozniak:2019:MJW,
  author =       {Justin M. Wozniak and Matthieu Dorier and Robert Ross
                 and Tong Shu and Tahsin Kurc and Li Tang and Norbert
                 Podhorszki and Matthew Wolf},
  title =        {{MPI} jobs within {MPI} jobs: a practical way of
                 enabling task-level fault-tolerance in {HPC}
                 workflows},
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       {101},
  number =       {??},
  pages =        {576--589},
  month =        dec,
  year =         {2019},
  CODEN =        {FGSEVI},
  DOI =          {https://doi.org/10.1016/j.future.2019.05.020},
  ISSN =         {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L =       {0167-739X},
  bibdate =      {Mon Feb 10 12:55:02 MST 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167739X1830757X},
  acknowledgement = ack-nhfb,
  fjournal =     {Future Generation Computer Systems},
  journal-URL =  {http://www.sciencedirect.com/science/journal/0167739X},
}

@Article{Wu:2019:PMG,
  author =       "J. Wu and X. Yang and Z. Zhang and G. Chen and R.
                 Mao",
  title =        "A Performance Model for {GPU} Architectures that
                 Considers On-Chip Resources: Application to Medical
                 Image Registration",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "30",
  number =       "9",
  pages =        "1947--1961",
  month =        sep,
  year =         "2019",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2905213",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Aug 30 06:09:58 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/portal/web/csdl/transactions/tpds",
  keywords =     "Computational modeling; Computer architecture; CPU;
                 data transfer; Data transfer; GPU architectures;
                 graphics processing unit; graphics processing units;
                 Graphics processing units; image registration; Image
                 registration; medical image processing; medical image
                 registration; NVIDIA GPUs; on-chip GPU resources;
                 on-chip resources; parallel programming; parallel
                 programs; Performance model; performance model;
                 Predictive models; System-on-chip",
}

@Article{Yeh:2019:PGR,
  author =       "Tsung Tai Yeh and Amit Sabne and Putt Sakdhnagool and
                 Rudolf Eigenmann and Timothy G. Rogers",
  title =        "{Pagoda}: a {GPU} Runtime System for Narrow Tasks",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "21:1--21:??",
  month =        nov,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365657",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Massively multithreaded GPUs achieve high throughput
                 by running thousands of threads in parallel. To fully
                 utilize their hardware, contemporary workloads
                 spawn work to the GPU in bulk by launching large tasks,
                 where each task is a kernel that contains thousands of
                 threads that occupy the entire GPU. GPUs face severe
                 underutilization and their performance benefits vanish
                 if the tasks are narrow, i.e., they contain less than
                 512 threads. Latency-sensitive applications in network,
                 signal, and image processing that generate a large
                 number of tasks with relatively small inputs are
                 examples of such limited parallelism. This article
                 presents Pagoda, a runtime system that virtualizes GPU
                 resources, using an OS-like daemon kernel called
                 MasterKernel. Tasks are spawned from the CPU onto
                 Pagoda as they become available, and are scheduled by
                 the MasterKernel at the warp granularity. This level of
                 control enables the GPU to keep scheduling and
                 executing tasks as long as free warps are found,
                 dramatically reducing underutilization. Experimental
                 results on real hardware demonstrate that Pagoda
                 achieves a geometric mean speedup of 5.52X over
                 PThreads running on a 20-core CPU, 1.76X over
                 CUDA-HyperQ, and 1.44X over GeMTC, the state-of-the-art
                 runtime GPU task scheduling system.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Zaitsev:2019:SLD,
  author =       "D. Zaitsev and S. Tomov and J. Dongarra",
  title =        "Solving Linear {Diophantine} Systems on Parallel
                 Architectures",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "30",
  number =       "5",
  pages =        "1158--1169",
  month =        may,
  year =         "2019",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2018.2873354",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Aug 30 06:09:58 2019",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/portal/web/csdl/transactions/tpds",
  keywords =     "application program interfaces; clan; discrete system
                 modeling; discrete-event systems; distributed memory
                 systems; distributed-memory computing nodes;
                 distributing systems; dynamic task-dispatching
                 subsystem; formal languages; linear Diophantine system;
                 linear Diophantine systems-of-equations; logic
                 programming; Mathematical model; mathematics computing;
                 Matrix decomposition; message passing; model checking;
                 MPI; multiple cores; nonnegative integer numbers;
                 OpenMP; parallel architectures; Parallel architectures;
                 parallel-sequential composition; Petri net; Petri nets;
                 polynomials; single indecomposable system; Software
                 algorithms; Sparse matrices; sparse matrices; sparse
                 matrix; speed-up; system clans; Task analysis;
                 two-level parallelization concept",
}

@Article{Adamek:2020:GFC,
  author =       "Karel Ad{\'a}mek and Sofia Dimoudi and Mike Giles and
                 Wesley Armour",
  title =        "{GPU} Fast Convolution via the Overlap-and-Save Method
                 in Shared Memory",
  journal =      j-TACO,
  volume =       "17",
  number =       "3",
  pages =        "18:1--18:20",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3394116",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Aug 28 12:02:00 MDT 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3394116",
  abstract =     "We present an implementation of the overlap-and-save
                 method, a method for the convolution of very long
                 signals with short response functions, which is
                 tailored to GPUs. We have implemented several FFT
                 algorithms (using the CUDA programming language),
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Al-Mouhamed:2020:RCO,
  author =       {Mayez A. Al-Mouhamed and Ayaz H. Khan and Nazeeruddin
                 Mohammad},
  title =        {A review of {CUDA} optimization techniques and tools
                 for structured grid computing},
  journal =      j-COMPUTING,
  volume =       {102},
  number =       {4},
  pages =        {977--1003},
  month =        apr,
  year =         {2020},
  CODEN =        {CMPTA2},
  DOI =          {https://doi.org/10.1007/s00607-019-00744-1},
  ISSN =         {0010-485X (print), 1436-5057 (electronic)},
  ISSN-L =       {0010-485X},
  bibdate =      {Tue May 12 18:02:15 MDT 2020},
  bibsource =    {http://link.springer.com/journal/607/102/4;
                 http://www.math.utah.edu/pub/tex/bib/computing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Computing},
  journal-URL =  {http://link.springer.com/journal/607},
}

@Article{Allegretti:2020:OBB,
  author =       {S. Allegretti and F. Bolelli and C. Grana},
  title =        {Optimized Block-Based Algorithms to Label Connected
                 Components on {GPUs}},
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       {31},
  number =       {2},
  pages =        {423--438},
  month =        feb,
  year =         {2020},
  CODEN =        {ITDSEO},
  DOI =          {https://doi.org/10.1109/TPDS.2019.2934683},
  ISSN =         {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L =       {1045-9219},
  bibdate =      {Wed Jan 22 06:09:50 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Transactions on Parallel and Distributed
                 Systems},
  journal-URL =  {http://www.computer.org/portal/web/csdl/transactions/tpds},
  keywords =     {connected components labeling; CUDA; GPU; Parallel
                 processing},
}

@Article{Amos:2020:AQQ,
  author =       {Brandon D. Amos and David R. Easterling and Layne T.
                 Watson and William I. Thacker and Brent S. Castle and
                 Michael W. Trosset},
  title =        {{Algorithm 1007}: {QNSTOP} --- Quasi-{Newton}
                 Algorithm for Stochastic Optimization},
  journal =      j-TOMS,
  volume =       {46},
  number =       {2},
  pages =        {17:1--17:20},
  month =        jun,
  year =         {2020},
  CODEN =        {ACMSCU},
  DOI =          {https://doi.org/10.1145/3374219},
  ISSN =         {0098-3500 (print), 1557-7295 (electronic)},
  ISSN-L =       {0098-3500},
  bibdate =      {Fri Jun 12 07:37:53 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib},
  URL =          {https://dl.acm.org/doi/abs/10.1145/3374219},
  abstract =     {QNSTOP consists of serial and parallel (OpenMP)
                 Fortran 2003 codes for the quasi-Newton stochastic
                 optimization method of Castle and Trosset for
                 stochastic search problems. A complete description of
                 QNSTOP for both local search with stochastic objective
                 and global search with ``noisy'' deterministic
                 objective is given here, to the best of our knowledge,
                 for the first time. For stochastic search problems,
                 some convergence theory exists for particular
                 algorithmic choices and parameter values. Both the
                 parallel driver subroutine, which offers several
                 parallel decomposition strategies, and the serial
                 driver subroutine can be used for local stochastic
                 search or global deterministic search, based on an
                 input switch. Some performance data for computational
                 systems biology problems is given.},
  acknowledgement = ack-nhfb,
  articleno =    {17},
  fjournal =     {ACM Transactions on Mathematical Software (TOMS)},
  journal-URL =  {https://dl.acm.org/loi/toms},
}

@Article{Arabnejad:2020:SSC,
  author =       {Hamid Arabnejad and Jo{\~a}o Bispo and Jorge G.
                 Barbosa},
  title =        {Source-to-source compilation targeting {OpenMP}-based
                 automatic parallelization of {C} applications},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {76},
  number =       {9},
  pages =        {6753--6785},
  month =        sep,
  year =         {2020},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-019-03109-9},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Fri May 14 09:19:58 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {https://link.springer.com/article/10.1007/s11227-019-03109-9},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
  online-date =  {Published: 17 December 2019 Pages: 6753 - 6785},
}

@Article{Awan:2020:CPC,
  author =       {A. A. Awan and A. Jain and C. Chu and H. Subramoni and
                 D. K. Panda},
  title =        {Communication Profiling and Characterization of
                 Deep-Learning Workloads on Clusters With
                 High-Performance Interconnects},
  journal =      j-IEEE-MICRO,
  volume =       {40},
  number =       {1},
  pages =        {35--43},
  month =        jan,
  year =         {2020},
  CODEN =        {IEMIDZ},
  DOI =          {https://doi.org/10.1109/MM.2019.2949986},
  ISSN =         {0272-1732 (print), 1937-4143 (electronic)},
  ISSN-L =       {0272-1732},
  bibdate =      {Wed Jan 22 06:22:53 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Micro},
  journal-URL =  {http://www.computer.org/csdl/mags/mi/index.html},
  keywords =     {Communication Libraries; Deep learning; Distributed
                 computing; Graphics processing units; Heterogeneous
                 networks; Horovod; InfiniBand; Middleware; MVAPICH2
                 MPI; NVLink; Omni-Path; PCIe; Performance Analysis;
                 Performance analysis; Profiling; TensorFlow; Training
                 data},
}

@Article{Baek:2020:ESO,
  author =       {Nakhoon Baek},
  title =        {An emulation scheme for {OpenGL SC 2.0} over
                 {OpenGL}},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {76},
  number =       {10},
  pages =        {7951--7960},
  month =        oct,
  year =         {2020},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-018-2399-1},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Fri May 14 09:19:56 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {https://link.springer.com/article/10.1007/s11227-018-2399-1},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
  online-date =  {Published: 02 May 2018 Pages: 7951 - 7960},
}

@Article{Ballard:2020:TPC,
  author =       "Grey Ballard and Alicia Klinvex and Tamara G. Kolda",
  title =        "{TuckerMPI}: a Parallel {C++\slash MPI} Software
                 Package for Large-scale Data Compression via the
                 {Tucker} Tensor Decomposition",
  journal =      j-TOMS,
  volume =       "46",
  number =       "2",
  pages =        "13:1--13:31",
  month =        jun,
  year =         "2020",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3378445",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Fri Jun 12 07:37:53 MDT 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3378445",
  abstract =     "Our goal is compression of massive-scale
                 grid-structured data, such as the multi-terabyte output
                 of a high-fidelity computational simulation. For such
                 data sets, we have developed a new software package
                 called TuckerMPI, a parallel C++/MPI software package
                 for compressing distributed data. The approach is based
                 on treating the data as a tensor, i.e., a
                 multidimensional array, and computing its truncated
                 Tucker decomposition, a higher-order analogue to the
                 truncated singular value decomposition of a matrix. The
                 result is a low-rank approximation of the original
                 tensor-structured data. Compression efficiency is
                 achieved by detecting latent global structure within
                 the data, which we contrast to most compression methods
                 that are focused on local structure. In this work, we
                 describe TuckerMPI, our implementation of the truncated
                 Tucker decomposition, including details of the data
                 distribution and in-memory layouts, the parallel and
                 serial implementations of the key kernels, and analysis
                 of the storage, communication, and computational costs.
                 We test the software on 4.5 and 6.7 terabyte data sets
                 distributed across 100s of nodes (1,000s of MPI
                 processes), achieving compression ratios between 100
                 and 200,000$ \times $, which equates to 99--99.999\%
                 compression (depending on the desired accuracy) in
                 substantially less time than it would take to even read
                 the same dataset from a parallel file system. Moreover,
                 we show that our method also allows for reconstruction
                 of partial or down-sampled data on a single node,
                 without a parallel computer so long as the
                 reconstructed portion is small enough to fit on a
                 single machine, e.g., in the instance of
                 reconstructing/visualizing a single down-sampled time
                 step or computing summary statistics. The code is
                 available at https://gitlab.com/tensors/TuckerMPI.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "https://dl.acm.org/loi/toms",
}

@Article{Barreda:2020:IFC,
  author =       {Mar{\'\i}a Barreda and Jos{\'e} I. Aliaga and Marc
                 Casas},
  title =        {Iteration-fusing conjugate gradient for sparse linear
                 systems with {MPI + OmpSs}},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {76},
  number =       {9},
  pages =        {6669--6689},
  month =        sep,
  year =         {2020},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-019-03100-4},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Fri May 14 09:19:58 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {https://link.springer.com/article/10.1007/s11227-019-03100-4},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
  online-date =  {Published: 10 December 2019 Pages: 6669 - 6689},
}

@Article{Bernholdt:2020:SMU,
  author =       {David E. Bernholdt and Swen Boehm and George Bosilca
                 and Manjunath Gorentla Venkata and Ryan E. Grant and
                 Thomas Naughton and Howard P. Pritchard and Martin
                 Schulz and Geoffroy R. Vallee},
  title =        {A survey of {MPI} usage in the {US} exascale computing
                 project},
  journal =      j-CCPE,
  volume =       {32},
  number =       {3},
  pages =        {e4851:1--e4851:??},
  day =          {10},
  month =        feb,
  year =         {2020},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.4851},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Wed Mar 31 07:52:13 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {Concurr. Comput.},
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {27 September 2018},
}

@Article{Bombieri:2020:MIB,
  author =       "N. Bombieri and F. Busato and A. Danese and L.
                 Piccolboni and G. Pravadelli",
  title =        "{Mangrove}: An Inference-Based Dynamic Invariant
                 Mining for {GPU} Architectures",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "69",
  number =       "4",
  pages =        "606--620",
  month =        apr,
  year =         "2020",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2019.2953846",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Mar 12 16:58:27 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
  keywords =     "GPUs; inference; Invariant mining",
}

@Article{Cabral:2020:EMO,
  author =       "Frederico L. Cabral and Sanderson L.
                 {Gonzaga de Oliveira} and Carla Osthoff and Gabriel P.
                 Costa and Diego N. Brand{\~a}o and Mauricio
                 Kischinhevsky",
  title =        "An evaluation of {MPI} and {OpenMP} paradigms in
                 finite-difference explicit methods for {PDEs} on
                 shared-memory multi- and manycore systems",
  journal =      j-CCPE,
  volume =       "32",
  number =       "20",
  pages =        "e5642:1--e5642:??",
  day =          "25",
  month =        oct,
  year =         "2020",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.5642",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Wed Mar 31 07:52:20 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Concurr. Comput.",
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "29 December 2019",
}

@Article{Cesarini:2020:CSR,
  author =       "D. Cesarini and A. Bartolini and A. Borghesi and C.
                 Cavazzoni and M. Luisier and L. Benini",
  title =        "Countdown Slack: a Run-Time Library to Reduce Energy
                 Footprint in Large-Scale {MPI} Applications",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "11",
  pages =        "2696--2709",
  month =        nov,
  year =         "2020",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Sat Aug 15 14:52:38 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
}

%%% The system name in the title is the single word EReinit (no internal
%%% space); it is braced so that bibliography styles preserve its case.
@Article{Chakraborty:2020:ESE,
  author =       "Sourav Chakraborty and Ignacio Laguna and Murali Emani
                 and Kathryn Mohror and Dhabaleswar K. Panda and Martin
                 Schulz and Hari Subramoni",
  title =        "{EReinit}: Scalable and efficient fault-tolerance for
                 bulk-synchronous {MPI} applications",
  journal =      j-CCPE,
  volume =       "32",
  number =       "3",
  pages =        "e4863:1--e4863:??",
  day =          "10",
  month =        feb,
  year =         "2020",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4863",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Wed Mar 31 07:52:13 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Concurr. Comput.",
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "14 August 2018",
}

@article{Chang:2020:ADI,
  author          = {Tyler H. Chang and Layne T. Watson and Thomas C. H.
                     Lux and Ali R. Butt and Kirk W. Cameron and Yili Hong},
  title           = {{Algorithm 1012}: {DELAUNAYSPARSE}: Interpolation via
                     a Sparse Subset of the {Delaunay} Triangulation in
                     Medium to High Dimensions},
  journal         = j-TOMS,
  volume          = {46},
  number          = {4},
  pages           = {38:1--38:20},
  month           = nov,
  year            = {2020},
  CODEN           = {ACMSCU},
  DOI             = {https://doi.org/10.1145/3422818},
  ISSN            = {0098-3500 (print), 1557-7295 (electronic)},
  ISSN-L          = {0098-3500},
  bibdate         = {Sat Nov 14 07:15:52 MST 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/toms.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3422818},
  abstract        = {DELAUNAYSPARSE contains both serial and parallel codes
                     written in Fortran 2003 (with OpenMP) for performing
                     medium- to high-dimensional interpolation via the
                     Delaunay triangulation. To accommodate the exponential
                     growth in the size of the Delaunay \ldots{}},
  acknowledgement = ack-nhfb,
  articleno       = {38},
  fjournal        = {ACM Transactions on Mathematical Software (TOMS)},
  journal-URL     = {https://dl.acm.org/loi/toms},
}

%%% journal-URL uses the same IEEE Xplore address as the other IEEE TPDS
%%% entries in this file, so that the journal has one canonical URL.
@Article{Cho:2020:PMP,
  author =       "Y. Cho and S. Oh and B. Egger",
  title =        "Performance Modeling of Parallel Loops on Multi-Socket
                 Platforms Using Queueing Systems",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "2",
  pages =        "318--331",
  month =        feb,
  year =         "2020",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2938172",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Jan 22 06:09:50 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
  keywords =     "Computational modeling; Dynamic scheduling;
                 multi-socket system; Multicore processing; NUMA;
                 OpenMP; parallel loop; Performance modeling; Predictive
                 models; queueing system; Servers; Time factors",
}

@article{Daleiden:2020:GPP,
  author          = {Patrick Daleiden and Andreas Stefik and Philip Merlin
                     Uesbeck},
  title           = {{GPU} Programming Productivity in Different
                     Abstraction Paradigms: a Randomized Controlled Trial
                     Comparing {CUDA} and Thrust},
  journal         = j-TOCE,
  volume          = {20},
  number          = {4},
  pages           = {27:1--27:27},
  month           = nov,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3418301},
  ISSN            = {1946-6226},
  ISSN-L          = {1946-6226},
  bibdate         = {Sat Mar 20 18:20:46 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/toce.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3418301},
  abstract        = {Coprocessor architectures in High Performance
                     Computing are prevalent in today's scientific computing
                     clusters and require specialized knowledge for proper
                     utilization. Various alternative paradigms for parallel
                     and offload computation exist, but little \ldots{}},
  acknowledgement = ack-nhfb,
  articleno       = {27},
  fjournal        = {ACM Transactions on Computing Education},
  journal-URL     = {https://dl.acm.org/loi/toce},
}

@article{Davydov:2020:ADS,
  author          = {Denis Davydov and Martin Kronbichler},
  title           = {Algorithms and Data Structures for Matrix-Free Finite
                     Element Operators with {MPI}-Parallel Sparse
                     Multi-Vectors},
  journal         = j-TOPC,
  volume          = {7},
  number          = {3},
  pages           = {20:1--20:30},
  month           = aug,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3399736},
  ISSN            = {2329-4949 (print), 2329-4957 (electronic)},
  ISSN-L          = {2329-4949},
  bibdate         = {Thu Aug 6 08:56:07 MDT 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/topc.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3399736},
  abstract        = {Traditional solution approaches for problems in
                     quantum mechanics scale as $ O(M^3) $, where $M$ is the
                     number of electrons. Various methods have been proposed
                     to address this issue and obtain a linear scaling $
                     O(M)$. One promising formulation is the direct
                     minimization of energy. Such methods take advantage of
                     physical localization of the solution, allowing users
                     to seek it in terms of non-orthogonal orbitals with
                     local support.\par

                     This work proposes a numerically efficient
                     implementation of sparse parallel vectors within the
                     open-source finite element library deal.II. The main
                     algorithmic ingredient is the matrix-free evaluation of
                     the Hamiltonian operator by cell-wise quadrature. Based
                     on an a-priori chosen support for each vector, we
                     develop algorithms and data structures to perform (i)
                     matrix-free sparse matrix multivector products (SpMM),
                     (ii) the projection of an operator onto a sparse
                     sub-space (inner products), and (iii)
                     post-multiplication of a sparse multivector with a
                     square matrix. The node-level performance is analyzed
                     using a roofline model. Our matrix-free implementation
                     of finite element operators with sparse multivectors
                     achieves a performance of 157 GFlop/s on an Intel
                     Cascade Lake processor with 20 cores. Strong and weak
                     scaling results are reported for a representative
                     benchmark problem using quadratic and quartic finite
                     element bases.},
  acknowledgement = ack-nhfb,
  articleno       = {20},
  fjournal        = {ACM Transactions on Parallel Computing},
  journal-URL     = {https://dl.acm.org/loi/topc},
}

@article{Deng:2020:CCB,
  author          = {Y. Deng and T. Li and Y. Luo and X. Zhao},
  title           = {Corrections to {``CUDA-Based Volume Rendering and
                     Inspection for Time-Varying Ultrasonic Testing
                     Datasets''}},
  journal         = j-COMPUT-SCI-ENG,
  volume          = {22},
  number          = {1},
  pages           = {4--4},
  month           = jan # "\slash " # feb,
  year            = {2020},
  CODEN           = {CSENFA},
  DOI             = {https://doi.org/10.1109/MCSE.2019.2948481},
  ISSN            = {1521-9615 (print), 1558-366X (electronic)},
  ISSN-L          = {1521-9615},
  bibdate         = {Thu Mar 05 14:46:04 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note            = {See \cite{Deng:2019:CBV}.},
  acknowledgement = ack-nhfb,
  fjournal        = {Computing in Science and Engineering},
  journal-URL     = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992},
  keywords        = {Acoustics; Biographies; Inspection; Rendering
                     (computer graphics); Testing},
}

@article{Diener:2020:HCO,
  author          = {Matthias Diener and Laxmikant V. Kale and Daniel J.
                     Bodony},
  title           = {Heterogeneous computing with {OpenMP} and {Hydra}},
  journal         = j-CCPE,
  volume          = {32},
  number          = {20},
  pages           = {e5728:1--e5728:??},
  day             = {25},
  month           = oct,
  year            = {2020},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.5728},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Wed Mar 31 07:52:20 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Concurr. Comput.},
  fjournal        = {Concurrency and Computation: Practice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {07 March 2020},
}

@article{Eichenberger:2020:HCG,
  author          = {A. E. Eichenberger and G.-T. Bercea and A. Bataev and
                     L. Grinberg and J. K. O'Brien},
  title           = {Hybrid {CPU\slash GPU} tasks optimized for concurrency
                     in {OpenMP}},
  journal         = j-IBM-JRD,
  volume          = {64},
  number          = {3/4},
  pages           = {13:1--13:14},
  month           = may # "\slash " # jul,
  year            = {2020},
  CODEN           = {IBMJAE},
  DOI             = {https://doi.org/10.1147/JRD.2019.2960245},
  ISSN            = {0018-8646 (print), 2151-8556 (electronic)},
  ISSN-L          = {0018-8646},
  bibdate         = {Wed Jun 3 18:35:26 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ibmjrd.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/super.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {IBM Journal of Research and Development},
  journal-URL     = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520},
}

@article{Eichstadt:2020:CSM,
  author          = {Jan Eichst{\"a}dt and Martin Vymazal and David Moxey
                     and Joaquim Peir{\'o}},
  title           = {A comparison of the shared-memory parallel programming
                     models {{\em OpenMP}}, {{\em OpenACC}} and {{\em
                     Kokkos}} in the context of implicit solvers for
                     high-order {FEM}},
  journal         = j-COMP-PHYS-COMM,
  volume          = {255},
  number          = {??},
  pages           = {Article 107245},
  month           = oct,
  year            = {2020},
  CODEN           = {CPHCBZ},
  DOI             = {https://doi.org/10.1016/j.cpc.2020.107245},
  ISSN            = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L          = {0010-4655},
  bibdate         = {Fri Jun 19 07:19:50 MDT 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0010465520300746},
  acknowledgement = ack-nhfb,
  fjournal        = {Computer Physics Communications},
  journal-URL     = {http://www.sciencedirect.com/science/journal/00104655},
}

@article{Elis:2020:QNG,
  author          = {Bengisu Elis and Dai Yang and Olga Pearce and Kathryn
                     Mohror and Martin Schulz},
  title           = {{QMPI}: a next generation {MPI} profiling interface
                     for modern {HPC} platforms},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {96},
  number          = {??},
  pages           = {Article 102635},
  month           = aug,
  year            = {2020},
  CODEN           = {PACOEJ},
  DOI             = {https://doi.org/10.1016/j.parco.2020.102635},
  ISSN            = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L          = {0167-8191},
  bibdate         = {Mon Mar 29 11:36:01 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0167819120300284},
  acknowledgement = ack-nhfb,
  fjournal        = {Parallel Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Fan:2020:ALC,
  author          = {Q. Fan and D. J. Lilja and S. S. Sapatnekar},
  title           = {Adaptive-Length Coding of Image Data for Low-Cost
                     Approximate Storage},
  journal         = j-IEEE-TRANS-COMPUT,
  volume          = {69},
  number          = {2},
  pages           = {239--252},
  month           = feb,
  year            = {2020},
  CODEN           = {ITCOB4},
  DOI             = {https://doi.org/10.1109/TC.2019.2946795},
  ISSN            = {0018-9340 (print), 1557-9956 (electronic)},
  ISSN-L          = {0018-9340},
  bibdate         = {Wed Jan 22 06:44:09 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                     http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Transactions on Computers},
  journal-URL     = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
  keywords        = {Adaptive-length coding; approximate storage; Discrete
                     cosine transforms; error-resilience; Huffman coding;
                     Image coding; Reliability; Resilience; Transform
                     coding},
}

@article{Ferreira:2020:HMM,
  author          = {Kurt Ferreira and Ryan E. Grant and Michael J.
                     Levenhagen and Scott Levy and Taylor Groves},
  title           = {Hardware {MPI} message matching: Insights into {MPI}
                     matching behavior to inform design},
  journal         = j-CCPE,
  volume          = {32},
  number          = {3},
  pages           = {e5150:1--e5150:??},
  day             = {10},
  month           = feb,
  year            = {2020},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.5150},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Wed Mar 31 07:52:13 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Concurr. Comput.},
  fjournal        = {Concurrency and Computation: Practice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {27 February 2019},
}

%%% IEEE TPDS appears monthly; the month below is inferred from issue
%%% number 12 of volume 31 (2020) --- TODO confirm against IEEE Xplore.
@Article{Gao:2020:MES,
  author =       "T. Gao and Y. Guo and B. Zhang and P. Cicotti and Y.
                 Lu and P. Balaji and M. Taufer",
  title =        "Memory-Efficient and Skew-Tolerant {MapReduce} Over
                 {MPI} for Supercomputing Systems",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "12",
  pages =        "2734--2748",
  month =        dec,
  year =         "2020",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Sat Aug 15 14:52:38 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/super.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
}

@article{Gawande:2020:SDL,
  author          = {Nitin A. Gawande and Jeff A. Daily and Charles Siegel
                     and Nathan R. Tallent and Abhinav Vishnu},
  title           = {Scaling Deep Learning workloads: {NVIDIA DGX-1\slash
                     Pascal} and {Intel Knights Landing}},
  journal         = j-FUT-GEN-COMP-SYS,
  volume          = {108},
  number          = {??},
  pages           = {1162--1172},
  month           = jul,
  year            = {2020},
  CODEN           = {FGSEVI},
  DOI             = {https://doi.org/10.1016/j.future.2018.04.073},
  ISSN            = {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L          = {0167-739X},
  bibdate         = {Fri Jun 19 07:44:16 MDT 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0167739X17318599},
  acknowledgement = ack-nhfb,
  fjournal        = {Future Generation Computer Systems},
  journal-URL     = {http://www.sciencedirect.com/science/journal/0167739X},
}

@article{Ghazimirsaeed:2020:CAM,
  author          = {S. Mahdieh Ghazimirsaeed and Seyed H. Mirsadeghi and
                     Ahmad Afsahi},
  title           = {Communication-aware message matching in {MPI}},
  journal         = j-CCPE,
  volume          = {32},
  number          = {3},
  pages           = {e4862:1--e4862:??},
  day             = {10},
  month           = feb,
  year            = {2020},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.4862},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Wed Mar 31 07:52:13 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Concurr. Comput.},
  fjournal        = {Concurrency and Computation: Practice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {21 September 2018},
}

@article{Gonzalez-Dominguez:2020:CJA,
  author          = {Jorge Gonz{\'a}lez-Dom{\'\i}nguez and Roberto R.
                     Exp{\'o}sito and Ver{\'o}nica Bol{\'o}n-Canedo},
  title           = {{CUDA-JMI}: Acceleration of feature selection on
                     heterogeneous systems},
  journal         = j-FUT-GEN-COMP-SYS,
  volume          = {102},
  number          = {??},
  pages           = {426--436},
  month           = jan,
  year            = {2020},
  CODEN           = {FGSEVI},
  DOI             = {https://doi.org/10.1016/j.future.2019.08.031},
  ISSN            = {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L          = {0167-739X},
  bibdate         = {Mon Feb 10 12:55:04 MST 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0167739X19312968},
  acknowledgement = ack-nhfb,
  fjournal        = {Future Generation Computer Systems},
  journal-URL     = {http://www.sciencedirect.com/science/journal/0167739X},
}

@article{Gutierrez:2020:MAP,
  author          = {Samuel K. Guti{\'e}rrez and Dorian C. Arnold and Kei
                     Davis and Patrick McCormick},
  title           = {On the memory attribution problem: a solution and case
                     study using {MPI}},
  journal         = j-CCPE,
  volume          = {32},
  number          = {3},
  pages           = {e5159:1--e5159:??},
  day             = {10},
  month           = feb,
  year            = {2020},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.5159},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Wed Mar 31 07:52:13 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Concurr. Comput.},
  fjournal        = {Concurrency and Computation: Practice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {04 February 2019},
}

@article{Hagedorn:2020:AHP,
  author          = {Bastian Hagedorn and Johannes Lenfers and Thomas
                     K{\oe}hler and Xueying Qin and Sergei Gorlatch and
                     Michel Steuwer},
  title           = {Achieving high-performance the functional way: a
                     functional pearl on expressing high-performance
                     optimizations as rewrite strategies},
  journal         = j-PACMPL,
  volume          = {4},
  number          = {ICFP},
  pages           = {92:1--92:29},
  month           = aug,
  year            = {2020},
  DOI             = {https://doi.org/10.1145/3408974},
  bibdate         = {Tue Mar 30 08:10:48 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pacmpl.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3408974},
  abstract        = {Optimizing programs to run efficiently on modern
                     parallel hardware is hard but crucial for many
                     applications. The predominantly used imperative
                     languages --- like C or OpenCL --- force the programmer
                     to intertwine the code describing functionality and
                     \ldots{}},
  acknowledgement = ack-nhfb,
  articleno       = {92},
  fjournal        = {Proceedings of the ACM on Programming Languages},
  journal-URL     = {https://pacmpl.acm.org/},
}

@article{Hashmi:2020:FXZ,
  author          = {Jahanzeb Maqbool Hashmi and Ching-Hsiang Chu and
                     Sourav Chakraborty and Mohammadreza Bayatpour and Hari
                     Subramoni and Dhabaleswar K. Panda},
  title           = {{FALCON-X}: Zero-copy {MPI} derived datatype
                     processing on modern {CPU} and {GPU} architectures},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {144},
  number          = {??},
  pages           = {1--13},
  month           = oct,
  year            = {2020},
  CODEN           = {JPDCER},
  DOI             = {https://doi.org/10.1016/j.jpdc.2020.05.008},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  bibdate         = {Wed May 26 16:11:02 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0743731520302872},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
}

@article{He:2020:SMO,
  author          = {Feng He and Xiaoshe Dong and Nianjun Zou and Weiguo Wu
                     and Xingjun Zhang},
  title           = {Structured mesh-oriented framework design and
                     optimization for a coarse-grained parallel {CFD} solver
                     based on hybrid {MPI\slash OpenMP} programming},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {76},
  number          = {4},
  pages           = {2815--2841},
  month           = apr,
  year            = {2020},
  CODEN           = {JOSUED},
  DOI             = {https://doi.org/10.1007/s11227-019-03063-6},
  ISSN            = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L          = {0920-8542},
  bibdate         = {Sat Jul 25 07:17:55 MDT 2020},
  bibsource       = {http://link.springer.com/journal/11227/76/4;
                     http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {The Journal of Supercomputing},
  journal-URL     = {http://link.springer.com/journal/11227},
}

@article{Huang:2020:POL,
  author          = {Ming Hsiang Huang and Wuu Yang},
  title           = {{PFACC}: an {OpenACC}-like programming model for
                     irregular nested parallelism},
  journal         = j-SPE,
  volume          = {50},
  number          = {10},
  pages           = {1877--1904},
  month           = oct,
  year            = {2020},
  CODEN           = {SPEXBL},
  DOI             = {https://doi.org/10.1002/spe.2868},
  ISSN            = {0038-0644 (print), 1097-024X (electronic)},
  ISSN-L          = {0038-0644},
  bibdate         = {Fri Feb 26 08:59:23 MST 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/spe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Softw. Pract. Exp.},
  fjournal        = {Software --- Practice and Experience},
  journal-URL     = {http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X},
  onlinedate      = {09 July 2020},
}

%%% Accented dotless-i in author names is written {\'\i}, the same form
%%% used by the other entries in this file.
@Article{Jaksic:2020:HPF,
  author =       "Zoran Jak{\v{s}}i{\'c} and Nicola Cadenelli and David
                 Buchaca Prats and Jord{\`a} Polo and Josep Llu{\'\i}s
                 Berral Garcia and David Carrera Perez",
  title =        "A highly parameterizable framework for Conditional
                 Restricted {Boltzmann} Machine based workloads
                 accelerated with {FPGAs} and {OpenCL}",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "104",
  number =       "??",
  pages =        "201--211",
  month =        mar,
  year =         "2020",
  CODEN =        "FGSEVI",
  DOI =          "https://doi.org/10.1016/j.future.2019.10.025",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Mon Feb 10 12:55:06 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167739X19313676",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

%%% IEEE TPDS appears monthly; the month below is inferred from issue
%%% number 11 of volume 31 (2020) --- TODO confirm against IEEE Xplore.
@Article{Kang:2020:IMC,
  author =       "Q. Kang and S. Lee and K. Hou and R. Ross and A.
                 Agrawal and A. Choudhary and W. Liao",
  title =        "Improving {MPI} Collective {I/O} for High Volume
                 Non-Contiguous Requests With Intra-Node Aggregation",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "11",
  pages =        "2682--2695",
  month =        nov,
  year =         "2020",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Sat Aug 15 14:52:38 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
}

@article{Kirkham:2020:FEM,
  author          = {Jake Kirkham and Tyler Sorensen and Esin Tureci and
                     Margaret Martonosi},
  title           = {Foundations of empirical memory consistency testing},
  journal         = j-PACMPL,
  volume          = {4},
  number          = {OOPSLA},
  pages           = {226:1--226:29},
  month           = nov,
  year            = {2020},
  DOI             = {https://doi.org/10.1145/3428294},
  bibdate         = {Tue Mar 30 08:10:50 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pacmpl.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3428294},
  abstract        = {Modern memory consistency models are complex, and it
                     is difficult to reason about the relaxed behaviors that
                     current systems allow. Programming languages, such as C
                     and OpenCL, offer a memory model interface that
                     developers can use to safely write \ldots{}},
  acknowledgement = ack-nhfb,
  articleno       = {226},
  fjournal        = {Proceedings of the ACM on Programming Languages},
  journal-URL     = {https://pacmpl.acm.org/},
}

@article{Klinkenberg:2020:CRL,
  author          = {Jannis Klinkenberg and Philipp Samfass and Michael
                     Bader and Christian Terboven and Matthias S.
                     M{\"u}ller},
  title           = {{CHAMELEON}: Reactive Load Balancing for Hybrid {MPI +
                     OpenMP} Task-Parallel Applications},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {138},
  number          = {??},
  pages           = {55--64},
  month           = apr,
  year            = {2020},
  CODEN           = {JPDCER},
  ISSN            = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L          = {0743-7315},
  bibdate         = {Wed Mar 18 09:26:11 MDT 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0743731519305180},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-URL     = {http://www.sciencedirect.com/science/journal/07437315},
}

@article{Korch:2020:ILE,
  author          = {Matthias Korch and Tim Werner},
  title           = {Improving locality of explicit one-step methods on
                     {GPUs} by tiling across stages and time steps},
  journal         = j-FUT-GEN-COMP-SYS,
  volume          = {102},
  number          = {??},
  pages           = {889--901},
  month           = jan,
  year            = {2020},
  CODEN           = {FGSEVI},
  DOI             = {https://doi.org/10.1016/j.future.2019.07.075},
  ISSN            = {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L          = {0167-739X},
  bibdate         = {Mon Feb 10 12:55:04 MST 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL             = {http://www.sciencedirect.com/science/article/pii/S0167739X19307186},
  acknowledgement = ack-nhfb,
  fjournal        = {Future Generation Computer Systems},
  journal-URL     = {http://www.sciencedirect.com/science/journal/0167739X},
}

@article{Larrea:2020:EPM,
  author          = {Ver{\'o}nica G. Vergara Larrea and Reuben D. Budiardja
                     and Rahulkumar Gayatri and Christopher Daley and Oscar
                     Hernandez and Wayne Joubert},
  title           = {Experiences in porting mini-applications to {OpenACC}
                     and {OpenMP} on heterogeneous systems},
  journal         = j-CCPE,
  volume          = {32},
  number          = {20},
  pages           = {e5780:1--e5780:??},
  day             = {25},
  month           = oct,
  year            = {2020},
  CODEN           = {CCPEBO},
  DOI             = {https://doi.org/10.1002/cpe.5780},
  ISSN            = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L          = {1532-0626},
  bibdate         = {Wed Mar 31 07:52:20 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Concurr. Comput.},
  fjournal        = {Concurrency and Computation: Practice and Experience},
  journal-URL     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {24 April 2020},
}

@Article{Levy:2020:UVA,
  author =       {Levy, Scott and
                 Ferreira, Kurt B. and
                 Widener, Patrick},
  title =        {The unexpected virtue of almost: Exploiting {MPI}
                 collective operations to approximately coordinate
                 checkpoints},
  journal =      j-CCPE,
  volume =       {32},
  number =       {3},
  pages =        {e4890:1--e4890:??},
  day =          {10},
  month =        feb,
  year =         {2020},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.4890},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Wed Mar 31 07:52:13 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {Concurr. Comput.},
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {09 September 2018},
}

@Article{Li:2020:OOS,
  author =       {Li, Ting and
                 Stanislawski, Lawrence V. and
                 Brockmeyer, Tyler and
                 Wang, Shaowen and
                 Shavers, Ethan},
  title =        {\pkg{OpenCLC}: an open-source software tool for
                 similarity assessment of linear hydrographic features},
  journal =      j-SOFTWAREX,
  volume =       {11},
  number =       {??},
  pages =        {Article 100401},
  month =        jan # "\slash " # jun,
  year =         {2020},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1016/j.softx.2020.100401},
  ISSN =         {2352-7110},
  ISSN-L =       {2352-7110},
  bibdate =      {Fri Apr 9 16:04:39 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/softwarex.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S2352711018302747},
  acknowledgement = ack-nhfb,
  fjournal =     {SoftwareX},
  journal-URL =  {https://www.sciencedirect.com/journal/softwarex/issues},
}

@Article{Li:2020:SLF,
  author =       "Qinbo Li and Nima Khademi Kalantari",
  title =        "Synthesizing light field from a single image with
                 variable {MPI} and two network fusion",
  journal =      j-TOG,
  volume =       "39",
  number =       "6",
  pages =        "229:1--229:10",
  month =        nov,
  year =         "2020",
  CODEN =        "ATGRDF",
  DOI =          "https://doi.org/10.1145/3414685.3417785",
  ISSN =         "0730-0301 (print), 1557-7368 (electronic)",
  ISSN-L =       "0730-0301",
  bibdate =      "Sun Mar 28 08:21:45 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tog.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3414685.3417785",
  abstract =     "We propose a learning-based approach to synthesize a
                 light field with a small baseline from a single image.
                 We synthesize the novel view images by first using a
                 convolutional neural network (CNN) to promote the input
                 image into a layered representation \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "229",
  fjournal =     "ACM Transactions on Graphics",
  journal-URL =  "https://dl.acm.org/loi/tog",
  remark =       "In this paper, MPI means multiplane image (a layered
                 image representation), not Message Passing Interface.",
}

@Article{Liang:2020:AMD,
  author =       {Liang, Jianguo and
                 Hua, Rong and
                 Zhang, Hao and
                 Zhu, Wenqiang and
                 Fu, You},
  title =        {Accelerated molecular dynamics simulation of Silicon
                 Crystals on {TaihuLight} using {OpenACC}},
  journal =      j-PARALLEL-COMPUTING,
  volume =       {99},
  number =       {??},
  pages =        {Article 102667},
  month =        nov,
  year =         {2020},
  CODEN =        {PACOEJ},
  DOI =          {https://doi.org/10.1016/j.parco.2020.102667},
  ISSN =         {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L =       {0167-8191},
  bibdate =      {Mon Mar 29 11:36:02 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167819120300600},
  acknowledgement = ack-nhfb,
  fjournal =     {Parallel Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Liao:2020:DCS,
  author =       {Liao, Xiaofei and
                 Zheng, Long and
                 Zhang, Binsheng and
                 Zhang, Yu and
                 Jin, Hai and
                 Shi, Xuanhua and
                 Lin, Yi},
  title =        {Dynamic cluster strategy for hierarchical
                 rollback-recovery protocols in {MPI} {HPC}
                 applications},
  journal =      j-CCPE,
  volume =       {32},
  number =       {3},
  pages =        {e4173:1--e4173:??},
  day =          {10},
  month =        feb,
  year =         {2020},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.4173},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Wed Mar 31 07:52:13 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {Concurr. Comput.},
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {24 May 2017},
}

@Article{Lin:2020:EAM,
  author =       {Lin, Bo and
                 Zhuang, Chijie and
                 Cai, Zhenning and
                 Zeng, Rong and
                 Bao, Weizhu},
  title =        {An efficient and accurate {MPI}-based parallel
                 simulator for streamer discharges in three dimensions},
  journal =      j-J-COMPUT-PHYS,
  volume =       {401},
  number =       {??},
  pages =        {Article 109026},
  day =          {15},
  month =        jan,
  year =         {2020},
  CODEN =        {JCTPAH},
  ISSN =         {0021-9991 (print), 1090-2716 (electronic)},
  ISSN-L =       {0021-9991},
  bibdate =      {Mon Mar 9 18:28:21 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jcomputphys2020.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0021999119307326},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Computational Physics},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00219991},
}

@Article{Lin:2020:GTD,
  author =       {Lin, Huanxin and
                 Wang, Cho-Li},
  title =        {On-{GPU} thread-data remapping for nested branch
                 divergence},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {139},
  number =       {??},
  pages =        {75--86},
  month =        may,
  year =         {2020},
  CODEN =        {JPDCER},
  ISSN =         {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L =       {0743-7315},
  bibdate =      {Wed Mar 18 09:26:12 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0743731518308967},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Losada:2020:FTM,
  author =       {Losada, Nuria and
                 Gonz{\'a}lez, Patricia and
                 Mart{\'{\i}}n, Mar{\'{\i}}a J. and
                 Bosilca, George and
                 Bouteiller, Aur{\'e}lien and
                 Teranishi, Keita},
  title =        {Fault tolerance of {MPI} applications in exascale
                 systems: the {ULFM} solution},
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       {106},
  number =       {??},
  pages =        {467--481},
  month =        may,
  year =         {2020},
  CODEN =        {FGSEVI},
  DOI =          {https://doi.org/10.1016/j.future.2020.01.026},
  ISSN =         {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L =       {0167-739X},
  bibdate =      {Fri Jun 19 07:44:13 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167739X1930860X},
  acknowledgement = ack-nhfb,
  fjournal =     {Future Generation Computer Systems},
  journal-URL =  {http://www.sciencedirect.com/science/journal/0167739X},
}

@Article{Lu:2020:GQO,
  author =       {Lu, Q. and
                 Yao, J. and
                 Guan, H. and
                 Gao, P.},
  title =        {{gQoS}: a {QoS}-Oriented {GPU} Virtualization with
                 Adaptive Capacity Sharing},
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       {31},
  number =       {4},
  pages =        {843--855},
  month =        apr,
  year =         {2020},
  CODEN =        {ITDSEO},
  DOI =          {https://doi.org/10.1109/TPDS.2019.2948753},
  ISSN =         {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L =       {1045-9219},
  bibdate =      {Wed Jan 22 06:09:50 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Transactions on Parallel and Distributed
                 Systems},
  journal-URL =  {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71},
  keywords =     {Cloud computing; cloud computing; GPU virtualization;
                 Graphics processing units; Hardware; QoS control;
                 Quality of service; Resource management; resource
                 scheduling; Virtual machining; Virtualization},
}

@Article{Mantas:2020:HOC,
  author =       {Mantas, Jos{\'e} M. and
                 Vecil, Francesco},
  title =        {Hybrid {OpenMP--CUDA} parallel implementation of a
                 deterministic solver for ultrashort {DG-MOSFETs}},
  journal =      j-IJHPCA,
  volume =       {34},
  number =       {1},
  pages =        {81--102},
  day =          {1},
  month =        jan,
  year =         {2020},
  CODEN =        {IHPCFL},
  DOI =          {https://doi.org/10.1177/1094342019879985},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Sat Jul 25 09:38:31 MDT 2020},
  bibsource =    {http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {https://journals.sagepub.com/doi/full/10.1177/1094342019879985},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
}

@Article{Mena:2020:GAS,
  author =       {Mena, Hermann and
                 Pfurtscheller, Lena-Maria and
                 Stillfjord, Tony},
  title =        {{GPU} acceleration of splitting schemes applied to
                 differential matrix equations},
  journal =      j-NUMER-ALGORITHMS,
  volume =       {83},
  number =       {1},
  pages =        {395--419},
  month =        jan,
  year =         {2020},
  CODEN =        {NUALEG},
  DOI =          {https://doi.org/10.1007/s11075-019-00687-w},
  ISSN =         {1017-1398 (print), 1572-9265 (electronic)},
  ISSN-L =       {1017-1398},
  bibdate =      {Wed Jan 22 08:40:22 MST 2020},
  bibsource =    {http://link.springer.com/journal/11075/83/1;
                 http://www.math.utah.edu/pub/tex/bib/numeralgorithms.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://link.springer.com/content/pdf/10.1007/s11075-019-00687-w.pdf},
  acknowledgement = ack-nhfb,
  fjournal =     {Numerical Algorithms},
  journal-URL =  {http://link.springer.com/journal/11075},
}

@Article{Mofrad:2020:GNA,
  author =       "Mohammad Hasanzadeh Mofrad and Rami Melhem and Yousuf
                 Ahmad and Mohammad Hammoud",
  title =        "{Graphite}: a {NUMA}-aware {HPC} system for graph
                 analytics based on a new {MPI * X} parallelism model",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "6",
  pages =        "783--797",
  month =        feb,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3380750.3380751",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Apr 2 10:51:28 MDT 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/abs/10.14778/3380750.3380751",
  abstract =     "In this paper, we propose a new parallelism model
                 denoted as MPI * X and suggest a linear algebra-based
                 graph analytics system, namely, Graphite, which
                 effectively employs it. MPI * X promotes thread-based
                 partitioning to distribute computation and \ldots{}",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Mu:2020:OOB,
  author =       {Mu, Jiandong and
                 Zhang, Wei and
                 Liang, Hao and
                 Sinha, Sharad},
  title =        {Optimizing {OpenCL}-Based {CNN} Design on {FPGA} with
                 Comprehensive Design Space Exploration and
                 Collaborative Performance Modeling},
  journal =      j-TRETS,
  volume =       {13},
  number =       {3},
  pages =        {13:1--13:28},
  month =        sep,
  year =         {2020},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3397514},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Sep 5 18:51:36 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3397514},
  abstract =     {Recent success in applying convolutional neural
                 networks (CNNs) to object detection and classification
                 has sparked great interest in accelerating CNNs using
                 hardware-like field-programmable gate arrays (FPGAs).
                 However, finding an efficient FPGA design \ldots{}},
  acknowledgement = ack-nhfb,
  articleno =    {13},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}

@Article{Nandal:2020:NSG,
  author =       {Nandal, P. and
                 Sharma, R. P.},
  title =        {Numerical simulation on {GPUs} with {CUDA} to study
                 nonlinear dynamics of whistler wave and its turbulent
                 spectrum in radiation belts},
  journal =      j-COMP-PHYS-COMM,
  volume =       {254},
  number =       {??},
  pages =        {Article 107214},
  month =        sep,
  year =         {2020},
  CODEN =        {CPHCBZ},
  DOI =          {https://doi.org/10.1016/j.cpc.2020.107214},
  ISSN =         {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L =       {0010-4655},
  bibdate =      {Fri Jun 19 07:19:49 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0010465520300497},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Naranjo:2020:ASC,
  author =       {Naranjo, Diana M. and
                 Risco, Sebasti{\'a}n and
                 de Alfonso, Carlos and
                 P{\'e}rez, Alfonso and
                 Blanquer, Ignacio and
                 Molt{\'o}, Germ{\'a}n},
  title =        {Accelerated serverless computing based on {GPU}
                 virtualization},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {139},
  number =       {??},
  pages =        {32--42},
  month =        may,
  year =         {2020},
  CODEN =        {JPDCER},
  DOI =          {https://doi.org/10.1016/j.jpdc.2020.01.004},
  ISSN =         {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L =       {0743-7315},
  bibdate =      {Wed Mar 18 09:26:12 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0743731519303533},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Petrovic:2020:BSH,
  author =       {Petrovi{\v{c}}, Filip and
                 St{\v{r}}el{\'a}k, David and
                 Hozzov{\'a}, Jana and
                 Ol'ha, Jaroslav and
                 Trembeck{\'y}, Richard and
                 Benkner, Siegfried and
                 Filipovi{\v{c}}, Ji{\v{r}}{\'{\i}}},
  title =        {A benchmark set of highly-efficient {CUDA} and
                 {OpenCL} kernels and its dynamic autotuning with
                 {Kernel Tuning Toolkit}},
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       {108},
  number =       {??},
  pages =        {161--177},
  month =        jul,
  year =         {2020},
  CODEN =        {FGSEVI},
  DOI =          {https://doi.org/10.1016/j.future.2020.02.069},
  ISSN =         {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L =       {0167-739X},
  bibdate =      {Fri Jun 19 07:44:16 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0167739X19327360},
  acknowledgement = ack-nhfb,
  fjournal =     {Future Generation Computer Systems},
  journal-URL =  {http://www.sciencedirect.com/science/journal/0167739X},
}

@Article{Prades:2020:MRU,
  author =       {Prades, Javier and
                 Imbern{\'o}n, Baldomero and
                 Rea{\~n}o, Carlos and
                 Pe{\~n}a-Garc{\'\i}a, Jorge and
                 Cer{\'o}n-Carrasco, Jose Pedro and
                 Silla, Federico and
                 P{\'e}rez-S{\'a}nchez, Horacio},
  title =        {Maximizing resource usage in multifold molecular
                 dynamics with {rCUDA}},
  journal =      j-IJHPCA,
  volume =       {34},
  number =       {1},
  pages =        {5--19},
  day =          {1},
  month =        jan,
  year =         {2020},
  CODEN =        {IHPCFL},
  DOI =          {https://doi.org/10.1177/1094342019857131},
  ISSN =         {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L =       {1094-3420},
  bibdate =      {Sat Jul 25 09:38:31 MDT 2020},
  bibsource =    {http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {https://journals.sagepub.com/doi/full/10.1177/1094342019857131},
  acknowledgement = ack-nhfb,
  fjournal =     {International Journal of High Performance Computing
                 Applications},
}

@Article{Rasch:2020:DHL,
  author =       {Rasch, Ari and
                 Bigge, Julian and
                 Wrodarczyk, Martin and
                 Schulze, Richard and
                 Gorlatch, Sergei},
  title =        {{dOCAL}: high-level distributed programming with
                 {OpenCL} and {CUDA}},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {76},
  number =       {7},
  pages =        {5117--5138},
  month =        jul,
  year =         {2020},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-019-02829-2},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Sat Jul 25 07:17:59 MDT 2020},
  bibsource =    {http://link.springer.com/journal/11227/76/7;
                 http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
}

@Article{Reis:2020:CMC,
  author =       {Reis, Lu{\'\i}s and
                 Bispo, Jo{\~a}o and
                 Cardoso, Jo{\~a}o M. P.},
  title =        {Compilation of {MATLAB} computations to
                 {CPU\slash GPU} via {C\slash OpenCL} generation},
  journal =      j-CCPE,
  volume =       {32},
  number =       {22},
  pages =        {e5854:1--e5854:??},
  day =          {25},
  month =        nov,
  year =         {2020},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.5854},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Wed Mar 31 07:52:22 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/matlab.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {Concurr. Comput.},
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {01 June 2020},
}

@Article{Renaud:2020:IMS,
  author =       {Renaud, Nicolas and
                 Jung, Yong and
                 Honavar, Vasant and
                 Geng, Cunliang and
                 Bonvin, Alexandre M. J. J. and
                 Xue, Li C.},
  title =        {\pkg{iScore}: an {MPI} supported software for ranking
                 protein-protein docking models based on a random walk
                 graph kernel and support vector machines},
  journal =      j-SOFTWAREX,
  volume =       {11},
  number =       {??},
  pages =        {Article 100462},
  month =        jan # "\slash " # jun,
  year =         {2020},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1016/j.softx.2020.100462},
  ISSN =         {2352-7110},
  ISSN-L =       {2352-7110},
  bibdate =      {Fri Apr 9 16:04:39 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/softwarex.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S2352711019303061},
  acknowledgement = ack-nhfb,
  fjournal =     {SoftwareX},
  journal-URL =  {https://www.sciencedirect.com/journal/softwarex/issues},
}

@Article{Russek:2020:SLC,
  author =       "Pawe{\l} Russek and Ernest Jamro and Agnieszka
                 Dabrowska-Boruch and Kazimierz Wiatr",
  title =        "A study of the loops control for reconfigurable
                 computing with {OpenCL} in the {LABS} local search
                 problem",
  journal =      j-IJHPCA,
  volume =       "34",
  number =       "1",
  pages =        "103--114",
  day =          "1",
  month =        jan,
  year =         "2020",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342019868515",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Sat Jul 25 09:38:31 MDT 2020",
  bibsource =    "http://hpc.sagepub.com/;
                 http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://journals.sagepub.com/doi/full/10.1177/1094342019868515",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
}

@Article{Salinas:2020:FEI,
  author =       {Salinas, {\'A}lvaro and
                 Torres, Claudio and
                 Ayala, Orlando},
  title =        {A fast and efficient integration of boundary
                 conditions into a unified {CUDA} Kernel for a shallow
                 water solver lattice {Boltzmann} Method},
  journal =      j-COMP-PHYS-COMM,
  volume =       {249},
  number =       {??},
  pages =        {Article 107009},
  month =        apr,
  year =         {2020},
  CODEN =        {CPHCBZ},
  ISSN =         {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L =       {0010-4655},
  bibdate =      {Mon Mar 2 13:57:36 MST 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0010465519303443},
  acknowledgement = ack-nhfb,
  fjournal =     {Computer Physics Communications},
  journal-URL =  {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Samfass:2020:LTO,
  author =       {Samfass, Philipp and
                 Weinzierl, Tobias and
                 Charrier, Dominic E. and
                 Bader, Michael},
  title =        {Lightweight task offloading exploiting {MPI} wait
                 times for parallel adaptive mesh refinement},
  journal =      j-CCPE,
  volume =       {32},
  number =       {24},
  pages =        {e5916:1--e5916:??},
  day =          {25},
  month =        dec,
  year =         {2020},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.5916},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Wed Mar 31 07:52:23 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {Concurr. Comput.},
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {09 July 2020},
}

@Article{Shekofteh:2020:CEC,
  author =       "S.-Kazem Shekofteh and Hamid Noori and Mahmoud
                 Naghibzadeh and Holger Fr{\"o}ning and Hadi Sadoghi
                 Yazdi",
  title =        "{cCUDA}: Effective Co-Scheduling of Concurrent Kernels
                 on {GPUs}",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "4",
  pages =        "766--778",
  month =        apr,
  year =         "2020",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2944602",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Jan 22 06:09:50 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
  keywords =     "Analytical models; Benchmark testing; concurrent
                 kernel execution; Graphics processing units; Hardware;
                 Kernel; resource management; Scheduling; scheduling;
                 stream",
}

@Article{Shen:2020:GPC,
  author =       {Shen, Qi and
                 Sharp, Craig and
                 Davison, Richard and
                 Ushaw, Gary and
                 Ranjan, Rajiv and
                 Zomaya, Albert Y. and
                 Morgan, Graham},
  title =        {A general purpose contention manager for software
                 transactions on the {GPU}},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {139},
  number =       {??},
  pages =        {1--17},
  month =        may,
  year =         {2020},
  CODEN =        {JPDCER},
  DOI =          {https://doi.org/10.1016/j.jpdc.2019.12.018},
  ISSN =         {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L =       {0743-7315},
  bibdate =      {Wed Mar 18 09:26:12 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0743731519301376},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Silla:2020:IPP,
  author =       {Silla, Federico and
                 Prades, Javier and
                 Baydal, Elvira and
                 Rea{\~n}o, Carlos},
  title =        {Improving the performance of physics applications in
                 atom-based clusters with {rCUDA}},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {137},
  number =       {??},
  pages =        {160--178},
  month =        mar,
  year =         {2020},
  CODEN =        {JPDCER},
  ISSN =         {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L =       {0743-7315},
  bibdate =      {Wed Mar 18 09:26:11 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0743731519304034},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Skjellum:2020:FSI,
  author =       {Skjellum, Anthony and
                 Bangalore, Purushotham V. and
                 Grant, Ryan E.},
  title =        {Foreword to the Special Issue of the {Workshop on
                 Exascale MPI (ExaMPI 2017)}},
  journal =      j-CCPE,
  volume =       {32},
  number =       {3},
  pages =        {e5459:1--e5459:??},
  day =          {10},
  month =        feb,
  year =         {2020},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.5459},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Wed Mar 31 07:52:13 MDT 2021},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {Concurr. Comput.},
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {18 July 2019},
}

@Article{Spiliotis:2020:PII,
  author =       {Spiliotis, Iraklis M. and
                 Bekakos, Michael P. and
                 Boutalis, Yiannis S.},
  title =        {Parallel implementation of the {Image Block
                 Representation} using {OpenMP}},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {137},
  number =       {??},
  pages =        {134--147},
  month =        mar,
  year =         {2020},
  CODEN =        {JPDCER},
  ISSN =         {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L =       {0743-7315},
  bibdate =      {Wed Mar 18 09:26:11 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0743731519307622},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Stpiczynski:2020:ALB,
  author =       {Stpiczy{\'n}ski, Przemys{\l}aw},
  title =        {Algorithmic and language-based optimization of
                 {Marsa-LFIB4} pseudorandom number generator using
                 {OpenMP}, {OpenACC} and {CUDA}},
  journal =      j-J-PAR-DIST-COMP,
  volume =       {137},
  number =       {??},
  pages =        {238--245},
  month =        mar,
  year =         {2020},
  CODEN =        {JPDCER},
  DOI =          {https://doi.org/10.1016/j.jpdc.2019.12.004},
  ISSN =         {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L =       {0743-7315},
  bibdate =      {Wed Mar 18 09:26:11 MDT 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL =          {http://www.sciencedirect.com/science/article/pii/S0743731519304885},
  acknowledgement = ack-nhfb,
  fjournal =     {Journal of Parallel and Distributed Computing},
  journal-URL =  {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Sun:2020:RTS,
  author = {J. Sun and N. Guan and F. Li and H. Gao and C. Shi and
    W. Yi},
  title = {Real-Time Scheduling and Analysis of {OpenMP} {DAG}
    Tasks Supporting Nested Parallelism},
  journal = j-IEEE-TRANS-COMPUT,
  volume = {69},
  number = {9},
  pages = {1335--1348},
  month = sep,
  year = {2020},
  CODEN = {ITCOB4},
  ISSN = {0018-9340 (print), 1557-9956 (electronic)},
  ISSN-L = {0018-9340},
  bibdate = {Wed Aug 12 14:58:16 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Computers},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
}

@Article{Tiotto:2020:OCO,
  author = {E. Tiotto and B. Mahjour and W. Tsang and X. Xue and
    T. Islam and W. Chen},
  title = {{OpenMP 4.5} compiler optimization for {GPU}
    offloading},
  journal = j-IBM-JRD,
  volume = {64},
  number = {3/4},
  pages = {14:1--14:11},
  month = may # "\slash " # jul,
  year = {2020},
  CODEN = {IBMJAE},
  DOI = {https://doi.org/10.1147/JRD.2019.2962428},
  ISSN = {0018-8646 (print), 2151-8556 (electronic)},
  ISSN-L = {0018-8646},
  bibdate = {Wed Jun 3 18:35:26 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ibmjrd.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/super.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IBM Journal of Research and Development},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520},
}

@Article{Traff:2020:SIS,
  author = {Jesper Larsson Tr{\"a}ff and Torsten Hoefler},
  title = {Special issue: Selected papers from {EuroMPI 2019}},
  journal = j-PARALLEL-COMPUTING,
  volume = {99},
  number = {??},
  pages = {Article 102695},
  month = nov,
  year = {2020},
  CODEN = {PACOEJ},
  DOI = {https://doi.org/10.1016/j.parco.2020.102695},
  ISSN = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L = {0167-8191},
  bibdate = {Mon Mar 29 11:36:02 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0167819120300855},
  acknowledgement = ack-nhfb,
  fjournal = {Parallel Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Tsiolakis:2020:NPG,
  author = {Vasileios Tsiolakis and Matteo Giacomini and Ruben
    Sevilla and Carsten Othmer and Antonio Huerta},
  title = {Nonintrusive proper generalised decomposition for
    parametrised incompressible flow problems in
    {OpenFOAM}},
  journal = j-COMP-PHYS-COMM,
  volume = {249},
  number = {??},
  pages = {Article 107013},
  month = apr,
  year = {2020},
  CODEN = {CPHCBZ},
  ISSN = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L = {0010-4655},
  bibdate = {Mon Mar 2 13:57:36 MST 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0010465519303479},
  acknowledgement = ack-nhfb,
  fjournal = {Computer Physics Communications},
  journal-URL = {http://www.sciencedirect.com/science/journal/00104655},
}

@Article{Turchetto:2020:GDS,
  author =       "M. Turchetto and Dal Pal{\`u}, A. and R. Vacondio",
  title =        "A General Design for a Scalable {MPI-GPU}
                 Multi-Resolution {2D} Numerical Solver",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "5",
  pages =        "1036--1047",
  month =        may,
  year =         "2020",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2961909",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Feb 20 10:08:58 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
  keywords =     "AMR; CUDA; dynamic load balancing; Hilbert space
                 filling curves; MPI; multi-GPU; multi-resolution grid;
                 shallow water equations (SWE)",
}

@Article{Valero-Lara:2020:SFA,
  author = {Pedro Valero-Lara and Sandra Catal{\'a}n and Xavier
    Martorell and Tetsuzo Usui and Jes{\'u}s Labarta},
  title = {{sLASs}: a fully automatic auto-tuned linear algebra
    library based on {OpenMP} extensions implemented in
    {OmpSs} ({LASs} Library)},
  journal = j-J-PAR-DIST-COMP,
  volume = {138},
  number = {??},
  pages = {153--171},
  month = apr,
  year = {2020},
  CODEN = {JPDCER},
  ISSN = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L = {0743-7315},
  bibdate = {Wed Mar 18 09:26:11 MDT 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0743731519303417},
  acknowledgement = ack-nhfb,
  fjournal = {Journal of Parallel and Distributed Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Wang:2020:EPE,
  author = {X. Wang and X. Qian and A. Knoll and K. Huang},
  title = {Efficient Performance Estimation and Work-Group Size
    Pruning for {OpenCL} Kernels on {GPUs}},
  journal = j-IEEE-TRANS-PAR-DIST-SYS,
  volume = {31},
  number = {5},
  pages = {1089--1106},
  month = may,
  year = {2020},
  CODEN = {ITDSEO},
  DOI = {https://doi.org/10.1109/TPDS.2019.2958343},
  ISSN = {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L = {1045-9219},
  bibdate = {Thu Feb 20 10:08:58 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Parallel and Distributed
    Systems},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71},
  keywords = {Analytical models; Estimation; GPU; Graphics
    processing units; Hardware; Kernel; Measurement;
    OpenCL; performance estimation; performance tuning;
    Runtime; work-group size},
}

@Article{Weng:2020:CMS,
  author = {Tien-Hsiung Weng and Kuan-Ching Li and Zhiliu Yang and
    Chen Liu},
  title = {On the code modernization of shared sampling alpha
    matting with {OpenMP}},
  journal = j-FUT-GEN-COMP-SYS,
  volume = {107},
  number = {??},
  pages = {177--191},
  month = jun,
  year = {2020},
  CODEN = {FGSEVI},
  DOI = {https://doi.org/10.1016/j.future.2019.12.012},
  ISSN = {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L = {0167-739X},
  bibdate = {Fri Jun 19 07:44:14 MDT 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0167739X19314116},
  acknowledgement = ack-nhfb,
  fjournal = {Future Generation Computer Systems},
  journal-URL = {http://www.sciencedirect.com/science/journal/0167739X},
}

@Article{White:2020:OPP,
  author = {Sam White and Laxmikant V. Kale},
  title = {Optimizing point-to-point communication between
    adaptive {MPI} endpoints in shared memory},
  journal = j-CCPE,
  volume = {32},
  number = {3},
  pages = {e4467:1--e4467:??},
  day = {10},
  month = feb,
  year = {2020},
  CODEN = {CCPEBO},
  DOI = {https://doi.org/10.1002/cpe.4467},
  ISSN = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L = {1532-0626},
  bibdate = {Wed Mar 31 07:52:13 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal = {Concurr. Comput.},
  fjournal = {Concurrency and Computation: Practice and Experience},
  journal-URL = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate = {12 March 2018},
}

@Article{Yu:2020:EPW,
  author =       "C. Yu and S. Tsao",
  title =        "Efficient and Portable Workgroup Size Tuning",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "2",
  pages =        "455--469",
  month =        feb,
  year =         "2020",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2937295",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Jan 22 06:09:50 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
  keywords =     "automatic performance tuning; Computational modeling;
                 Graphics processing units; Hardware; Indexes; Kernel;
                 microbenchmarking; OpenCL; Performance evaluation;
                 Tuning; workgroup size selection",
}

@Article{Zarebavani:2020:CCB,
  author =       "B. Zarebavani and F. Jafarinejad and M. Hashemi and S.
                 Salehkaleybar",
  title =        "{cuPC}: {CUDA}-Based Parallel {PC} Algorithm for
                 Causal Structure Learning on {GPU}",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "3",
  pages =        "530--542",
  month =        mar,
  year =         "2020",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2939126",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Jan 22 06:09:50 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
  keywords =     "Bayes methods; Bayesian networks; causal discovery;
                 CUDA; GPU; Graphical models; Graphics processing units;
                 machine learning; Markov processes; Parallel
                 algorithms; parallel processing; PC algorithm;
                 Scalability",
}

@Article{Zhang:2020:CTE,
  author = {T. Zhang and X. Liu and X. Wang and A. Walid},
  title = {{cuTensor-Tubal}: Efficient Primitives for Tubal-Rank
    Tensor Learning Operations on {GPUs}},
  journal = j-IEEE-TRANS-PAR-DIST-SYS,
  volume = {31},
  number = {3},
  pages = {595--610},
  month = mar,
  year = {2020},
  CODEN = {ITDSEO},
  DOI = {https://doi.org/10.1109/TPDS.2019.2940192},
  ISSN = {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L = {1045-9219},
  bibdate = {Wed Jan 22 06:09:50 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Parallel and Distributed
    Systems},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71},
  keywords = {Computational modeling; Computer architecture;
    cuTensor-tubal library; Frequency-domain analysis; GPU;
    Graphics processing units; Libraries; Low-tubal-rank
    tensor decomposition; Matrix decomposition; t-SVD;
    tensor completion},
}

@Article{Zhou:2020:CHM,
  author = {Huan Zhou and Jos{\'e} Gracia and Naweiluo Zhou and
    Ralf Schneider},
  title = {Collectives in hybrid {MPI+MPI} code: Design, practice
    and performance},
  journal = j-PARALLEL-COMPUTING,
  volume = {99},
  number = {??},
  pages = {Article 102669},
  month = nov,
  year = {2020},
  CODEN = {PACOEJ},
  DOI = {https://doi.org/10.1016/j.parco.2020.102669},
  ISSN = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L = {0167-8191},
  bibdate = {Mon Mar 29 11:36:02 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0167819120300612},
  acknowledgement = ack-nhfb,
  fjournal = {Parallel Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Zhou:2020:EOP,
  author = {Hongyang Zhou and G{\'a}bor T{\'o}th},
  title = {Efficient {OpenMP} parallelization to a complex {MPI}
    parallel magnetohydrodynamics code},
  journal = j-J-PAR-DIST-COMP,
  volume = {139},
  number = {??},
  pages = {65--74},
  month = may,
  year = {2020},
  CODEN = {JPDCER},
  ISSN = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L = {0743-7315},
  bibdate = {Wed Mar 18 09:26:12 MDT 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0743731519304903},
  acknowledgement = ack-nhfb,
  fjournal = {Journal of Parallel and Distributed Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Aldinucci:2021:PPS,
  author = {Marco Aldinucci and Valentina Cesare and Iacopo
    Colonnelli and Alberto Riccardo Martinelli and Gianluca
    Mittone and Barbara Cantalupo and Carlo Cavazzoni and
    Maurizio Drocco},
  title = {Practical parallelization of scientific applications
    with {OpenMP}, {OpenACC} and {MPI}},
  journal = j-J-PAR-DIST-COMP,
  volume = {157},
  number = {??},
  pages = {13--29},
  month = nov,
  year = {2021},
  CODEN = {JPDCER},
  DOI = {https://doi.org/10.1016/j.jpdc.2021.05.017},
  ISSN = {0743-7315 (print), 1096-0848 (electronic)},
  ISSN-L = {0743-7315},
  bibdate = {Thu Feb 10 06:39:21 MST 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0743731521001295},
  acknowledgement = ack-nhfb,
  fjournal = {Journal of Parallel and Distributed Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/07437315},
}

@Article{Andoh:2021:AMM,
  author = {Yoshimichi Andoh and Shin-ichi Ichikawa and Tatsuya
    Sakashita and Noriyuki Yoshii and Susumu Okazaki},
  title = {Algorithm to minimize {MPI} communications in the
    parallelized fast multipole method combined with
    molecular dynamics calculations},
  journal = j-J-COMPUT-CHEM,
  volume = {42},
  number = {15},
  pages = {1073--1087},
  day = {5},
  month = jun,
  year = {2021},
  CODEN = {JCCHDD},
  DOI = {https://doi.org/10.1002/jcc.26524},
  ISSN = {0192-8651 (print), 1096-987X (electronic)},
  ISSN-L = {0192-8651},
  bibdate = {Mon May 17 16:26:14 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib;
    http://www.math.utah.edu/pub/tex/bib/jcomputchem2020.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal = {J. Comput. Chem.},
  fjournal = {Journal of Computational Chemistry},
  journal-URL = {http://www.interscience.wiley.com/jpages/0192-8651},
  onlinedate = {29 March 2021},
}

@Article{Antonelli:2021:CBI,
  author = {L. Antonelli and E. Francomano and F. Gregoretti},
  title = {A {CUDA}-based implementation of an improved {SPH}
    method on {GPU}},
  journal = j-APPL-MATH-COMP,
  volume = {409},
  number = {??},
  pages = {Article 125482},
  day = {15},
  month = nov,
  year = {2021},
  CODEN = {AMHCBQ},
  DOI = {https://doi.org/10.1016/j.amc.2020.125482},
  ISSN = {0096-3003 (print), 1873-5649 (electronic)},
  ISSN-L = {0096-3003},
  bibdate = {Mon Jan 31 07:58:57 MST 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/applmathcomput2020.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0096300320304410},
  acknowledgement = ack-nhfb,
  fjournal = {Applied Mathematics and Computation},
  journal-URL = {http://www.sciencedirect.com/science/journal/00963003},
}

@Article{Betcke:2021:DHP,
  author = {Timo Betcke and Matthew W. Scroggs},
  title = {Designing a High-Performance Boundary Element Library
    With {OpenCL} and {Numba}},
  journal = j-COMPUT-SCI-ENG,
  volume = {23},
  number = {4},
  pages = {18--28},
  month = jul # "\slash " # aug,
  year = {2021},
  CODEN = {CSENFA},
  DOI = {https://doi.org/10.1109/MCSE.2021.3085420},
  ISSN = {1521-9615 (print), 1558-366X (electronic)},
  ISSN-L = {1521-9615},
  bibdate = {Thu Jul 29 07:00:57 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Computing in Science and Engineering},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992},
}

@Article{Cesarini:2021:CRT,
  author =       "D. Cesarini and A. Bartolini and P. Bonf{\`a} and C.
                 Cavazzoni and L. Benini",
  title =        "{COUNTDOWN}: a Run-Time Library for
                 Performance-Neutral Energy Saving in {MPI}
                 Applications",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "70",
  number =       "5",
  pages =        "682--695",
  month =        may,
  year =         "2021",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2020.2995269",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Apr 8 06:29:24 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Chapp:2021:IDS,
  author = {Dylan Chapp and Nigel Tan and Sanjukta Bhowmick and
    Michela Taufer},
  title = {Identifying Degree and Sources of Non-Determinism in
    {MPI} Applications Via Graph Kernels},
  journal = j-IEEE-TRANS-PAR-DIST-SYS,
  volume = {32},
  number = {12},
  pages = {2936--2952},
  month = dec,
  year = {2021},
  CODEN = {ITDSEO},
  DOI = {https://doi.org/10.1109/TPDS.2021.3081530},
  ISSN = {1045-9219 (print), 1558-2183 (electronic)},
  ISSN-L = {1045-9219},
  bibdate = {Fri Jun 4 09:55:50 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Parallel and Distributed
    Systems},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71},
}

@Article{Chen:2021:CCR,
  author = {Genlang Chen and Jiajian Zhang and Chaoyi Pang},
  title = {{CRState}: checkpoint/restart of {OpenCL} program for
    in-kernel applications},
  journal = j-J-SUPERCOMPUTING,
  volume = {77},
  number = {6},
  pages = {5426--5467},
  month = jun,
  year = {2021},
  CODEN = {JOSUED},
  DOI = {https://doi.org/10.1007/s11227-020-03460-2},
  ISSN = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L = {0920-8542},
  bibdate = {Fri May 14 09:20:01 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  URL = {https://link.springer.com/article/10.1007/s11227-020-03460-2},
  acknowledgement = ack-nhfb,
  fjournal = {The Journal of Supercomputing},
  journal-URL = {http://link.springer.com/journal/11227},
  online-date = {Published: 06 November 2020 Pages: 5426 - 5467},
}

@Article{Dalcin:2021:MSU,
  author = {Lisandro Dalcin and Yao-Lung L. Fang},
  title = {{mpi4py}: Status Update After 12 Years of
    Development},
  journal = j-COMPUT-SCI-ENG,
  volume = {23},
  number = {4},
  pages = {47--54},
  month = jul # "\slash " # aug,
  year = {2021},
  CODEN = {CSENFA},
  DOI = {https://doi.org/10.1109/MCSE.2021.3083216},
  ISSN = {1521-9615 (print), 1558-366X (electronic)},
  ISSN-L = {1521-9615},
  bibdate = {Thu Jul 29 07:00:57 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/python.bib},
  acknowledgement = ack-nhfb,
  fjournal = {Computing in Science and Engineering},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992},
}

@Article{Dosanjh:2021:IEM,
  author =       "Matthew G. F. Dosanjh and Andrew Worley and Derek
                 Schafer and Prema Soundararajan and Sheikh Ghafoor and
                 Anthony Skjellum and Purushotham V. Bangalore and Ryan
                 E. Grant",
  title =        "Implementation and evaluation of {MPI 4.0} partitioned
                 communication libraries",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "108",
  number =       "??",
  pages =        "Article 102827",
  month =        dec,
  year =         "2021",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2021.102827",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Feb 18 10:07:17 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819121000752",
  acknowledgement = ack-nhfb,
  articleno =    "102827",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Ferreira:2021:EMR,
  author =       "Kurt B. Ferreira and Scott Levy",
  title =        "Evaluating {MPI} resource usage summary statistics",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "108",
  number =       "??",
  pages =        "Article 102825",
  month =        dec,
  year =         "2021",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2021.102825",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Feb 18 10:07:17 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819121000739",
  acknowledgement = ack-nhfb,
  articleno =    "102825",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Gong:2021:TDG,
  author = {Dunwei Gong and Baicai Sun and Xiangjuan Yao and Tian
    Tian},
  title = {Test Data Generation for Path Coverage of {MPI}
    Programs Using {SAEO}},
  journal = j-TOSEM,
  volume = {30},
  number = {2},
  pages = {17:1--17:37},
  month = mar,
  year = {2021},
  CODEN = {ATSMER},
  DOI = {https://doi.org/10.1145/3423132},
  ISSN = {1049-331X (print), 1557-7392 (electronic)},
  ISSN-L = {1049-331X},
  bibdate = {Thu Mar 18 06:18:01 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/tosem.bib},
  URL = {https://dl.acm.org/doi/10.1145/3423132},
  abstract = {Message-passing interface (MPI) programs, a typical
    kind of parallel programs, have been commonly used in
    various applications. However, it generally takes
    exhaustive computation to run these programs when
    generating test data to test them. In this \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {17},
  fjournal = {ACM Transactions on Software Engineering and
    Methodology},
  journal-URL = {https://dl.acm.org/loi/tosem},
}

@Article{Hahne:2021:APP,
  author = {Jens Hahne and Stephanie Friedhoff and Matthias
    Bolten},
  title = {{Algorithm 1016}: {PyMGRIT}: a {Python} Package for
    the Parallel-in-time Method {MGRIT}},
  journal = j-TOMS,
  volume = {47},
  number = {2},
  pages = {19:1--19:22},
  month = apr,
  year = {2021},
  CODEN = {ACMSCU},
  DOI = {https://doi.org/10.1145/3446979},
  ISSN = {0098-3500 (print), 1557-7295 (electronic)},
  ISSN-L = {0098-3500},
  bibdate = {Tue Apr 27 08:23:28 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/python.bib;
    http://www.math.utah.edu/pub/tex/bib/toms.bib},
  URL = {https://dl.acm.org/doi/10.1145/3446979},
  abstract = {In this article, we introduce the Python framework
    PyMGRIT, which implements the
    multigrid-reduction-in-time (MGRIT) algorithm for
    solving (non-)linear systems arising from the
    discretization of time-dependent problems. The MGRIT
    algorithm is a reduction-based iterative method that
    allows parallel-in-time simulations, i.e., calculating
    multiple time steps simultaneously in a simulation,
    using a time-grid hierarchy. The PyMGRIT framework
    includes many different variants of the MGRIT
    algorithm, ranging from different multigrid cycle types
    and relaxation schemes, various coarsening strategies,
    including time-only and space-time coarsening, and the
    ability to utilize different time integrators on
    different levels in the multigrid hierarchy. The
    comprehensive documentation with tutorials and many
    examples and the fully documented code allow an easy
    start into the work with the package. The functionality
    of the code is ensured by automated serial and parallel
    tests using continuous integration. PyMGRIT supports
    serial runs suitable for prototyping and testing of new
    approaches, as well as parallel runs using the Message
    Passing Interface (MPI). In this manuscript, we
    describe the implementation of the MGRIT algorithm in
    PyMGRIT and present the usage from both a user and a
    developer point of view. Three examples illustrate
    different aspects of the package itself, especially
    running tests with pure time parallelism, as well as
    space-time parallelism through the coupling of PyMGRIT
    with PETSc or Firedrake.},
  acknowledgement = ack-nhfb,
  articleno = {19},
  fjournal = {ACM Transactions on Mathematical Software (TOMS)},
  journal-URL = {https://dl.acm.org/loi/toms},
}

@Article{Halbiniak:2021:EOH,
  author =       "Kamil Halbiniak and Lukasz Szustak and Tomasz Olas and
                 Roman Wyrzykowski and Pawel Gepner",
  title =        "Exploration of {OpenCL} Heterogeneous Programming for
                 Porting Solidification Modeling to {CPU-GPU}
                 Platforms",
  journal =      j-CCPE,
  volume =       "33",
  number =       "4",
  pages =        "e6011:1--e6011:??",
  day =          "25",
  month =        feb,
  year =         "2021",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.6011",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue May 18 08:31:21 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Concurr. Comput.",
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "09 October 2020",
}

@Article{Ho:2021:GFD,
  author = {Nhut-Minh Ho and Himeshi {De Silva} and Weng-Fai
    Wong},
  title = {{GRAM}: a Framework for Dynamically Mixing Precisions
    in {GPU} Applications},
  journal = j-TACO,
  volume = {18},
  number = {2},
  pages = {19:1--19:24},
  month = mar,
  year = {2021},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3441830},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Sat Mar 20 17:25:10 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/fparith.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL = {https://dl.acm.org/doi/10.1145/3441830},
  abstract = {This article presents GRAM (GPU-based Runtime Adaption
    for Mixed-precision) a framework for the effective use
    of mixed precision arithmetic for CUDA programs. Our
    method provides a fine-grain tradeoff between output
    error and performance. It can create many variants that
    satisfy different accuracy requirements by assigning
    different groups of threads to different precision
    levels adaptively at runtime. To widen the range of
    applications that can benefit from its approximation,
    GRAM comes with an optional half-precision approximate
    math library. Using GRAM, we can trade off precision
    for any performance improvement of up to 540\%,
    depending on the application and accuracy
    requirement.},
  acknowledgement = ack-nhfb,
  articleno = {19},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {https://dl.acm.org/loi/taco},
}

%%% Hori et al.: survey of MPI users (Parallel Computing 108, article
%%% 102853, December 2021).
@Article{Hori:2021:ISM,
  author = {Atsushi Hori and Emmanuel Jeannot and George Bosilca and
    Takahiro Ogura and Balazs Gerofi and Jie Yin and Yutaka Ishikawa},
  title = {An international survey on {MPI} users},
  journal = j-PARALLEL-COMPUTING,
  volume = {108},
  number = {??},
  pages = {??--??},
  month = dec,
  year = {2021},
  articleno = {102853},
  DOI = {https://doi.org/10.1016/j.parco.2021.102853},
  URL = {http://www.sciencedirect.com/science/article/pii/S0167819121000983},
  ISSN = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L = {0167-8191},
  CODEN = {PACOEJ},
  fjournal = {Parallel Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Feb 18 10:07:17 MST 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Huang, Wei, and Wang: lightweight BLASTP on CUDA GPUs (The Journal
%%% of Supercomputing 77(1), January 2021).  The online publication date
%%% is recorded in the onlinedate field as a bare date, as in other
%%% entries of this file; the page range lives only in the pages field.
@Article{Huang:2021:LBI,
  author =       "Liang-Tsung Huang and Kai-Cheng Wei and Jian-An Wang",
  title =        "A lightweight {BLASTP} and its implementation on {CUDA
                 GPUs}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "77",
  number =       "1",
  pages =        "322--342",
  month =        jan,
  year =         "2021",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-020-03267-1",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Fri May 14 09:19:58 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://link.springer.com/article/10.1007/s11227-020-03267-1",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  onlinedate =   "07 April 2020",
}

%%% Jalowiecki, Rams, and Gardas: brute-force spin-glass solver using
%%% CUDA (Computer Physics Communications 260, March 2021).
@Article{Jalowiecki:2021:BFS,
  author = {Konrad Ja{\l}owiecki and Marek M. Rams and Bart{\l}omiej
    Gardas},
  title = {Brute-forcing spin-glass problems with {CUDA}},
  journal = j-COMP-PHYS-COMM,
  volume = {260},
  number = {??},
  pages = {Article 107728},
  month = mar,
  year = {2021},
  DOI = {https://doi.org/10.1016/j.cpc.2020.107728},
  URL = {http://www.sciencedirect.com/science/article/pii/S001046552030360X},
  ISSN = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L = {0010-4655},
  CODEN = {CPHCBZ},
  fjournal = {Computer Physics Communications},
  journal-URL = {http://www.sciencedirect.com/science/journal/00104655},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Mar 13 08:21:41 MST 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Keppens et al.: the MPI-AMRVAC grid-adaptive PDE toolkit (Computers
%%% and Mathematics with Applications 81, 1 January 2021).
@Article{Keppens:2021:MAP,
  author = {Rony Keppens and Jannis Teunissen and Chun Xia and Oliver
    Porth},
  title = {{MPI-AMRVAC}: a parallel, grid-adaptive {PDE} toolkit},
  journal = j-COMPUT-MATH-APPL,
  volume = {81},
  number = {??},
  pages = {316--333},
  day = {1},
  month = jan,
  year = {2021},
  DOI = {https://doi.org/10.1016/j.camwa.2020.03.023},
  URL = {http://www.sciencedirect.com/science/article/pii/S0898122120301279},
  ISSN = {0898-1221 (print), 1873-7668 (electronic)},
  CODEN = {CMAPDK},
  fjournal = {Computers and Mathematics with Applications},
  journal-URL = {http://www.sciencedirect.com/science/journal/08981221},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Mar 13 10:03:14 MST 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/computmathappl2020.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Kim and Baek: a 3D graphics rendering pipeline built on OpenCL
%%% (The Journal of Supercomputing 77(7), July 2021).
@Article{Kim:2021:GRP,
  author = {Mingyu Kim and Nakhoon Baek},
  title = {A {3D} graphics rendering pipeline implementation based on the
    {openCL} massively parallel processing},
  journal = j-J-SUPERCOMPUTING,
  volume = {77},
  number = {7},
  pages = {7351--7367},
  month = jul,
  year = {2021},
  DOI = {https://doi.org/10.1007/s11227-020-03581-8},
  URL = {https://link.springer.com/article/10.1007/s11227-020-03581-8},
  ISSN = {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L = {0920-8542},
  CODEN = {JOSUED},
  ajournal = {J. Supercomputing},
  fjournal = {The Journal of Supercomputing},
  journal-URL = {http://link.springer.com/journal/11227},
  acknowledgement = ack-nhfb,
  bibdate = {Mon Feb 28 16:44:32 MST 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Klemm et al.: panel statement on OpenMP for high-integrity systems
%%% (ACM SIGADA Ada Letters 40(2), April 2021).
@Article{Klemm:2021:OAH,
  author =       "Michael Klemm and Eduardo Qui{\~n}ones and Tucker Taft
                 and Dirk Ziegenbein and Sara Royuela",
  title =        "The {OpenMP API} for High Integrity Systems: Moving
                 Responsibility from Users to Vendors",
  journal =      j-SIGADA-LETTERS,
  volume =       "40",
  number =       "2",
  pages =        "48--50",
  month =        apr,
  year =         "2021",
  CODEN =        "AALEE5",
  DOI =          "https://doi.org/10.1145/3463478.3463480",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "0736-721X",
  bibdate =      "Mon Jun 28 15:50:16 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigada.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3463478.3463480",
  abstract =     "OpenMP is traditionally focused on boosting
                 performance in HPC systems. However, other domains are
                 showing an increasing interest in the use of OpenMP by
                 virtue of key aspects introduced in recent versions of
                 the specification: the tasking model, the accelerator
                 model, and other features like the requires and the
                 assumes directives, which allow defining certain
                 contracts. One example is the safety-critical embedded
                 domain, where several efforts have been initiated
                 towards the adoption of OpenMP. However, the OpenMP
                 specification states that ``application developers are
                 responsible for correctly using the OpenMP API to
                 produce a conforming program'', being not acceptable in
                 high integrity systems, where aspects such as
                 reliability and resiliency have to be ensured at
                 different levels of criticality. In this scope,
                 programming languages like Ada propose a different
                 paradigm by exposing fewer features to the user, and
                 leaving the responsibility of safely exploiting the
                 full underlying architecture to the compiler and the
                 runtime systems, instead. The philosophy behind this
                 kind of model is to move the responsibility of
                 producing correct parallel programs from users to
                 vendors. In this panel, actors from different domains
                 involved in the use of parallel programming models for
                 the development of high-integrity systems share their
                 thoughts about this topic.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGADA Ada Letters",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J32",
}

%%% Kohnke et al.: a CUDA fast multipole method (International Journal
%%% of High Performance Computing Applications 35(1), 1 January 2021).
@Article{Kohnke:2021:CFM,
  author = {Bartosz Kohnke and Carsten Kutzner and Andreas Beckmann and
    Gert Lube and Ivo Kabadshow and Holger Dachsel and Helmut
    Grubm{\"u}ller},
  title = {A {CUDA} fast multipole method with highly efficient {M2L} far
    field evaluation},
  journal = j-IJHPCA,
  volume = {35},
  number = {1},
  pages = {97--117},
  day = {1},
  month = jan,
  year = {2021},
  DOI = {https://doi.org/10.1177/1094342020964857},
  URL = {https://journals.sagepub.com/doi/full/10.1177/1094342020964857},
  ISSN = {1094-3420 (print), 1741-2846 (electronic)},
  ISSN-L = {1094-3420},
  CODEN = {IHPCFL},
  ajournal = {Int. J. High Perform. Comput. Appl.},
  fjournal = {International Journal of High Performance Computing
    Applications},
  journal-URL = {https://journals.sagepub.com/home/hpc},
  acknowledgement = ack-nhfb,
  bibdate = {Tue May 18 15:46:08 MDT 2021},
  bibsource = {http://hpc.sagepub.com/;
    http://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib;
    http://www.math.utah.edu/pub/tex/bib/ijsa.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Lambert et al.: OpenACC-to-FPGA optimization on Arria 10 and Stratix
%%% 10 (Parallel Computing 104--105, article 102784, July 2021).
%%% NOTE: entry Lambert:2021:OOFb carries the same DOI, volume, and
%%% article number, i.e., it records the same work under a second key.
@Article{Lambert:2021:OOFa,
  author = {Jacob Lambert and Seyong Lee and Jeffrey S. Vetter and Allen
    D. Malony},
  title = {Optimization with the {OpenACC-to-FPGA} framework on the
    {Arria 10} and {Stratix 10} {FPGAs}},
  journal = j-PARALLEL-COMPUTING,
  volume = {104--105},
  number = {??},
  pages = {??--??},
  month = jul,
  year = {2021},
  articleno = {102784},
  DOI = {https://doi.org/10.1016/j.parco.2021.102784},
  URL = {http://www.sciencedirect.com/science/article/pii/S0167819121000417},
  ISSN = {0167-8191 (print), 1872-7336 (electronic)},
  ISSN-L = {0167-8191},
  CODEN = {PACOEJ},
  fjournal = {Parallel Computing},
  journal-URL = {http://www.sciencedirect.com/science/journal/01678191},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Feb 18 10:07:16 MST 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Lambert et al.: OpenACC-to-FPGA optimization on Arria 10 and Stratix
%%% 10 (Parallel Computing 104--105, article 102784, July 2021).
%%% NOTE: this entry has the same DOI, volume, and article number as
%%% Lambert:2021:OOFa, i.e., it is a second key for the same work.  It
%%% is retained so that existing citations of either key still resolve.
%%% The device names in the title are brace-protected so that styles
%%% that downcase titles do not produce ``arria'' and ``stratix''.
@Article{Lambert:2021:OOFb,
  author =       "Jacob Lambert and Seyong Lee and Jeffrey S. Vetter and
                 Allen D. Malony",
  title =        "Optimization with the {OpenACC}-to-{FPGA} framework on
                 the {Arria 10} and {Stratix 10} {FPGAs}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "104--105",
  number =       "??",
  pages =        "??--??",
  month =        jul,
  year =         "2021",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2021.102784",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Feb 18 10:07:16 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819121000417",
  acknowledgement = ack-nhfb,
  articleno =    "102784",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

%%% Liu et al.: communication interfaces beyond MPI for database systems
%%% (SIGMOD Record 49(4), March 2021).  The abstract is truncated in the
%%% source data and ends with an ellipsis.
@Article{Liu:2021:BMN,
  author = {Feilong Liu and Claude Barthels and Spyros Blanas and Hideaki
    Kimura and Garret Swart},
  title = {Beyond {MPI}: New Communication Interfaces for Database
    Systems and Data-Intensive Applications},
  journal = j-SIGMOD,
  volume = {49},
  number = {4},
  pages = {12--17},
  month = mar,
  year = {2021},
  DOI = {https://doi.org/10.1145/3456859.3456862},
  URL = {https://dl.acm.org/doi/10.1145/3456859.3456862},
  ISSN = {0163-5808 (print), 1943-5835 (electronic)},
  ISSN-L = {0163-5808},
  CODEN = {SRECD8},
  fjournal = {SIGMOD Record (ACM Special Interest Group on Management of
    Data)},
  journal-URL = {https://dl.acm.org/loi/sigmod},
  abstract = {Networks with Remote Direct Memory Access (RDMA)
    support are becoming increasingly common. RDMA,
    however, offers a limited programming interface to
    remote memory that consists of read, write and atomic
    operations. With RDMA alone, completing the most basic
    \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 11 06:12:21 MST 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/sigmod.bib},
}

%%% Lyu et al.: the FAME package for Maxwell's equations on photonic
%%% crystals, with a CUDA implementation (TOMS 47(3), article 26,
%%% June 2021).
@Article{Lyu:2021:FFA,
  author = {Xing-long Lyu and Tiexiang Li and Tsung-ming Huang and Jia-wei
    Lin and Wen-wei Lin and Sheng Wang},
  title = {{FAME}: Fast Algorithms for {Maxwell}'s Equations for
    Three-dimensional Photonic Crystals},
  journal = j-TOMS,
  volume = {47},
  number = {3},
  pages = {26:1--26:24},
  month = jun,
  year = {2021},
  articleno = {26},
  DOI = {https://doi.org/10.1145/3446329},
  URL = {https://dl.acm.org/doi/10.1145/3446329},
  ISSN = {0098-3500 (print), 1557-7295 (electronic)},
  ISSN-L = {0098-3500},
  CODEN = {ACMSCU},
  fjournal = {ACM Transactions on Mathematical Software (TOMS)},
  journal-URL = {https://dl.acm.org/loi/toms},
  abstract = {In this article, we propose the Fast Algorithms for
    Maxwell's Equations (FAME) package for solving
    Maxwell's equations for modeling three-dimensional
    photonic crystals. FAME combines the null-space free
    method with fast Fourier transform (FFT)-based
    matrix-vector multiplications to solve the generalized
    eigenvalue problems (GEPs) arising from Yee's
    discretization. The GEPs are transformed into a
    null-space free standard eigenvalue problem with a
    Hermitian positive-definite coefficient matrix. The
    computation times for FFT-based matrix-vector
    multiplications with matrices of dimension 7 million
    are only $ 0.33 $ and $ 3.6 \times 10^{-3} $ seconds
    using MATLAB with an Intel Xeon CPU and CUDA C++
    programming with a single NVIDIA Tesla P100 GPU,
    respectively. Such multiplications significantly reduce
    the computational costs of the conjugate gradient
    method for solving linear systems. We successfully use
    FAME on a single P100 GPU to solve a set of GEPs with
    matrices of dimension more than 19 million, in 127 to
    191 seconds per problem. These results demonstrate the
    potential of our proposed package to enable large-scale
    numerical simulations for novel physical discoveries
    and engineering applications of photonic crystals.},
  acknowledgement = ack-nhfb,
  bibdate = {Sun Jun 27 07:42:02 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/toms.bib},
}

%%% Ma, Yuan, and Liu: block incomplete sparse approximate inverse
%%% preconditioning on Tesla K20 and V100 GPUs (Algorithms (Basel)
%%% 14(7), article 204, July 2021).  No page range is recorded; the
%%% pagecount field is a placeholder.
@Article{Ma:2021:CSB,
  author = {Wenpeng Ma and Wu Yuan and Xiazhen Liu},
  title = {A Comparative Study of Block Incomplete Sparse Approximate
    Inverses Preconditioning on {Tesla K20} and {V100} {GPUs}},
  journal = j-ALGORITHMS-BASEL,
  volume = {14},
  number = {7},
  month = jul,
  year = {2021},
  articleno = {204},
  pagecount = {??},
  DOI = {https://doi.org/10.3390/a14070204},
  URL = {https://www.mdpi.com/1999-4893/14/7/204},
  ISSN = {1999-4893 (electronic)},
  ISSN-L = {1999-4893},
  CODEN = {ALGOCH},
  fjournal = {Algorithms (Basel)},
  journal-URL = {https://www.mdpi.com/journal/algorithms},
  acknowledgement = ack-nhfb,
  bibdate = {Fri Jul 23 15:05:28 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/algorithms.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Margolin and Barak: tree-based fault-tolerant MPI collectives
%%% (Concurrency and Computation: Practice and Experience 33(14),
%%% 25 July 2021; online 15 June 2020).
@Article{Margolin:2021:TBF,
  author = {Alexander Margolin and Amnon Barak},
  title = {Tree-based fault-tolerant collective operations for {MPI}},
  journal = j-CCPE,
  volume = {33},
  number = {14},
  pages = {e5826:1--e5826:??},
  day = {25},
  month = jul,
  year = {2021},
  onlinedate = {15 June 2020},
  DOI = {https://doi.org/10.1002/cpe.5826},
  ISSN = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L = {1532-0626},
  CODEN = {CCPEBO},
  ajournal = {Concurrency Computat., Pract. Exper.},
  fjournal = {Concurrency and Computation: Practice and Experience},
  journal-URL = {http://www.interscience.wiley.com/jpages/1532-0626},
  acknowledgement = ack-nhfb,
  bibdate = {Tue Feb 22 09:49:55 MST 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Martinez-Noriega, Yazaki, and Narumi: CUDA offloading for
%%% simulations using tablets (Concurrency and Computation: Practice and
%%% Experience 33(2), 25 January 2021; online 23 August 2019).
%%% The ajournal value is the abbreviated journal name used by the
%%% other entries for this journal.
@Article{Martinez-Noriega:2021:COE,
  author =       "Edgar Josafat Martinez-Noriega and Syunji Yazaki and
                 Tetsu Narumi",
  title =        "{CUDA} offloading for energy-efficient and
                 high-frame-rate simulations using tablets",
  journal =      j-CCPE,
  volume =       "33",
  number =       "2",
  pages =        "e5488:1--e5488:??",
  day =          "25",
  month =        jan,
  year =         "2021",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.5488",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue May 18 08:31:19 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Concurrency Computat., Pract. Exper.",
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "23 August 2019",
}

%%% Matwiejew and Wang: QSW_MPI, parallel simulation of quantum
%%% stochastic walks (Computer Physics Communications 260, March 2021).
@Article{Matwiejew:2021:QFP,
  author = {Edric Matwiejew and Jingbo Wang},
  title = {{QSW\_MPI}: a framework for parallel simulation of quantum
    stochastic walks},
  journal = j-COMP-PHYS-COMM,
  volume = {260},
  number = {??},
  pages = {Article 107724},
  month = mar,
  year = {2021},
  DOI = {https://doi.org/10.1016/j.cpc.2020.107724},
  URL = {http://www.sciencedirect.com/science/article/pii/S0010465520303581},
  ISSN = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L = {0010-4655},
  CODEN = {CPHCBZ},
  fjournal = {Computer Physics Communications},
  journal-URL = {http://www.sciencedirect.com/science/journal/00104655},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Mar 13 08:21:41 MST 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Meyer, Pozo, and Zola: a GPU implementation of Barnes--Hut t-SNE
%%% compared against t-SNE-CUDA (JETC 17(4), article 53, October 2021).
@Article{Meyer:2021:IBH,
  author = {Bruno Henrique Meyer and Aurora Trinidad Ramirez Pozo and
    Wagner M. Nunan Zola},
  title = {Improving {Barnes--Hut} {$t$-SNE} Algorithm in Modern {GPU}
    Architectures with Random Forest {KNN} and Simulated Wide-Warp},
  journal = j-JETC,
  volume = {17},
  number = {4},
  pages = {53:1--53:26},
  month = oct,
  year = {2021},
  articleno = {53},
  DOI = {https://doi.org/10.1145/3447779},
  URL = {https://dl.acm.org/doi/10.1145/3447779},
  ISSN = {1550-4832},
  ISSN-L = {1550-4832},
  CODEN = {????},
  fjournal = {ACM Journal on Emerging Technologies in Computing Systems
    (JETC)},
  journal-URL = {https://dl.acm.org/loi/jetc},
  abstract = {The $t$-Distributed Stochastic Neighbor Embedding
    (t-SNE) is a widely used technique for dimensionality
    reduction but is limited by its scalability when
    applied to large datasets. Recently, BH-tSNE was
    proposed; this is a successful approximation that
    transforms a step of the original algorithm into an
    N-Body simulation problem that can be solved by a
    modified Barnes-Hut algorithm. However, this
    improvement still has limitations to process large data
    volumes (millions of records). Late studies, such as
    $t$-SNE-CUDA, have used GPUs to implement highly
    parallel BH-tSNE. In this research we have developed a
    new GPU BH-tSNE implementation that produces the
    embedding of multidimensional data points into
    three-dimensional space. We examine scalability issues
    in two of the most expensive steps of GPU BH-tSNE by
    using efficient memory access strategies, recent
    acceleration techniques, and a new approach to compute
    the KNN graph structure used in BH-tSNE with GPU. Our
    design allows up to 460\% faster execution when
    compared to the $t$-SNE-CUDA implementation. Although
    our SIMD acceleration techniques were used in a modern
    GPU setup, we have also verified a potential for
    applications in the context of multi-core processors.},
  acknowledgement = ack-nhfb,
  bibdate = {Tue Sep 14 06:51:04 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib;
    http://www.math.utah.edu/pub/tex/bib/jetc.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Muller and Hoffmann: evaluation-cost analysis of CUDA kernels
%%% (Proceedings of the ACM on Programming Languages 5(POPL), article
%%% 25, January 2021).  The abstract is truncated in the source data
%%% and ends with an ellipsis.
@Article{Muller:2021:MAE,
  author = {Stefan K. Muller and Jan Hoffmann},
  title = {Modeling and analyzing evaluation cost of {CUDA} kernels},
  journal = j-PACMPL,
  volume = {5},
  number = {POPL},
  pages = {25:1--25:31},
  month = jan,
  year = {2021},
  articleno = {25},
  DOI = {https://doi.org/10.1145/3434306},
  URL = {https://dl.acm.org/doi/10.1145/3434306},
  fjournal = {Proceedings of the ACM on Programming Languages},
  journal-URL = {https://pacmpl.acm.org/},
  abstract = {General-purpose programming on GPUs (GPGPU) is
    becoming increasingly in vogue as applications such as
    machine learning and scientific computing demand high
    throughput in vector-parallel applications. NVIDIA's
    CUDA toolkit seeks to make GPGPU programming \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Tue Mar 30 08:10:58 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pacmpl.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Munch, Kormann, and Kronbichler: the hyper.deal matrix-free
%%% finite-element library, which uses MPI-3.0 shared-memory features
%%% (TOMS 47(4), article 33, December 2021).
@Article{Munch:2021:HDE,
  author = {Peter Munch and Katharina Kormann and Martin Kronbichler},
  title = {\pkg{hyper.deal}: an Efficient, Matrix-free Finite-element
    Library for High-dimensional Partial Differential Equations},
  journal = j-TOMS,
  volume = {47},
  number = {4},
  pages = {33:1--33:34},
  month = dec,
  year = {2021},
  articleno = {33},
  DOI = {https://doi.org/10.1145/3469720},
  URL = {https://dl.acm.org/doi/10.1145/3469720},
  ISSN = {0098-3500 (print), 1557-7295 (electronic)},
  ISSN-L = {0098-3500},
  CODEN = {ACMSCU},
  fjournal = {ACM Transactions on Mathematical Software (TOMS)},
  journal-URL = {https://dl.acm.org/loi/toms},
  abstract = {This work presents the efficient, matrix-free
    finite-element library hyper.deal for solving partial
    differential equations in two up to six dimensions with
    high-order discontinuous Galerkin methods. It builds
    upon the low-dimensional finite-element library deal.II
    to create complex low-dimensional meshes and to operate
    on them individually. These meshes are combined via a
    tensor product on the fly, and the library provides new
    special-purpose highly optimized matrix-free functions
    exploiting domain decomposition as well as shared
    memory via MPI-3.0 features. Both node-level
    performance analyses and strong/weak-scaling studies on
    up to 147,456 CPU cores confirm the efficiency of the
    implementation. Results obtained with the library
    hyper.deal are reported for high-dimensional advection
    problems and for the solution of the Vlasov--Poisson
    equation in up to six-dimensional phase space.},
  acknowledgement = ack-nhfb,
  bibdate = {Wed Sep 29 06:58:41 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
    http://www.math.utah.edu/pub/tex/bib/toms.bib},
}

%%% Muruganandam, Balaz, and Adhikari: OpenMP solver for rotating spin-1
%%% Bose--Einstein condensates (Computer Physics Communications 264,
%%% July 2021).
@Article{Muruganandam:2021:OSR,
  author = {Paulsamy Muruganandam and Antun Balaz and Sadhan K. Adhikari},
  title = {\pkg{OpenMP} solver for rotating spin-1 spin-orbit- and
    {Rabi}-coupled {Bose--Einstein} condensates},
  journal = j-COMP-PHYS-COMM,
  volume = {264},
  number = {??},
  pages = {Article 107926},
  month = jul,
  year = {2021},
  DOI = {https://doi.org/10.1016/j.cpc.2021.107926},
  URL = {http://www.sciencedirect.com/science/article/pii/S0010465521000618},
  ISSN = {0010-4655 (print), 1879-2944 (electronic)},
  ISSN-L = {0010-4655},
  CODEN = {CPHCBZ},
  fjournal = {Computer Physics Communications},
  journal-URL = {http://www.sciencedirect.com/science/journal/00104655},
  acknowledgement = ack-nhfb,
  bibdate = {Wed Jun 9 09:57:27 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Nguyen, Wahib, and Takano: MPI-AllReduce for deep learning on GPU
%%% clusters (Concurrency and Computation: Practice and Experience
%%% 33(12), 25 June 2021; online 09 December 2019).
@Article{Nguyen:2021:EMA,
  author = {Truong Thao Nguyen and Mohamed Wahib and Ryousei Takano},
  title = {Efficient {MPI-AllReduce} for large-scale deep learning on
    {GPU-clusters}},
  journal = j-CCPE,
  volume = {33},
  number = {12},
  pages = {e5574:1--e5574:??},
  day = {25},
  month = jun,
  year = {2021},
  onlinedate = {09 December 2019},
  DOI = {https://doi.org/10.1002/cpe.5574},
  ISSN = {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L = {1532-0626},
  CODEN = {CCPEBO},
  ajournal = {Concurrency Computat., Pract. Exper.},
  fjournal = {Concurrency and Computation: Practice and Experience},
  journal-URL = {http://www.interscience.wiley.com/jpages/1532-0626},
  acknowledgement = ack-nhfb,
  bibdate = {Tue Feb 22 09:49:53 MST 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Perepu: OpenMP implementation of a parallel longest-common-
%%% subsequence algorithm for mathematical expression retrieval
%%% (Parallel Processing Letters 31(02), article 2150007, June 2021).
%%% The complexity expressions in the abstract are written as ordinary
%%% inline math; the scraped form used stray and nested \( ... \)
%%% delimiters that do not typeset.
@Article{Perepu:2021:OIP,
  author =       "Pavan Kumar Perepu",
  title =        "{OpenMP} Implementation of Parallel Longest Common
                 Subsequence Algorithm for Mathematical Expression
                 Retrieval",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "31",
  number =       "02",
  pages =        "??--??",
  month =        jun,
  year =         "2021",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626421500079",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Feb 17 06:50:36 MST 2022",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.worldscientific.com/doi/10.1142/S0129626421500079",
  abstract =     "Given a mathematical expression in LaTeX or MathML
                 format, retrieval algorithm extracts similar
                 expressions from a database. In our previous work, we
                 have used Longest Common Subsequence (LCS) algorithm to
                 match two expressions of lengths, $m$ and $n$, which
                 takes $ O(m n) $ time complexity. If there are $T$
                 database expressions, total complexity is $ O(T m n) $,
                 and an increase in $T$ also increases this complexity.
                 In the present work, we propose to use parallel LCS
                 algorithm in our retrieval process. Parallel LCS has
                 $ O(\max (m, n)) $ time complexity with $ \max (m, n) $
                 processors and total complexity can be reduced to
                 $ O(T \max (m, n)) $. For our experimentation,
                 OpenMP based implementation has been used on Intel i3
                 processor with 4 cores. However, for smaller
                 expressions, parallel version takes more time as the
                 implementation overhead dominates the algorithmic
                 improvement. As such, we have proposed to use parallel
                 version, selectively, only on larger expressions, in
                 our retrieval algorithm to achieve better performance.
                 We have compared the sequential and parallel versions
                 of our ME retrieval algorithm, and the performance
                 results have been reported on a database of 829
                 mathematical expressions.",
  acknowledgement = ack-nhfb,
  articleno =    "2150007",
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

%%% Pimentel-Garcia et al.: efficient implementation of PVM methods and
%%% simple Riemann solvers (Applied Mathematics and Computation 388,
%%% 1 January 2021).
@Article{Pimentel-Garcia:2021:EIP,
  author = {Ernesto Pimentel-Garc{\'\i}a and Carlos Par{\'e}s and Manuel
    J. Castro and Julian Koellermeier},
  title = {On the efficient implementation of {PVM} methods and simple
    {Riemann} solvers. {Application} to the {Roe} method for large
    hyperbolic systems},
  journal = j-APPL-MATH-COMP,
  volume = {388},
  number = {??},
  pages = {Article 125544},
  day = {1},
  month = jan,
  year = {2021},
  DOI = {https://doi.org/10.1016/j.amc.2020.125544},
  URL = {http://www.sciencedirect.com/science/article/pii/S0096300320305002},
  ISSN = {0096-3003 (print), 1873-5649 (electronic)},
  ISSN-L = {0096-3003},
  CODEN = {AMHCBQ},
  fjournal = {Applied Mathematics and Computation},
  journal-URL = {http://www.sciencedirect.com/science/journal/00963003},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Mar 13 06:39:48 MST 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/applmathcomput2020.bib;
    http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

@Article{Pinho:2021:RTI,
  author =       "Luis Miguel Pinho and Sara Royuela and Eduardo
                 Qui{\~n}ones",
  title =        "Real-time Issues in the {Ada} Parallel Model with
                 {OpenMP}",
  journal =      j-SIGADA-LETTERS,
  volume =       "40",
  number =       "2",
  pages =        "96--102",
  month =        apr,
  year =         "2021",
  CODEN =        "AALEE5",
  DOI =          "https://doi.org/10.1145/3463478.3463491",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "0736-721X",
  bibdate =      "Mon Jun 28 15:50:16 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigada.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3463478.3463491",
  abstract =     "The current proposal for the next revision of the Ada
                 language considers the possibility to map the language
                 parallel features to an underlying OpenMP runtime. As
                 previously presented, and discussed in previous
                 workshops, the works on fine-grain parallelism in Ada
                 map well to the OpenMP tasking model for parallelism.
                 Nevertheless, and although the general model of
                 integration, and the semantic constructs are already
                 reflected in the proposed revision of the standard, the
                 integration of these new features with the Real-Time
                 Systems Annex of Ada is still not complete. This paper
                 presents an overview of what is supported and the still
                 open issues.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGADA Ada Letters",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J32",
}

@Article{Proficz:2021:AGA,
  author =       "Jerzy Proficz",
  title =        "All-gather Algorithms Resilient to Imbalanced Process
                 Arrival Patterns",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "41:1--41:22",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460122",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460122",
  abstract =     "Two novel algorithms for the all-gather operation
                 resilient to imbalanced process arrival patterns (PATs)
                 are presented. The first one, Background Disseminated
                 Ring (BDR), is based on the regular parallel ring
                 algorithm often supplied in MPI implementations and
                 exploits an auxiliary background thread for early data
                 exchange from faster processes to accelerate the
                 performed all-gather operation. The other algorithm,
                 Background Sorted Linear synchronized tree with
                 Broadcast (BSLB), is built upon the already existing
                 PAP-aware gather algorithm, that is, Background Sorted
                 Linear Synchronized tree (BSLS), followed by a regular
                 broadcast distributing gathered data to all
                 participating processes. The background of the
                 imbalanced PAP subject is described, along with the PAP
                 monitoring and evaluation topics. An experimental
                 evaluation of the algorithms based on a proposed
                 mini-benchmark is presented. The mini-benchmark was
                 performed over 2,000 times in a typical HPC cluster
                 architecture with homogeneous compute nodes. The
                 obtained results are analyzed according to different
                 PATs, data sizes, and process numbers, showing that the
                 proposed optimization works well for various
                 configurations, is scalable, and can significantly
                 reduce the all-gather elapsed times, in our case, up to
                 factor 1.9 or 47\% in comparison with the best
                 state-of-the-art solution.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Quaranta:2021:NMH,
  author =       "Lionel Quaranta and Lalith Maddegedara",
  title =        "A novel {MPI+MPI} hybrid approach combining {MPI-3}
                 shared memory windows and {C11\slash C++11} memory
                 model",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "157",
  number =       "??",
  pages =        "125--144",
  month =        nov,
  year =         "2021",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1016/j.jpdc.2021.06.008",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Feb 10 06:39:21 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S074373152100143X",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Ramachandran:2021:PPB,
  author =       "Prabhu Ramachandran and Aditya Bhosale and Kunal Puri
                 and Pawan Negi and Abhinav Muta and A. Dinesh and
                 Dileep Menon and Rahul Govind and Suraj Sanka and Amal
                 S. Sebastian and Ananyo Sen and Rohan Kaushik and
                 Anshuman Kumar and Vikas Kurapati and Mrinalgouda Patil
                 and Deep Tavker and Pankaj Pandey and Chandrashekhar
                 Kaushik and Arkopal Dutt and Arpit Agarwal",
  title =        "{PySPH}: a {Python}-based Framework for Smoothed
                 Particle Hydrodynamics",
  journal =      j-TOMS,
  volume =       "47",
  number =       "4",
  pages =        "34:1--34:38",
  month =        dec,
  year =         "2021",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3460773",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Wed Sep 29 06:58:41 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/python.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460773",
  abstract =     "PySPH is an open-source, Python-based, framework for
                 particle methods in general and Smoothed Particle
                 Hydrodynamics (SPH) in particular. PySPH allows a user
                 to define a complete SPH simulation using pure Python.
                 High-performance code is generated from this high-level
                 Python code and executed on either multiple cores, or
                 on GPUs, seamlessly. It also supports distributed
                 execution using MPI. PySPH supports a wide variety of
                 SPH schemes and formulations. These include,
                 incompressible and compressible fluid flow, elastic
                 dynamics, rigid body dynamics, shallow water equations,
                 and other problems. PySPH supports a variety of
                 boundary conditions including mirror, periodic, solid
                 wall, and inlet/outlet boundary conditions. The package
                 is written to facilitate reuse and reproducibility.
                 This article discusses the overall design of PySPH and
                 demonstrates many of its features. Several example
                 results are shown to demonstrate the range of features
                 that PySPH provides.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "https://dl.acm.org/loi/toms",
}

@Article{Ramroach:2021:ADP,
  author =       "Sterling Ramroach and Ajay Joshi",
  title =        "Accelerating Data-Parallel Neural Network Training
                 with Weighted-Averaging Reparameterisation",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "31",
  number =       "02",
  pages =        "??--??",
  month =        jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1142/S0129626421500092",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Feb 17 06:50:36 MST 2022",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.worldscientific.com/doi/10.1142/S0129626421500092",
  abstract =     "Recent advances in artificial intelligence has shown a
                 direct correlation between the performance of a network
                 and the number of hidden layers within the network. The
                 Compute Unified Device Architecture (CUDA) framework
                 facilitates the movement of heavy computation from the
                 CPU to the graphics processing unit (GPU) and is used
                 to accelerate the training of neural networks. In this
                 paper, we consider the problem of data-parallel neural
                 network training. We compare the performance of
                 training the same neural network on the GPU with and
                 without data parallelism. When data parallelism is
                 used, we compare with both the conventional averaging
                 of coefficients and our proposed method. We set out to
                 show that not all sub-networks are equal and thus,
                 should not be treated as equals when normalising weight
                 vectors. The proposed method achieved state of the art
                 accuracy faster than conventional training along with
                 better classification performance in some cases.",
  acknowledgement = ack-nhfb,
  articleno =    "2150009",
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Reano:2021:RRC,
  author =       "Carlos Rea{\~n}o and Federico Silla",
  title =        "Redesigning the {rCUDA} communication layer for a
                 better adaptation to the underlying hardware",
  journal =      j-CCPE,
  volume =       "33",
  number =       "14",
  pages =        "e5481:1--e5481:??",
  day =          "25",
  month =        jul,
  year =         "2021",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.5481",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Feb 22 09:49:55 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Concurrency Computat., Pract. Exper.",
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "19 August 2019",
}

@Article{Rundo:2021:CPM,
  author =       "Leonardo Rundo and Andrea Tangherloni and Marco S.
                 Nobile",
  title =        "A {CUDA}-powered method for the feature extraction and
                 unsupervised analysis of medical images",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "77",
  number =       "8",
  pages =        "8514--8531",
  month =        aug,
  year =         "2021",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-020-03565-8",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Mon Feb 28 16:44:32 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://link.springer.com/article/10.1007/s11227-020-03565-8",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Supercomputing",
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Schuchart:2021:CBC,
  author =       "Joseph Schuchart and Philipp Samfass and Christoph
                 Niethammer and Jos{\'e} Gracia and George Bosilca",
  title =        "Callback-based completion notification using {MPI}
                 Continuations",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "106",
  number =       "??",
  pages =        "??--??",
  month =        sep,
  year =         "2021",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2021.102793",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Feb 18 10:07:17 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819121000466",
  acknowledgement = ack-nhfb,
  articleno =    "102793",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Schwarzrock:2021:RNI,
  author =       "J. Schwarzrock and C. C. {de Oliveira} and M. Ritt and
                 A. F. Lorenzon and A. C. S. Beck",
  title =        "A Runtime and Non-Intrusive Approach to Optimize {EDP}
                 by Tuning Threads and {CPU} Frequency for {OpenMP}
                 Applications",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "32",
  number =       "7",
  pages =        "1713--1724",
  month =        jul,
  year =         "2021",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2020.3046537",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Mar 19 06:51:50 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
}

@Article{Sojoodi:2021:IGG,
  author =       "Amir Hossein Sojoodi and Majid Salimi Beni and Farshad
                 Khunjush",
  title =        "{Ignite-GPU}: a {GPU}-enabled in-memory computing
                 architecture on clusters",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "77",
  number =       "3",
  pages =        "3165--3192",
  month =        mar,
  year =         "2021",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-020-03390-z",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Fri May 14 09:19:59 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://link.springer.com/article/10.1007/s11227-020-03390-z",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  online-date =  "Published: 27 July 2020 Pages: 3165 - 3192",
}

@Article{Spiliotis:2021:PCD,
  author =       "Iraklis M. Spiliotis and Charalampos Sitaridis and
                 Michael P. Bekakos",
  title =        "Parallel Computation of Discrete Orthogonal Moment on
                 Block Represented Images Using {OpenMP}",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "49",
  number =       "3",
  pages =        "440--462",
  month =        jun,
  year =         "2021",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-021-00713-2",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Fri May 14 08:58:34 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://link.springer.com/article/10.1007/s10766-021-00713-2",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  online-date =  "Published: 15 April 2021 Pages: 440 - 462",
}

@Article{Sun:2021:ACW,
  author =       "J. Sun and N. Guan and J. Sun and X. Zhang and Y. Chi
                 and F. Li",
  title =        "Algorithms for Computing the {WCRT} Bound of {OpenMP}
                 Task Systems With Conditional Branches",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "70",
  number =       "1",
  pages =        "57--71",
  month =        jan,
  year =         "2021",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2020.2984502",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Dec 17 19:35:03 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Taft:2021:LMA,
  author =       "S. Tucker Taft",
  title =        "A Layered Mapping of {Ada 202X} to {OpenMP}",
  journal =      j-SIGADA-LETTERS,
  volume =       "40",
  number =       "2",
  pages =        "55--58",
  month =        apr,
  year =         "2021",
  CODEN =        "AALEE5",
  DOI =          "https://doi.org/10.1145/3463478.3463482",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "0736-721X",
  bibdate =      "Mon Jun 28 15:50:16 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigada.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3463478.3463482",
  abstract =     "The OpenMP specification defines a set of compiler
                 directives, library routines, and environment variables
                 that together represent the OpenMP Application
                 Programming Interface, and is currently defined for C,
                 C++, and Fortran. The forthcoming version of Ada,
                 currently dubbed Ada 202X, includes lightweight
                 parallelism features, in particular parallel blocks and
                 parallel loops. All versions of Ada, since its
                 inception in 1983, have included ``tasking,'' which
                 corresponds to what are traditionally considered
                 ``heavyweight'' parallelism features, or simply
                 ``concurrency'' features. Ada ``tasks'' typically map
                 to what are called ``kernel threads,'' in that the
                 operating system manages them and schedules them.
                 However, one of the goals of lightweight parallelism is
                 to reduce overhead by doing more of the management
                 outside the kernel of the operating system, using a
                 light-weight-thread (LWT) scheduler. The OpenMP library
                 routines support both levels of threading, but for Ada
                 202X, the main interest is in making use of OpenMP for
                 its lightweight thread scheduling capabilities.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGADA Ada Letters",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J32",
}

@Article{Takizawa:2021:OLO,
  author =       "Hiroyuki Takizawa and Shinji Shiotsuki and Naoki Ebata
                 and Ryusuke Egawa",
  title =        "{OpenCL}-like offloading with metaprogramming for
                 {SX}-Aurora {TSUBASA}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "102",
  number =       "??",
  pages =        "Article 102754",
  month =        may,
  year =         "2021",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2021.102754",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Mar 29 11:36:03 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819121000144",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Tanaka:2021:NRP,
  author =       "Ushio Tanaka and Masami Saga and Junji Nakano",
  title =        "\pkg{NScluster}: An {R} Package for Maximum Palm
                 Likelihood Estimation for Cluster Point Process Models
                 Using {OpenMP}",
  journal =      j-J-STAT-SOFT,
  volume =       "98",
  number =       "6",
  pages =        "??--??",
  month =        "????",
  year =         "2021",
  CODEN =        "JSSOBK",
  DOI =          "https://doi.org/10.18637/jss.v98.i06",
  ISSN =         "1548-7660",
  ISSN-L =       "1548-7660",
  bibdate =      "Fri Jul 23 08:12:54 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jstatsoft.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://www.jstatsoft.org/index.php/jss/article/view/v098i06;
                 https://www.jstatsoft.org/index.php/jss/article/view/v098i06/v98i06.pdf",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.jstatsoft.org/",
}

@Article{Traff:2021:MCC,
  author =       "Jesper Larsson Tr{\"a}ff and Sascha Hunold and
                 Guillaume Mercier and Daniel J. Holmes",
  title =        "{MPI} collective communication through a single set of
                 interfaces: a case for orthogonality",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "107",
  number =       "??",
  pages =        "??--??",
  month =        oct,
  year =         "2021",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2021.102826",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Feb 18 10:07:17 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819121000740",
  acknowledgement = ack-nhfb,
  articleno =    "102826",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Wang:2021:ATD,
  author =       "Farui Wang and Weizhe Zhang and Zheng Wang",
  title =        "Automatic translation of data parallel programs for
                 heterogeneous parallelism through {OpenMP} offloading",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "77",
  number =       "5",
  pages =        "4957--4987",
  month =        may,
  year =         "2021",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-020-03452-2",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Fri May 14 09:20:00 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://link.springer.com/article/10.1007/s11227-020-03452-2",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  online-date =  "Published: 29 October 2020 Pages: 4957 - 4987",
}

@Article{Wang:2021:PBD,
  author =       "Shao-Chung Wang and Lin-Ya Yu and Li-An Her and
                 Yuan-Shin Hwang and Jenq-Kuen Lee",
  title =        "Pointer-Based Divergence Analysis for {OpenCL 2.0}
                 Programs",
  journal =      j-TOPC,
  volume =       "8",
  number =       "4",
  pages =        "20:1--20:23",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470644",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 10 10:52:35 MST 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470644",
  abstract =     "A modern GPU is designed with many large thread groups
                 to achieve a high throughput and performance. Within
                 these groups, the threads are grouped into fixed-size
                 SIMD batches in which the same instruction is applied
                 to vectors of data in a lockstep. This \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Wang:2021:PBS,
  author =       "Y. Wang and X. Jiang and N. Guan and Z. Guo and X. Liu
                 and W. Yi",
  title =        "Partitioning-Based Scheduling of {OpenMP} Task Systems
                 With Tied Tasks",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "32",
  number =       "6",
  pages =        "1322--1339",
  month =        jun,
  year =         "2021",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2020.3048373",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Mar 19 06:51:50 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
}

@Article{Xue:2021:IFG,
  author =       "Weicheng Xue and Charles W. Jackson and Christopher J.
                 Roy",
  title =        "An improved framework of {GPU} computing for {CFD}
                 applications on structured grids using {OpenACC}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "156",
  number =       "??",
  pages =        "64--85",
  month =        oct,
  year =         "2021",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1016/j.jpdc.2021.05.010",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Feb 10 06:39:19 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731521001155",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Xue:2021:MGP,
  author =       "Weicheng Xue and Christopher J. Roy",
  title =        "Multi-{GPU} performance optimization of a
                 computational fluid dynamics code using {OpenACC}",
  journal =      j-CCPE,
  volume =       "33",
  number =       "5",
  pages =        "e6036:1--e6036:??",
  day =          "10",
  month =        mar,
  year =         "2021",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.6036",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue May 18 08:31:21 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Concurrency Computat., Pract. Exper.",
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "28 September 2020",
}

@Article{Yang:2021:HMC,
  author =       "Sheng-Chun Yang and Yong-Lei Wang",
  title =        "A hybrid {MPI-CUDA} approach for nonequispaced
                 discrete {Fourier} transformation",
  journal =      j-COMP-PHYS-COMM,
  volume =       "258",
  number =       "??",
  pages =        "Article 107513",
  month =        jan,
  year =         "2021",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2020.107513",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Mar 13 08:21:40 MST 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465520302393",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Yang:2021:SSG,
  author =       "Lishan Yang and Bin Nie and Adwait Jog and Evgenia
                 Smirni",
  title =        "{SUGAR}: Speeding Up {GPGPU} Application Resilience
                 Estimation with Input Sizing",
  journal =      j-POMACS,
  volume =       "5",
  number =       "1",
  pages =        "01:1--01:29",
  month =        feb,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3447375",
  ISSN =         "2476-1249",
  ISSN-L =       "2476-1249",
  bibdate =      "Mon Mar 29 10:31:36 MDT 2021",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pomacs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3447375",
  abstract =     "As Graphics Processing Units (GPUs) are becoming a de
                 facto solution for accelerating a wide range of
                 applications, their reliable operation is becoming
                 increasingly important. One of the major challenges in
                 the domain of GPU reliability is to \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "01",
  fjournal =     "Proceedings of the ACM on Measurement and Analysis of
                 Computing Systems (POMACS)",
  journal-URL =  "https://dl.acm.org/loi/pomacs",
}

@Article{Zhang:2021:IRP,
  author =       "Jingrong Zhang and Zihao Wang and Zhiyong Liu and Fa
                 Zhang",
  title =        "Improve the Resolution and Parallel Performance of the
                 Three-Dimensional Refine Algorithm in {RELION} Using
                 {CUDA} and {MPI}",
  journal =      j-TCBB,
  volume =       "18",
  number =       "2",
  pages =        "583--595",
  month =        mar,
  year =         "2021",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2019.2929171",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Fri Mar 4 08:29:16 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  URL =          "https://dl.acm.org/doi/10.1109/TCBB.2019.2929171",
  abstract =     "In cryo-electron microscopy, RELION is a powerful tool
                 for high-resolution reconstruction. Due to the
                 complicated imaging procedure and the heterogeneity of
                 particles, some of the selected particle images offer
                 more disturbing information than others. \ldots{}",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "https://dl.acm.org/loi/tcbb",
}

@article{Zhou:2021:HPG,
  author          = {Chao Zhou},
  title           = {High Performance Graph Data Imputation on Multiple
                     {GPUs}},
  journal         = j-FUTURE-INTERNET,
  volume          = {13},
  number          = {2},
  pages           = {36},
  day             = {31},
  month           = jan,
  year            = {2021},
  coden           = {????},
  doi             = {https://doi.org/10.3390/fi13020036},
  issn            = {1999-5903},
  bibdate         = {Fri Feb 26 10:54:58 MST 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/future-internet.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {https://www.mdpi.com/1999-5903/13/2/36},
  abstract        = {In real applications, massive data with graph structures
                     are often incomplete due to various restrictions.
                     Therefore, graph data imputation algorithms have been
                     widely used in the fields of social networks, sensor
                     networks, and MRI to solve the graph data completion
                     problem. To keep the data relevant, a data structure is
                     represented by a graph-tensor, in which each matrix is
                     the vertex value of a weighted graph. The convolutional
                     imputation algorithm has been proposed to solve the
                     low-rank graph-tensor completion problem that some data
                     matrices are entirely unobserved. However, this data
                     imputation algorithm has limited application scope
                     because it is compute-intensive and low-performance on
                     CPU. In this paper, we propose a scheme to perform the
                     convolutional imputation algorithm with higher time
                     performance on GPUs (Graphics Processing Units) by
                     exploiting multi-core GPUs of CUDA architecture. We
                     propose optimization strategies to achieve coalesced
                     memory access for graph Fourier transform (GFT)
                     computation and improve the utilization of GPU SM
                     resources for singular value decomposition (SVD)
                     computation. Furthermore, we design a scheme to extend
                     the GPU-optimized implementation to multiple GPUs for
                     large-scale computing. Experimental results show that
                     the GPU implementation is both fast and accurate. On
                     synthetic data of varying sizes, the GPU-optimized
                     implementation running on a single Quadro RTX6000 GPU
                     achieves up to 60.50$ \times $ speedups over the
                     GPU-baseline implementation. The multi-GPU
                     implementation achieves up to 1.81$ \times $ speedups on
                     two GPUs versus the GPU-optimized implementation on a
                     single GPU. On the ego-Facebook dataset, the
                     GPU-optimized implementation achieves up to 77.88$
                     \times $ speedups over the GPU-baseline implementation.
                     Meanwhile, the GPU implementation and the CPU
                     implementation achieve similar, low recovery errors.},
  acknowledgement = ack-nhfb,
  journal-url     = {https://www.mdpi.com/1999-5903/},
  remark          = {Section Smart System Infrastructure and
                     Applications.},
}

@Article{Zhu:2021:POT,
  author =       "Zijie Zhu and Yongxian Wang and Xinghua Cheng",
  title =        "Parallel optimization of three-dimensional
                 wedge-shaped underwater acoustic propagation based on
                 {MPI + OpenMP} hybrid programming model",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "77",
  number =       "5",
  pages =        "4988--5018",
  month =        may,
  year =         "2021",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-020-03466-w",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Fri May 14 09:20:00 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://link.springer.com/article/10.1007/s11227-020-03466-w",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  onlinedate =   "29 October 2020",
}

@article{Agathos:2022:CAA,
  author          = {Spiros N. Agathos and Vassilios V. Dimakopoulos
                     and Ilias K. Kasmeridis},
  title           = {Compiler-assisted, adaptive runtime system for the
                     support of {OpenMP} in embedded multicores},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {110},
  number          = {??},
  pages           = {??--??},
  month           = may,
  year            = {2022},
  coden           = {PACOEJ},
  doi             = {https://doi.org/10.1016/j.parco.2022.102895},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  bibdate         = {Fri Feb 18 10:07:18 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167819122000035},
  acknowledgement = ack-nhfb,
  articleno       = {102895},
  fjournal        = {Parallel Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Bak:2022:OAE,
  author          = {Seonmyeong Bak and Colleen Bertoni and Swen Boehm
                     and Reuben Budiardja and Barbara M. Chapman and
                     Johannes Doerfert and Markus Eisenbach and Hal
                     Finkel and Oscar Hernandez and Joseph Huber and
                     Shintaro Iwasaki and Vivek Kale and Paul R. C. Kent
                     and JaeHyuk Kwack and Meifeng Lin and Piotr
                     Luszczek and Ye Luo and Buu Pham and Swaroop
                     Pophale and Kiran Ravikumar and Vivek Sarkar and
                     Thomas Scogland and Shilei Tian and P. K. Yeung},
  title           = {{OpenMP} application experiences: Porting to
                     accelerated nodes},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {109},
  number          = {??},
  pages           = {??--??},
  month           = mar,
  year            = {2022},
  coden           = {PACOEJ},
  doi             = {https://doi.org/10.1016/j.parco.2021.102856},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  bibdate         = {Fri Feb 18 10:07:18 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167819121001009},
  acknowledgement = ack-nhfb,
  articleno       = {102856},
  fjournal        = {Parallel Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Barai:2022:PMP,
  author          = {Atanu Barai and Yehia Arafa and Stephan Eidenbenz},
  title           = {\pkg{PPT-Multicore}: performance prediction of
                     {OpenMP} applications using reuse profiles and
                     analytical modeling},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {78},
  number          = {2},
  pages           = {2354--2385},
  month           = feb,
  year            = {2022},
  coden           = {JOSUED},
  doi             = {https://doi.org/10.1007/s11227-021-03949-4},
  issn            = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l          = {0920-8542},
  bibdate         = {Mon Feb 28 16:44:34 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {https://link.springer.com/article/10.1007/s11227-021-03949-4},
  acknowledgement = ack-nhfb,
  ajournal        = {J. Supercomputing},
  fjournal        = {The Journal of Supercomputing},
  journal-url     = {http://link.springer.com/journal/11227},
}

@article{Bouhrour:2022:TLC,
  author          = {Stephane Bouhrour and Thibaut Pepin and Julien
                     Jaeger},
  title           = {Towards leveraging collective performance with the
                     support of {MPI 4.0} features in {MPC}},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {109},
  number          = {??},
  pages           = {??--??},
  month           = mar,
  year            = {2022},
  coden           = {PACOEJ},
  doi             = {https://doi.org/10.1016/j.parco.2021.102860},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  bibdate         = {Fri Feb 18 10:07:18 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167819121001034},
  acknowledgement = ack-nhfb,
  articleno       = {102860},
  fjournal        = {Parallel Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
}

@Article{Delmas:2022:MGI,
  author =       "Vincent Delmas and Azzedine Soula{\"\i}mani",
  title =        "Multi-{GPU} implementation of a time-explicit finite
                 volume solver using {CUDA} and a {CUDA}-Aware version
                 of {OpenMPI} with application to shallow water flows",
  journal =      j-COMP-PHYS-COMM,
  volume =       "271",
  number =       "??",
  pages =        "Article 108190",
  month =        feb,
  year =         "2022",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2021.108190",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Mon Dec 20 16:41:52 MST 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465521003027",
  acknowledgement = ack-nhfb,
  articleno =    "108190",
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@article{Dichev:2022:PLR,
  author          = {Kiril Dichev and Daniele {De Sensi} and Dimitrios
                     S. Nikolopoulos and Kirk W. Cameron and Ivor
                     Spence},
  title           = {{Power Log n Roll}: Power-Efficient Localized
                     Rollback for {MPI} Applications Using Message
                     Logging Protocols},
  journal         = j-IEEE-TRANS-PAR-DIST-SYS,
  volume          = {33},
  number          = {6},
  pages           = {1276--1288},
  month           = jun,
  year            = {2022},
  coden           = {ITDSEO},
  doi             = {https://doi.org/10.1109/TPDS.2021.3107745},
  issn            = {1045-9219 (print), 1558-2183 (electronic)},
  issn-l          = {1045-9219},
  bibdate         = {Tue Nov 9 11:11:37 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Transactions on Parallel and Distributed
                     Systems},
  journal-url     = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71},
}

@article{Du:2022:MPO,
  author          = {Qi Du and Hui Huang},
  title           = {{MPI} parameter optimization during debugging
                     phase of {HPC} system},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {78},
  number          = {2},
  pages           = {1696--1711},
  month           = feb,
  year            = {2022},
  coden           = {JOSUED},
  doi             = {https://doi.org/10.1007/s11227-021-03939-6},
  issn            = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l          = {0920-8542},
  bibdate         = {Mon Feb 28 16:44:34 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {https://link.springer.com/article/10.1007/s11227-021-03939-6},
  acknowledgement = ack-nhfb,
  ajournal        = {J. Supercomputing},
  fjournal        = {The Journal of Supercomputing},
  journal-url     = {http://link.springer.com/journal/11227},
}

@article{Gonzalez-Dominguez:2022:MDP,
  author          = {Jorge Gonz{\'a}lez-Dom{\'\i}nguez and Jos{\'e} M.
                     Mart{\'\i}n-Mart{\'\i}nez and Roberto R.
                     Exp{\'o}sito},
  title           = {\pkg{MPI-dot2dot}: A parallel tool to find {DNA}
                     tandem repeats on multicore clusters},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {78},
  number          = {3},
  pages           = {4217--4235},
  month           = feb,
  year            = {2022},
  coden           = {JOSUED},
  doi             = {https://doi.org/10.1007/s11227-021-04025-7},
  issn            = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l          = {0920-8542},
  bibdate         = {Mon Feb 28 16:44:34 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {https://link.springer.com/article/10.1007/s11227-021-04025-7},
  acknowledgement = ack-nhfb,
  ajournal        = {J. Supercomputing},
  fjournal        = {The Journal of Supercomputing},
  journal-url     = {http://link.springer.com/journal/11227},
}

@article{Haghi:2022:RSH,
  author          = {Pouya Haghi and Anqi Guo and Qingqing Xiong and
                     Chen Yang and Tong Geng and Justin T. Broaddus and
                     Ryan Marshall and Derek Schafer and Anthony
                     Skjellum and Martin C. Herbordt},
  title           = {Reconfigurable switches for high performance and
                     flexible {MPI} collectives},
  journal         = j-CCPE,
  volume          = {34},
  number          = {6},
  pages           = {e6769:1--e6769:??},
  day             = {10},
  month           = mar,
  year            = {2022},
  coden           = {CCPEBO},
  doi             = {https://doi.org/10.1002/cpe.6769},
  issn            = {1532-0626 (print), 1532-0634 (electronic)},
  issn-l          = {1532-0626},
  bibdate         = {Tue Feb 22 09:50:09 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Concurrency Computat., Pract. Exper.},
  fjournal        = {Concurrency and Computation: Practice and
                     Experience},
  journal-url     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {12 December 2021},
}

@Article{Huckelheim:2022:SSA,
  author =       "Jan H{\"u}ckelheim and Laurent Hasco{\"e}t",
  title =        "Source-to-Source Automatic Differentiation of {OpenMP}
                 Parallel Loops",
  journal =      j-TOMS,
  volume =       "48",
  number =       "1",
  pages =        "7:1--7:32",
  month =        mar,
  year =         "2022",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3472796",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Thu Feb 17 08:00:57 MST 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3472796",
  abstract =     "This article presents our work toward correct and
                 efficient automatic differentiation of OpenMP parallel
                 worksharing loops in forward and reverse mode.
                 Automatic differentiation is a method to obtain
                 gradients of numerical programs, which are crucial in
                 optimization, uncertainty quantification, and machine
                 learning. The computational cost to compute gradients
                 is a common bottleneck in practice. For applications
                 that are parallelized for multicore CPUs or GPUs using
                 OpenMP, one also wishes to compute the gradients in
                 parallel. We propose a framework to reason about the
                 correctness of the generated derivative code, from
                 which we justify our OpenMP extension to the
                 differentiation model. We implement this model in the
                 automatic differentiation tool Tapenade and present
                 test cases that are differentiated following our
                 extended differentiation procedure. Performance of the
                 generated derivative programs in forward and reverse
                 mode is better than sequential, although our reverse
                 mode often scales worse than the input programs.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "https://dl.acm.org/loi/toms",
}

@article{Jani:2022:HST,
  author          = {Kunal Jani and Ankit Kumar and Ronak Nahata},
  title           = {\pkg{Hpcfolder}: a simple tool used to parallelize
                     algorithms using the message passing interface
                     {(MPI)}},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {78},
  number          = {1},
  pages           = {258--278},
  month           = jan,
  year            = {2022},
  coden           = {JOSUED},
  doi             = {https://doi.org/10.1007/s11227-021-03896-0},
  issn            = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l          = {0920-8542},
  bibdate         = {Mon Feb 28 16:44:33 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {https://link.springer.com/article/10.1007/s11227-021-03896-0},
  acknowledgement = ack-nhfb,
  ajournal        = {J. Supercomputing},
  fjournal        = {The Journal of Supercomputing},
  journal-url     = {http://link.springer.com/journal/11227},
}

@article{Janssen:2022:GPU,
  author          = {Dylan M. Janssen and Wayne Pullan and Alan
                     Wee-Chung Liew},
  title           = {Graphics processing unit acceleration of the
                     island model genetic algorithm using the {CUDA}
                     programming platform},
  journal         = j-CCPE,
  volume          = {34},
  number          = {2},
  pages           = {e6286:1--e6286:??},
  day             = {25},
  month           = jan,
  year            = {2022},
  coden           = {CCPEBO},
  doi             = {https://doi.org/10.1002/cpe.6286},
  issn            = {1532-0626 (print), 1532-0634 (electronic)},
  issn-l          = {1532-0626},
  bibdate         = {Tue Feb 22 09:50:05 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Concurrency Computat., Pract. Exper.},
  fjournal        = {Concurrency and Computation: Practice and
                     Experience},
  journal-url     = {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate      = {31 March 2021},
}

@article{Li:2022:CDC,
  author          = {Wentao Li and Zhiwen Chen and Xin He and Guoyun
                     Duan and Jianhua Sun and Hao Chen},
  title           = {{CVFuzz}: Detecting complexity vulnerabilities in
                     {OpenCL} kernels via automated pathological input
                     generation},
  journal         = j-FUT-GEN-COMP-SYS,
  volume          = {127},
  number          = {??},
  pages           = {384--395},
  month           = feb,
  year            = {2022},
  coden           = {FGSEVI},
  doi             = {https://doi.org/10.1016/j.future.2021.09.006},
  issn            = {0167-739X (print), 1872-7115 (electronic)},
  issn-l          = {0167-739X},
  bibdate         = {Wed Feb 9 09:07:25 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167739X21003526},
  acknowledgement = ack-nhfb,
  fjournal        = {Future Generation Computer Systems},
  journal-url     = {http://www.sciencedirect.com/science/journal/0167739X},
}

@article{Meyer:2022:DFA,
  author          = {Marius Meyer and Tobias Kenter and Christian
                     Plessl},
  title           = {In-depth {FPGA} accelerator performance evaluation
                     with single node benchmarks from the {HPC}
                     challenge benchmark suite for {Intel} and {Xilinx}
                     {FPGAs} using {OpenCL}},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {160},
  number          = {??},
  pages           = {79--89},
  month           = feb,
  year            = {2022},
  coden           = {JPDCER},
  doi             = {https://doi.org/10.1016/j.jpdc.2021.10.007},
  issn            = {0743-7315 (print), 1096-0848 (electronic)},
  issn-l          = {0743-7315},
  bibdate         = {Thu Feb 10 06:39:24 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0743731521002057},
  acknowledgement = ack-nhfb,
  fjournal        = {Journal of Parallel and Distributed Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/07437315},
}

@article{Protze:2022:MDT,
  author          = {Joachim Protze and Marc-Andr{\'e} Hermanns and
                     Matthias S. M{\"u}ller and Van Man Nguyen and
                     Julien Jaeger and Emmanuelle Saillard and Patrick
                     Carribault and Denis Barthou},
  title           = {{MPI} detach --- Towards automatic asynchronous
                     local completion},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {109},
  number          = {??},
  pages           = {??--??},
  month           = mar,
  year            = {2022},
  coden           = {PACOEJ},
  doi             = {https://doi.org/10.1016/j.parco.2021.102859},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  bibdate         = {Fri Feb 18 10:07:18 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167819121001022},
  acknowledgement = ack-nhfb,
  articleno       = {102859},
  fjournal        = {Parallel Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
}

@article{Raskovalov:2022:AMD,
  author          = {Anton Raskovalov and Platon Surkov},
  title           = {{azTotMD 2.0}: {Molecular} dynamics with the
                     radiative thermostat and temperature-dependent
                     force field ({CUDA} version)},
  journal         = j-SOFTWAREX,
  volume          = {17},
  number          = {??},
  pages           = {??--??},
  month           = jan,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1016/j.softx.2022.100995},
  issn            = {2352-7110},
  issn-l          = {2352-7110},
  bibdate         = {Mon Feb 28 10:41:25 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     http://www.math.utah.edu/pub/tex/bib/softwarex.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S2352711022000127},
  acknowledgement = ack-nhfb,
  articleno       = {100995},
  fjournal        = {SoftwareX},
  journal-url     = {https://www.sciencedirect.com/journal/softwarex/issues},
}

@article{Rocco:2022:LFR,
  author          = {Roberto Rocco and Davide Gadioli and Gianluca
                     Palermo},
  title           = {\pkg{Legio}: fault resiliency for embarrassingly
                     parallel {MPI} applications},
  journal         = j-J-SUPERCOMPUTING,
  volume          = {78},
  number          = {2},
  pages           = {2175--2195},
  month           = feb,
  year            = {2022},
  coden           = {JOSUED},
  doi             = {https://doi.org/10.1007/s11227-021-03951-w},
  issn            = {0920-8542 (print), 1573-0484 (electronic)},
  issn-l          = {0920-8542},
  bibdate         = {Mon Feb 28 16:44:34 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {https://link.springer.com/article/10.1007/s11227-021-03951-w},
  acknowledgement = ack-nhfb,
  ajournal        = {J. Supercomputing},
  fjournal        = {The Journal of Supercomputing},
  journal-url     = {http://link.springer.com/journal/11227},
}

@article{Smith:2022:PAM,
  author          = {Matthew Smith and Arjen Tamerus and Phil Hasnip},
  title           = {Portable Acceleration of Materials Modeling
                     Software: {CASTEP}, {GPUs}, and {OpenACC}},
  journal         = j-COMPUT-SCI-ENG,
  volume          = {24},
  number          = {1},
  pages           = {46--55},
  month           = jan # {\slash } # feb,
  year            = {2022},
  coden           = {CSENFA},
  doi             = {https://doi.org/10.1109/MCSE.2022.3141714},
  issn            = {1521-9615 (print), 1558-366X (electronic)},
  issn-l          = {1521-9615},
  bibdate         = {Thu Mar 17 07:23:22 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {Computing in Science and Engineering},
  journal-url     = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992},
}

@article{Zhao:2022:SGM,
  author          = {Chen Zhao and Wu Gao and Feiping Nie and Huiyang
                     Zhou},
  title           = {A Survey of {GPU} Multitasking Methods Supported
                     by Hardware Architecture},
  journal         = j-IEEE-TRANS-PAR-DIST-SYS,
  volume          = {33},
  number          = {6},
  pages           = {1451--1463},
  month           = jun,
  year            = {2022},
  coden           = {ITDSEO},
  doi             = {https://doi.org/10.1109/TPDS.2021.3115630},
  issn            = {1045-9219 (print), 1558-2183 (electronic)},
  issn-l          = {1045-9219},
  bibdate         = {Tue Nov 9 11:11:37 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Transactions on Parallel and Distributed
                     Systems},
  journal-url     = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71},
}

@article{Zhong:2022:ULV,
  author          = {Dong Zhong and Qinglei Cao and George Bosilca and
                     Jack Dongarra},
  title           = {Using long vector extensions for {MPI} reductions},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {109},
  number          = {??},
  pages           = {??--??},
  month           = mar,
  year            = {2022},
  coden           = {PACOEJ},
  doi             = {https://doi.org/10.1016/j.parco.2021.102871},
  issn            = {0167-8191 (print), 1872-7336 (electronic)},
  issn-l          = {0167-8191},
  bibdate         = {Fri Feb 18 10:07:18 MST 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  url             = {http://www.sciencedirect.com/science/article/pii/S0167819121001137},
  acknowledgement = ack-nhfb,
  articleno       = {102871},
  fjournal        = {Parallel Computing},
  journal-url     = {http://www.sciencedirect.com/science/journal/01678191},
}

%%% ====================================================================
%%% Cross-referenced entries must come last; entries are sorted by year,
%%% and then by citation label, with `bibsort --byyear':

@Proceedings{Anonymous:1989:PFC,
  editor =       "Anonymous",
  booktitle =    "{Proceedings of the Fourth Conference on Hypercubes,
                 Concurrent Computers and Applications, 6--8 March 1989,
                 Monterey, CA, USA}",
  title =        "{Proceedings of the Fourth Conference on Hypercubes,
                 Concurrent Computers and Applications, 6--8 March 1989,
                 Monterey, CA, USA}",
  publisher =    "Golden Gate Enterprises",
  address =      "Los Altos, CA, USA",
  pages =        "xiv + 1362",
  year =         "1989",
  LCCN =         "QA76.5.C619215 1989",
  bibdate =      "Sun Dec 22 10:16:53 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Two volumes",
  acknowledgement = ack-nhfb,
  confsponsor =  "D.O.E.; US Air Force; NASA",
}

@proceedings{ACM:1990:PAC,
  editor          = {{ACM}},
  booktitle       = {{Proceedings of the 1990 ACM Conference on LISP
                     and Functional Programming: papers presented at the
                     conference, Nice, France, June 27--29, 1990}},
  title           = {{Proceedings of the 1990 ACM Conference on LISP
                     and Functional Programming: papers presented at the
                     conference, Nice, France, June 27--29, 1990}},
  publisher       = pub-ACM,
  address         = pub-ACM:adr,
  pages           = {viii + 348},
  year            = {1990},
  isbn            = {0-89791-368-X},
  isbn-13         = {978-0-89791-368-3},
  lccn            = {QA 76.73 L23 A24 1990},
  bibdate         = {Wed Apr 16 07:21:40 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note            = {ACM order no. 552900.},
  acknowledgement = ack-nhfb,
  confsponsor     = {ACM},
}

@proceedings{Bhavsar:1991:SSJ,
  editor          = {Virendrakumar Chhabulal Bhavsar and Uday
                     Govinddas Gujar},
  booktitle       = {{Supercomputing Symposium '91, June 3--5, 1991,
                     Fredericton, NB, Canada: symposium proceedings}},
  title           = {{Supercomputing Symposium '91, June 3--5, 1991,
                     Fredericton, NB, Canada: symposium proceedings}},
  publisher       = {University of New Brunswick Press},
  address         = {Fredericton, NB, Canada},
  pages           = {x + 544},
  year            = {1991},
  isbn            = {0-920114-14-8},
  isbn-13         = {978-0-920114-14-8},
  lccn            = {QA76.88.S87 1991},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  corpsource      = {Centre for Dev. of Adv. Comput., Bangalore, India},
  pubcountry      = {Canada},
  treatment       = {P Practical},
}

@proceedings{Durand:1991:HPC,
  editor          = {M. Durand and F. {El Dabaghi}},
  booktitle       = {{High performance computing, II: proceedings of
                     the Second Symposium on High Performance Computing,
                     Montpellier, France, 7--9 October, 1991}},
  title           = {{High performance computing, II: proceedings of
                     the Second Symposium on High Performance Computing,
                     Montpellier, France, 7--9 October, 1991}},
  publisher       = pub-NH,
  address         = pub-NH:adr,
  pages           = {xii + 673},
  year            = {1991},
  isbn            = {0-444-89224-9},
  isbn-13         = {978-0-444-89224-9},
  lccn            = {QA75.5.I585 1991},
  bibdate         = {Sun Dec 22 10:17:16 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  numericalindex  = {Computer speed 2.0E+07 to 6.0E+07 FLOPS},
  pubcountry      = {Netherlands},
}

@proceedings{IEEE:1991:PSA,
  editor          = {{IEEE}},
  key             = {Supercomputing '91},
  booktitle       = {{Proceedings, Supercomputing '91: Albuquerque, New
                     Mexico, November 18--22, 1991}},
  title           = {{Proceedings, Supercomputing '91: Albuquerque, New
                     Mexico, November 18--22, 1991}},
  publisher       = pub-IEEE,
  address         = pub-IEEE:adr,
  pages           = {xxiii + 917},
  year            = {1991},
  isbn            = {0-8186-9158-1 (IEEE: case), 0-8186-2158-3 (IEEE:
                     paper), 0-8186-6158-5 (IEEE: microfiche),
                     0-89791-459-7 (ACM)},
  isbn-13         = {978-0-8186-9158-4 (IEEE: case), 978-0-8186-2158-1
                     (IEEE: paper), 978-0-8186-6158-7 (IEEE:
                     microfiche), 978-0-89791-459-8 (ACM)},
  lccn            = {QA76.5 .S894 1991},
  bibdate         = {Mon Jan 15 11:05:59 1996},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note            = {IEEE catalog no. 91CH3058-5.},
}

@Proceedings{Stout:1991:SDM,
  editor =       {Quentin F. Stout and Michael Joseph Wolfe},
  booktitle =    {{The Sixth Distributed Memory Computing Conference
                 proceedings April 28--May 1, 1991, Portland, Oregon}},
  title =        {{The Sixth Distributed Memory Computing Conference
                 proceedings April 28--May 1, 1991, Portland, Oregon}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xx + 736},
  year =         {1991},
  ISBN =         {0-8186-2291-1},
  ISBN-13 =      {978-0-8186-2291-5},
  LCCN =         {QA76.5 .D58 1991},
  bibdate =      {Tue Jan 16 07:21:24 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:1992:PSE,
  editor =       {Anonymous},
  booktitle =    {{Proceedings SHARE Europe Anniversary Meeting}},
  title =        {{Proceedings SHARE Europe Anniversary Meeting}},
  publisher =    {SHARE Eur. Assoc},
  address =      {Geneva, Switzerland},
  pages =        {752},
  year =         {1992},
  bibdate =      {Sun Dec 22 10:17:40 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  confdate =     {28 Sept.--2 Oct. 1992},
  conflocation = {Davos, Switzerland},
  pubcountry =   {Switzerland},
}

@Proceedings{Dongarra:1992:PFS,
  editor =       "J. Dongarra and K. Kennedy and P. Messina and D. C.
                 Sorensen and R. G. Voigt",
  booktitle =    "{Proceedings of the Fifth SIAM Conference on Parallel
                 Processing for Scientific Computing, 25--27 March 1991,
                 Houston, TX, USA}",
  title =        "{Proceedings of the Fifth SIAM Conference on Parallel
                 Processing for Scientific Computing, 25--27 March 1991,
                 Houston, TX, USA}",
  publisher =    pub-SIAM,
  address =      pub-SIAM:adr,
  pages =        "xvii + 648",
  year =         "1992",
  ISBN =         "0-89871-303-X",
  ISBN-13 =      "978-0-89871-303-9",
  LCCN =         "QA76.58.P76 1992",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  confsponsor =  "SIAM",
}

@Proceedings{Evans:1992:PCP,
  editor =       {D. J. Evans and G. R. Joubert and H. Liddell},
  booktitle =    {{Parallel computing '91: proceedings of the
                 International Conference on Parallel Computing '91,
                 London, UK, 3--6 September 1991}},
  title =        {{Parallel computing '91: proceedings of the
                 International Conference on Parallel Computing '91,
                 London, UK, 3--6 September 1991}},
  volume =       {4},
  publisher =    pub-NH,
  address =      pub-NH:adr,
  pages =        {xi + 628},
  year =         {1992},
  ISBN =         {0-444-89212-5},
  ISBN-13 =      {978-0-444-89212-6},
  LCCN =         {QA76.58.I545 1991},
  bibdate =      {Sun Dec 22 10:17:16 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  series =       {Advances in parallel computing},
  acknowledgement = ack-nhfb,
  confsponsor =  {Elsevier Sci. Publishers; Maspar Comput. Corp.; NCUBE;
                 Office Naval Res. Eur. Office; Transtech},
  numericalindex = {Byte rate 6.0E+06 Byte/s},
  pubcountry =   {Netherlands},
}

@Proceedings{Ferenczi:1992:AHW,
  editor =       "S. Ferenczi",
  booktitle =    "{1st Austrian-Hungarian Workshop on Transputer
                 Applications. Proceedings}",
  title =        "{1st Austrian-Hungarian Workshop on Transputer
                 Applications. Proceedings}",
  publisher =    "Hungarian Acad. of Sci",
  address =      "Budapest, Hungary",
  pages =        "v + 117",
  year =         "1992",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  confdate =     "8--10 Oct. 1992",
  conflocation = "Sopron, Hungary",
  pubcountry =   "Hungary",
}

@Proceedings{IEEE:1992:PSH,
  editor =       {{IEEE}},
  booktitle =    {{Proceedings / Scalable High Performance Computing
                 Conference, SHPCC-92, April 26--29, 1992, Williamsburg,
                 Virginia}},
  title =        {{Proceedings / Scalable High Performance Computing
                 Conference, SHPCC-92, April 26--29, 1992, Williamsburg,
                 Virginia}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xiii + 448},
  year =         {1992},
  ISBN =         {0-8186-2775-1},
  ISBN-13 =      {978-0-8186-2775-0},
  LCCN =         {QA76.76.A65S33 1992},
  bibdate =      {Sun Dec 22 10:17:40 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE catalog no. 92TH0432-5.},
  acknowledgement = ack-nhfb,
  confsponsor =  {IEEE},
}

@Proceedings{Russell:1992:CMW,
  editor =       {Thomas F. Russell and others},
  booktitle =    {{Computational methods in water resources IX:
                 Proceedings of the Ninth International Conference on
                 Computational Methods in Water Resources, held at the
                 University of Colorado, Denver, in June 1992}},
  title =        {{Computational methods in water resources IX:
                 Proceedings of the Ninth International Conference on
                 Computational Methods in Water Resources, held at the
                 University of Colorado, Denver, in June 1992}},
  publisher =    pub-ELSAS,
  address =      pub-ELSAS:adr,
  pages =        {various},
  year =         {1992},
  ISBN =         {1-85166-871-3 (set), 1-85312-169-X (set: Computational
                 Mechanics Publications, Southampton), 1-56252-098-9
                 (set: Computational Mechanics Publications, Boston),
                 1-85166-791-1 (v. 1: Elsevier Applied Science),
                 1-85312-197-5 (v. 1: Computational Mechanics
                 Publications, Southampton), 1-56252-123-3 (v. 1:
                 Computational Mechanics Publications, New York),
                 1-85166-870-5 (v. 2), 1-85312-198-3 (v. 2),
                 1-56252-124-1 (v. 2)},
  ISBN-13 =      {978-1-85166-871-7 (set), 978-1-85312-169-2 (set:
                 Computational Mechanics Publications, Southampton),
                 978-1-56252-098-4 (set: Computational Mechanics
                 Publications, Boston), 978-1-85166-791-8 (v. 1:
                 Elsevier Applied Science), 978-1-85312-197-5 (v. 1:
                 Computational Mechanics Publications, Southampton),
                 978-1-56252-123-3 (v. 1: Computational Mechanics
                 Publications, New York), 978-1-85166-870-0 (v. 2),
                 978-1-85312-198-2 (v. 2), 978-1-56252-124-0 (v. 2)},
  LCCN =         {GB656.2.E42 C65 1992 v.1-2 (c1992)},
  bibdate =      {Mon Jan 15 18:04:49 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {Two volumes.},
  acknowledgement = ack-nhfb,
}

@Proceedings{SCRI:1992:PWC,
  key =          {SCRI WCC'92},
  booktitle =    {{Proceedings of the Workshop on Cluster Computing}},
  title =        {{Proceedings of the Workshop on Cluster Computing}},
  publisher =    pub-SCRI,
  address =      pub-SCRI:adr,
  pages =        {??},
  month =        dec,
  year =         {1992},
  ISBN =         {????},
  ISBN-13 =      {????},
  LCCN =         {????},
  bibdate =      {Tue Jan 16 07:34:08 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {Proceedings available via anonymous ftp from
                 \path=ftp.scri.fsu.edu= in directory
                 \path=pub/parallel-workshop.92=.},
  acknowledgement = ack-nhfb,
}

@Proceedings{Siegel:1992:FFS,
  editor =       "H. J. Siegel",
  booktitle =    "{Frontiers '92, the Fourth Symposium on the Frontiers
                 of Massively Parallel Computation, October 19--21, 1992,
                 McLean, Virginia}",
  title =        "{Frontiers '92, the Fourth Symposium on the Frontiers
                 of Massively Parallel Computation, October 19--21, 1992,
                 McLean, Virginia}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xviii + 592",
  year =         "1992",
  ISBN =         "0-8186-2772-7",
  ISBN-13 =      "978-0-8186-2772-9",
  LCCN =         "QA76.58.S95 1992",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog no. 92CH3185-6.",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE; NASA",
}

@Proceedings{Siegel:1992:FSF,
  editor =       {H. J. Siegel},
  booktitle =    {{The Fourth Symposium on the Frontiers of Massively
                 Parallel Computation: Frontiers '92 / October 19--21,
                 1992, McLean Virginia}},
  title =        {{The Fourth Symposium on the Frontiers of Massively
                 Parallel Computation: Frontiers '92 / October 19--21,
                 1992, McLean Virginia}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xviii + 592},
  year =         {1992},
  ISBN =         {0-8186-2772-7},
  ISBN-13 =      {978-0-8186-2772-9},
  LCCN =         {QA76.58.S95 1992},
  bibdate =      {Wed Apr 16 07:25:17 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE catalog number 92CH3185-6.},
  acknowledgement = ack-nhfb,
}

@Proceedings{Verkerk:1992:PIC,
  editor =       "C. Verkerk and W. Wojcik",
  booktitle =    "{Proceedings of the International Conference on
                 Computing in High Energy Physics '92, Annecy, France,
                 21--25 September 1992}",
  title =        "{Proceedings of the International Conference on
                 Computing in High Energy Physics '92, Annecy, France,
                 21--25 September 1992}",
  publisher =    "CERN",
  address =      "Geneva, Switzerland",
  pages =        "xxiii + 916",
  year =         "1992",
  ISBN =         "92-9083-049-2",
  ISBN-13 =      "978-92-9083-049-8",
  LCCN =         "QC783.3 C65 1992",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "CERN report 92-07.",
  acknowledgement = ack-nhfb,
  pubcountry =   "Switzerland",
}

@Proceedings{Anonymous:1993:ATA,
  editor =       {Anonymous},
  booktitle =    {{Automotive technology and automation: Supercomputer
                 applications in the automotive industries: 26th
                 International symposium --- September 1993, Aachen,
                 Germany}},
  title =        {{Automotive technology and automation: Supercomputer
                 applications in the automotive industries: 26th
                 International symposium --- September 1993, Aachen,
                 Germany}},
  publisher =    {Automotive Automation Ltd},
  address =      {Croydon, UK},
  pages =        {????},
  year =         {1993},
  ISBN =         {0-947719-62-8},
  ISBN-13 =      {978-0-947719-62-3},
  LCCN =         {????},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  series =       {ISATA --- Proceedings --- 26th},
  acknowledgement = ack-nhfb,
  sponsor =      {ISATA. ENEA; Agency: Italy.},
}

@Proceedings{Anonymous:1993:CDP,
  editor =       {Anonymous},
  booktitle =    {{The commercial dimensions of parallel computing:
                 UNICOM seminar --- April 1993, London}},
  title =        {{The commercial dimensions of parallel computing:
                 UNICOM seminar --- April 1993, London}},
  publisher =    {Unicom Seminars Ltd},
  address =      {????},
  pages =        {????},
  year =         {1993},
  ISBN =         {????},
  ISBN-13 =      {????},
  LCCN =         {????},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:1993:ISA,
  editor =       {Anonymous},
  booktitle =    {{International section: Annual conference ---
                 September 1993, Gallipoli, Italy}},
  title =        {{International section: Annual conference ---
                 September 1993, Gallipoli, Italy}},
  publisher =    {AICA},
  address =      {????},
  pages =        {????},
  year =         {1993},
  ISBN =         {????},
  ISBN-13 =      {????},
  LCCN =         {????},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  series =       {Atti del Congresso Annuale --- Associazione Italiana
                 per l'Informatica ed il Calcolo Automatico 1993},
  acknowledgement = ack-nhfb,
  sponsor =      {Italian Association for Informatics and Automatic
                 Computation.},
}

@Proceedings{Anonymous:1993:JFI,
  editor =       {Anonymous},
  booktitle =    {{Joint framework for information technology: Technical
                 conference --- March 1993, Keele}},
  title =        {{Joint framework for information technology: Technical
                 conference --- March 1993, Keele}},
  publisher =    {Dept. of Trade and Industry, Information and
                 Manufacturing Division},
  address =      {London, UK},
  pages =        {????},
  year =         {1993},
  ISBN =         {????},
  ISBN-13 =      {????},
  LCCN =         {????},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  series =       {JFIT Technical Conference Digest},
  acknowledgement = ack-nhfb,
  sponsor =      {Great Britain; Department of Trade and Industry.
                 Science and Engineering Research Council.},
}

@Proceedings{Anonymous:1993:PSE,
  editor =       {Anonymous},
  booktitle =    {{Proceedings. SHARE Europe Anniversary Meeting.
                 Client/Server --- the Promise and the Reality: October
                 25--28, 1993, the Hague, the Netherlands}},
  title =        {{Proceedings. SHARE Europe Anniversary Meeting.
                 Client/Server --- the Promise and the Reality: October
                 25--28, 1993, the Hague, the Netherlands}},
  publisher =    {SHARE Europe},
  address =      {Geneva, Switzerland},
  pages =        {xxi + 1002},
  year =         {1993},
  ISBN =         {????},
  ISBN-13 =      {????},
  ISSN =         {0254-6213},
  LCCN =         {????},
  bibdate =      {Wed Apr 16 11:45:17 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:1993:SEC,
  editor =       {Anonymous},
  booktitle =    {{Supercomputing Europe '93. Conference Papers}},
  title =        {{Supercomputing Europe '93. Conference Papers}},
  publisher =    {Royal Dutch Fairs},
  address =      {Utrecht, Netherlands},
  pages =        {251},
  year =         {1993},
  ISBN =         {????},
  ISBN-13 =      {????},
  LCCN =         {????},
  bibdate =      {Sun Dec 22 10:17:40 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {C5220P (Parallel architecture); C5440
                 (Multiprocessor systems and techniques); C7300 (Natural
                 sciences); C7400 (Engineering)},
  confdate =     {22--24 Feb. 1993},
  conflocation = {Utrecht, Netherlands},
  keywords =     {Aerospace applications; High Performance Fortran;
                 Parallel architectures; Parallel software; Scientific
                 applications; Scientific visualisation; Superconducting
                 environments; Workstation clusters},
  pubcountry =   {Netherlands},
  thesaurus =    {Engineering computing; Natural sciences computing;
                 Parallel architectures; Parallel processing; Software
                 engineering},
}

@Proceedings{Bhargava:1993:PIW,
  editor =       {Bharat Bhargava},
  booktitle =    {{Proceedings of the IEEE Workshop on Advances in
                 Parallel and Distributed Systems, October 6, 1993,
                 Princeton, New Jersey}},
  title =        {{Proceedings of the IEEE Workshop on Advances in
                 Parallel and Distributed Systems, October 6, 1993,
                 Princeton, New Jersey}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {viii + 170},
  year =         {1993},
  ISBN =         {0-8186-5250-0, 0-8186-5251-9},
  ISBN-13 =      {978-0-8186-5250-9, 978-0-8186-5251-6},
  LCCN =         {QA76.58.I444 1993},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  sponsor =      {IEEE. Computer Society. Technical Committee on
                 Distributed Processing.},
}

@Proceedings{Brebbia:1993:ASE,
  editor =       "C. A. Brebbia and H. Power",
  booktitle =    "{Applications of Supercomputers in Engineering III,
                 27--29 September 1993, Bath, UK}",
  title =        "{Applications of Supercomputers in Engineering III,
                 27--29 September 1993, Bath, UK}",
  publisher =    "Computational Mechanics Publications",
  address =      "London, UK",
  pages =        "561",
  year =         "1993",
  ISBN =         "1-85312-236-X",
  ISBN-13 =      "978-1-85312-236-1",
  LCCN =         "TA345.I556 1993",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Gawman:1993:PCT,
  editor =       {Ann Gawman and W. Morven Gentleman and E. Kidd and
                 Per-{\AA}ke Larson and J. Slonim},
  booktitle =    {{Proceedings CASCON '93: Toronto, Ontario, Canada,
                 24--28 October 1993}},
  title =        {{Proceedings CASCON '93: Toronto, Ontario, Canada,
                 24--28 October 1993}},
  publisher =    {Nat. Res. Council of Canada},
  address =      {Ottawa, Ont., Canada},
  pages =        {xx + 1180},
  year =         {1993},
  ISBN =         {????},
  ISBN-13 =      {????},
  LCCN =         {QA76.76.S64 C378 1993 v.1-2},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {Two volumes.},
  acknowledgement = ack-nhfb,
  pubcountry =   {Canada},
}

@Proceedings{Grebe:1993:TAS,
  editor =       {R. Grebe and J. Hektor and S. C. Hilton and M. R. Jane
                 and P. H. Welch},
  booktitle =    {{Transputer applications and systems '93: proceedings
                 of the 1993 World Transputer Congress, 20--22 September
                 1993, Aachen, Germany}},
  title =        {{Transputer applications and systems '93: proceedings
                 of the 1993 World Transputer Congress, 20--22 September
                 1993, Aachen, Germany}},
  publisher =    pub-IOS,
  address =      pub-IOS:adr,
  pages =        {1317},
  year =         {1993},
  ISBN =         {90-5199-140-1},
  ISBN-13 =      {978-90-5199-140-6},
  LCCN =         {????},
  bibdate =      {Wed Apr 16 11:39:32 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  confdate =     {20--22 Sept. 1993},
  conflocation = {Aachen, Germany},
  pubcountry =   {Netherlands},
}

@Proceedings{Hoffmann:1993:PFE,
  editor =       {Geerd-R. Hoffmann and Tuomo Kauranne},
  booktitle =    {{Proceedings of the Fifth ECMWF Workshop on the Use of
                 Parallel Processors in Meteorology. Parallel
                 Supercomputing in Atmospheric Science}},
  title =        {{Proceedings of the Fifth ECMWF Workshop on the Use of
                 Parallel Processors in Meteorology. Parallel
                 Supercomputing in Atmospheric Science}},
  publisher =    pub-WORLD-SCI,
  address =      pub-WORLD-SCI:adr,
  pages =        {ix + 532},
  year =         {1993},
  ISBN =         {981-02-1429-4},
  ISBN-13 =      {978-981-02-1429-6},
  LCCN =         {QA76.58 E354 1992},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  confdate =     {23--27 Nov. 1992},
  conflocation = {Reading, UK},
  pubcountry =   {Singapore},
}

@Proceedings{IEEE:1993:DPC,
  editor =       {{IEEE}},
  booktitle =    {{Digest of papers: Compcon spring '93, San Francisco,
                 California, February 22--26, 1993}},
  title =        {{Digest of papers: Compcon spring '93, San Francisco,
                 California, February 22--26, 1993}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xv + 609},
  year =         {1993},
  ISBN =         {0-8186-3400-6},
  ISBN-13 =      {978-0-8186-3400-0},
  LCCN =         {QA75.5.C58 1993},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE catalog no. 93CH3251-6.},
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1993:PFW,
  editor =       {{IEEE}},
  booktitle =    {{Proceedings of the Fourth Workshop on Future Trends
                 of Distributed Computing Systems, September 22--24,
                 1993, Lisbon, Portugal}},
  title =        {{Proceedings of the Fourth Workshop on Future Trends
                 of Distributed Computing Systems, September 22--24,
                 1993, Lisbon, Portugal}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {x + 485},
  year =         {1993},
  ISBN =         {0-8186-4430-3},
  ISBN-13 =      {978-0-8186-4430-6},
  LCCN =         {QA76.9.D5I335 1993},
  bibdate =      {Sun Dec 22 10:19:23 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE catalog no. 93TH0574-4.},
  acknowledgement = ack-nhfb,
  confsponsor =  {IEEE Comput. Soc. Tech. Committee on Distributed
                 Process},
}

@Proceedings{IEEE:1993:PIS,
  editor =       {{IEEE}},
  booktitle =    {{Proceedings of the 2nd International Symposium on
                 High Performance Distributed Computing, July 20--23,
                 1993, Spokane, Washington, Cavanaugh's Inn at the
                 Park}},
  title =        {{Proceedings of the 2nd International Symposium on
                 High Performance Distributed Computing, July 20--23,
                 1993, Spokane, Washington, Cavanaugh's Inn at the
                 Park}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xiv + 353},
  year =         {1993},
  ISBN =         {0-8186-3900-8, 0-8186-3901-6},
  ISBN-13 =      {978-0-8186-3900-5, 978-0-8186-3901-2},
  LCCN =         {QA76.9.D5I593 1993},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE catalog no. 93TH0550-4.},
  series =       {Proceedings of the International Symposium on High
                 Performance Distributed Computing 2nd},
  acknowledgement = ack-nhfb,
  sponsor =      {IEEE Computer Society. Syracuse University; Northeast
                 Parallel Architectures Center. Washington State
                 University.},
}

@Proceedings{IEEE:1993:PSI,
  editor =       {{IEEE}},
  booktitle =    {{Proceedings / Seventh International Parallel
                 Processing Symposium, April 13--16, 1993, Newport
                 Beach, California}},
  title =        {{Proceedings / Seventh International Parallel
                 Processing Symposium, April 13--16, 1993, Newport
                 Beach, California}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xx + 858},
  year =         {1993},
  ISBN =         {0-8186-3442-1},
  ISBN-13 =      {978-0-8186-3442-0},
  LCCN =         {QA 76.58 I56 1993},
  bibdate =      {Sun Dec 22 10:18:08 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE catalog no. 93TH0513-2.},
  acknowledgement = ack-nhfb,
  confsponsor =  {IEEE Comput. Soc.; ACM Sigarch},
}

@Proceedings{IEEE:1993:PSP,
  editor =       {{IEEE}},
  key =          {Supercomputing'93},
  booktitle =    {{Proceedings, Supercomputing '93: Portland, Oregon,
                 November 15--19, 1993}},
  title =        {{Proceedings, Supercomputing '93: Portland, Oregon,
                 November 15--19, 1993}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xxii + 935},
  year =         {1993},
  ISBN =         {0-8186-4340-4 (paperback), 0-8186-4341-2 (microfiche),
                 0-8186-4342-0 (hardback), 0-8186-4346-3 (CD-ROM)},
  ISBN-13 =      {978-0-8186-4340-8 (paperback), 978-0-8186-4341-5
                 (microfiche), 978-0-8186-4342-2 (hardback),
                 978-0-8186-4346-0 (CD-ROM)},
  ISSN =         {1063-9535},
  LCCN =         {QA76.5 .S96 1993},
  bibdate =      {Mon Jan 15 11:06:21 1996},
  bibsource =    {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  classification = {631.1; 722.1; 722.3; 722.4; 723.2; 921.6},
  keywords =     {Algorithms; Cache coherence; Clustered workstations;
                 Computer graphics; Computer networks; Computer
                 programming languages; Data parallel compilers; Data
                 partitioning; Distributed computer systems; Eigenvalues
                 and eigenfunctions; Finite element method; Flow
                 visualization; Fluid mechanics; Linear algebra; Mass
                 storage; Massively parallel processors; Natural
                 sciences computing; Parallel languages; Parallel
                 processing systems; Parallel rendering; Program
                 compilers; Quantum theory; Scheduling; Sparse matrices;
                 Supercomputers},
  sponsor =      {Institute of Electrical and Electronics Engineers;
                 Computer Society. Association for Computing Machinery;
                 SIGARCH.},
}

@Proceedings{IEEE:1993:WHP,
  editor =       {{IEEE}},
  key =          {WHP'92},
  booktitle =    {{Workshop on Heterogeneous Processing (1992: Beverly
                 Hills, Calif.) Proceedings / Workshop on Heterogeneous
                 Processing, March 23, 1992, Beverly Hills,
                 California}},
  title =        {{Workshop on Heterogeneous Processing (1992: Beverly
                 Hills, Calif.) Proceedings / Workshop on Heterogeneous
                 Processing, March 23, 1992, Beverly Hills,
                 California}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {ix + 101},
  year =         {1993},
  ISBN =         {0-8186-2702-6},
  ISBN-13 =      {978-0-8186-2702-6},
  LCCN =         {QA76.58 .W654 1992},
  bibdate =      {Tue Jan 16 07:27:01 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Proceedings{Kowalik:1993:SPC,
  editor =       {Janusz S. Kowalik and Lucio Grandinetti},
  booktitle =    {{Software for parallel computation: Proceedings of the
                 NATO Advanced Workshop on Software for Parallel
                 Computation, held at Cetraro, Cosenza, Italy, June
                 22--26, 1992}},
  title =        {{Software for parallel computation: Proceedings of the
                 NATO Advanced Workshop on Software for Parallel
                 Computation, held at Cetraro, Cosenza, Italy, June
                 22--26, 1992}},
  volume =       {106},
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        {ix + 363},
  year =         {1993},
  ISBN =         {3-540-56451-9 (Berlin), 0-387-56451-9 (New York)},
  ISBN-13 =      {978-3-540-56451-5 (Berlin), 978-0-387-56451-7 (New
                 York)},
  LCCN =         {QA76.58 .S629 1993},
  bibdate =      {Sat Feb 24 09:43:28 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  series =       {NATO ASI series. Series F, Computer and systems
                 sciences},
  acknowledgement = ack-nhfb,
}

@Proceedings{Law:1993:EDM,
  editor =       {K. H. Law and R. E. Fulton and others},
  booktitle =    {{Engineering data management: key to success in a
                 global market: proceedings of the 1993 ASME
                 International Computers in Engineering Conference and
                 Exposition, August 8--12, San Diego, California}},
  title =        {{Engineering data management: key to success in a
                 global market: proceedings of the 1993 ASME
                 International Computers in Engineering Conference and
                 Exposition, August 8--12, San Diego, California}},
  publisher =    pub-ASME,
  address =      pub-ASME:adr,
  pages =        {vi + 273},
  year =         {1993},
  ISBN =         {0-7918-1169-7},
  ISBN-13 =      {978-0-7918-1169-6},
  LCCN =         {TA345.A86 1993},
  bibdate =      {Thu Feb 29 17:59:11 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  series =       {COMPUTERS IN ENGINEERING VOL COM},
  acknowledgement = ack-nhfb,
  sponsor =      {ASME; Computers in Engineering Division.},
}

@Proceedings{Mudge:1993:PTS,
  editor =       "T. N. Mudge and V. Milutinovic and L. Hunter",
  booktitle =    "{Proceedings of the Twenty-Sixth Hawaii International
                 Conference on System Sciences (HICSS-26), held in
                 Wailea, Hawaii in January 5--8, 1993}",
  title =        "{Proceedings of the Twenty-Sixth Hawaii International
                 Conference on System Sciences (HICSS-26), held in
                 Wailea, Hawaii in January 5--8, 1993}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xvi + 895 (vol. 1), xiv + 691 (vol. 2), xii + 654
                 (vol. 3), xv + 889 (vol. 4)",
  year =         "1993",
  ISBN =         "0-8186-3230-5",
  ISBN-13 =      "978-0-8186-3230-3",
  LCCN =         "????",
  bibdate =      "Wed Apr 16 11:35:41 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Four volumes. IEEE catalog number 93TH0501-7.",
  acknowledgement = ack-nhfb,
}

@Proceedings{Schill:1993:DOD,
  editor =       "Alexander Schill",
  booktitle =    "{DCE} --- the {OSF} distributed computing environment:
                 client\slash server model and beyond: {International
                 DCE Workshop, Karlsruhe, Germany, October 7--8, 1993:
                 proceedings}",
  title =        "{DCE} --- the {OSF} distributed computing environment:
                 client\slash server model and beyond: {International
                 DCE Workshop, Karlsruhe, Germany, October 7--8, 1993:
                 proceedings}",
  volume =       "731",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "283",
  year =         "1993",
  ISBN =         "3-540-57306-2, 0-387-57306-2",
  ISBN-13 =      "978-3-540-57306-7, 978-0-387-57306-9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.9.C55I58 1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  sponsor =      "German Association of Computer Science.",
}

@Proceedings{Sincovec:1993:SCP,
  editor =       {Richard F. Sincovec},
  booktitle =    {{SIAM Conference on Parallel Processing for Scientific
                 Computing (6th: 1993: Norfolk, VA, USA)}},
  title =        {{SIAM Conference on Parallel Processing for Scientific
                 Computing (6th: 1993: Norfolk, VA, USA)}},
  publisher =    pub-SIAM,
  address =      pub-SIAM:adr,
  pages =        {xix + 1041 + iv},
  year =         {1993},
  ISBN =         {0-89871-315-3},
  ISBN-13 =      {978-0-89871-315-2},
  LCCN =         {QA 76.58 S55 1993},
  bibdate =      {Wed Aug 14 10:36:11 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {Two volumes.},
  acknowledgement = ack-nhfb,
  sponsor =      {Society for Industrial and Applied Mathematics.},
}

@Proceedings{Volkert:1993:PCS,
  editor =       "Jens Volkert",
  title =        "{Parallel computation: Second International ACPC
                 Conference, Gmunden, Austria, October 4--6, 1993:
                 proceedings}",
  booktitle =    "{Parallel computation: Second International ACPC
                 Conference, Gmunden, Austria, October 4--6, 1993:
                 proceedings}",
  series =       ser-LNCS,
  volume =       "734",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  year =         "1993",
  pages =        "viii + 248",
  ISBN =         "3-540-57314-3 (Berlin), 0-387-57314-3 (New York)",
  ISBN-13 =      "978-3-540-57314-2 (Berlin), 978-0-387-57314-4 (New
                 York)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA267.A1 L43 no.734",
  price =        "DM58.00",
  keywords =     "parallel processing (electronic computers) --
                 congresses",
  sponsor =      "Austrian Center for Parallel Computation.",
  bibdate =      "Wed Apr 16 11:41:47 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Yelon:1993:PTS,
  editor =       "W. B. Yelon and others",
  title =        "{Proceedings of the Thirty-seventh Annual Conference
                 on Magnetism and Magnetic Materials: December 1--4,
                 1992, Houston, Texas}",
  booktitle =    "{Proceedings of the Thirty-seventh Annual Conference
                 on Magnetism and Magnetic Materials: December 1--4,
                 1992, Houston, Texas}",
  series =       j-J-APPL-PHYS,
  volume =       "73(10)",
  publisher =    pub-AIP,
  address =      pub-AIP:adr,
  month =        may,
  year =         "1993",
  pages =        "5309--7023",
  CODEN =        "JAPIAU",
  ISBN =         "1-56396-212-8",
  ISBN-13 =      "978-1-56396-212-7",
  ISSN =         "0021-8979 (print), 1089-7550 (electronic), 1520-8850",
  LCCN =         "QC753 .C748 1990",
  note =         "Two volumes.",
  confsponsor =  "AIP; IEEE",
  bibdate =      "Sun Dec 22 10:17:40 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:1994:CPI,
  editor =       "{ACM}",
  title =        "{Conference Proceedings. 1994 International Conference
                 on Supercomputing}",
  booktitle =    "{Conference Proceedings. 1994 International Conference
                 on Supercomputing}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  year =         "1994",
  pages =        "xii + 439",
  ISBN =         "0-89791-665-4",
  ISBN-13 =      "978-0-89791-665-3",
  LCCN =         "????",
  URL =          "http://www.acm.org/pubs/contents/proceedings/supercomputing/181181/",
  confdate =     "11--15 July 1994",
  conflocation = "Manchester, UK",
  confsponsor =  "ACM",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Agrawal:1994:PIC,
  editor =       "Dharma P. Agrawal and K. C. (Kuo Chung) Tai and
                 Jagdish Chandra",
  title =        "{Proceedings of the 1994 International Conference on
                 Parallel Processing, August 15--19, 1994. Vol 3:
                 Algorithms and applications}",
  booktitle =    "{Proceedings of the 1994 International Conference on
                 Parallel Processing, August 15--19, 1994. Vol 3:
                 Algorithms and applications}",
  publisher =    pub-CRC,
  address =      pub-CRC:adr,
  year =         "1994",
  pages =        "xvii + 301 (vol. 1), xviii + 323 (vol. 2), 297 (vol.
                 3)",
  ISBN =         "0-8493-2496-3, 0-8493-2495-5",
  ISBN-13 =      "978-0-8493-2496-3, 978-0-8493-2495-6",
  ISSN =         "0190-3918",
  LCCN =         "QA 76.58 I55 1994",
  note =         "Three volumes.",
  bibdate =      "Wed Aug 14 10:37:00 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:1994:FWR,
  editor =       "Anonymous",
  booktitle =    "{Forschung und wissenschaftliches Rechnen: Beitr{\"a}ge
                 anl{\"a}sslich des 10. EDV-Benutzertreffens der
                 Max-Planck-Gesellschaft in G{\"o}ttingen, November
                 1993}",
  title =        "{Forschung und wissenschaftliches Rechnen: Beitr{\"a}ge
                 anl{\"a}sslich des 10. EDV-Benutzertreffens der
                 Max-Planck-Gesellschaft in G{\"o}ttingen, November
                 1993}",
  number =       "1",
  publisher =    "Max-Planck-Gesellschaft",
  address =      "M{\"u}nchen, Germany",
  pages =        "270",
  year =         "1994",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0341-7778",
  LCCN =         "Q180.55.E4 M39 1993",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Berichte und Mitteilungen --- Max Planck
                 Gesellschaft",
  acknowledgement = ack-nhfb,
  sponsor =      "Max-Planck-Gesellschaft.",
}

@Proceedings{Anonymous:1994:ICS,
  editor =       "Anonymous",
  booktitle =    "{1994 International Computer Symposium Conference
                 Proceedings}",
  title =        "{1994 International Computer Symposium Conference
                 Proceedings}",
  publisher =    "National Chiao Tung University",
  address =      "Hsinchu, Taiwan",
  pages =        "xvi + 1310",
  year =         "1994",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Two volumes.",
  acknowledgement = ack-nhfb,
  confdate =     "12--15 Dec. 1994",
  conflocation = "Hsinchu, Taiwan",
  confsponsor =  "Ministr. Educ.; Comput. Soc",
  pubcountry =   "Taiwan",
}

@Proceedings{Anonymous:1994:PDC,
  editor =       "Anonymous",
  title =        "{Parallel and distributed computing systems:
                 proceedings of the ISCA International Conference, Las
                 Vegas, Nevada, U.S.A., October 6--8, 1994}",
  booktitle =    "{Parallel and distributed computing systems:
                 proceedings of the ISCA International Conference, Las
                 Vegas, Nevada, U.S.A., October 6--8, 1994}",
  publisher =    "ISCA",
  address =      "Raleigh, NC, USA",
  year =         "1994",
  pages =        "x + 870",
  ISBN =         "1-880843-09-9",
  ISBN-13 =      "978-1-880843-09-3",
  LCCN =         "QA76.58.I543 1994",
  bibdate =      "Fri Feb 01 06:55:36 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:1994:PPC,
  editor =       "Anonymous",
  title =        "{Parallel processing comes of age: real applications
                 from industry and commerce: Seminar --- June 1994,
                 London}",
  booktitle =    "{Parallel processing comes of age: real applications
                 from industry and commerce: Seminar --- June 1994,
                 London}",
  publisher =    "Unicom Seminars",
  address =      "????",
  year =         "1994",
  pages =        "????",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  sponsor =      "Unicom.",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:1994:PSE,
  editor =       "Anonymous",
  title =        "{Proceedings. SHARE Europe Spring Conference}",
  booktitle =    "{Proceedings. SHARE Europe Spring Conference}",
  publisher =    "SHARE Europe (SEAS)",
  address =      "Carouge/Geneva, Switzerland",
  year =         "1994",
  pages =        "xix + 810",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  confdate =     "18--21 April 1994",
  conflocation = "Brussels, Belgium",
  pubcountry =   "Switzerland",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:1994:SCC,
  editor =       "Anonymous",
  title =        "{Small college computing: 27th Annual symposium ---
                 April 1994, Winona, MN}",
  booktitle =    "{Small college computing: 27th Annual symposium ---
                 April 1994, Winona, MN}",
  series =       "SCCS --- Proceedings --- 27th",
  publisher =    "SCCS",
  address =      "????",
  year =         "1994",
  pages =        "????",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:1994:SQC,
  editor =       "Anonymous",
  title =        "{Software quality concern for people: proceedings of
                 the fourth European Conference on Software Quality,
                 October 17--20, 1994, Basel, Switzerland}",
  booktitle =    "{Software quality concern for people: proceedings of
                 the fourth European Conference on Software Quality,
                 October 17--20, 1994, Basel, Switzerland}",
  publisher =    "vdf Verlag der Fachvereine",
  address =      "Zurich, Switzerland",
  year =         "1994",
  pages =        "538",
  ISBN =         "3-7281-2153-3",
  ISBN-13 =      "978-3-7281-2153-0",
  LCCN =         "????",
  bibdate =      "Wed Apr 16 11:49:47 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Arnold:1994:PCT,
  editor =       "D. Arnold and R. Christie and J. Day and P. Roe",
  title =        "{Parallel Computing and Transputers. PCAT-93.
                 Proceedings of the 6th Australian Transputer and Occam
                 User Group Conference, November 3--4, 1993, Brisbane,
                 Queensland, Australia}",
  booktitle =    "{Parallel Computing and Transputers. PCAT-93.
                 Proceedings of the 6th Australian Transputer and Occam
                 User Group Conference, November 3--4, 1993, Brisbane,
                 Queensland, Australia}",
  series =       "Transputer and Occam Engineering Series",
  volume =       "37",
  publisher =    pub-IOS,
  address =      pub-IOS:adr,
  year =         "1994",
  pages =        "383",
  ISBN =         "90-5199-149-5",
  ISBN-13 =      "978-90-5199-149-9",
  LCCN =         "????",
  pubcountry =   "Netherlands",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Becks:1994:NCT,
  editor =       "K.-H. Becks and D. Perret-Gallix",
  title =        "{New computing techniques in physics research III:
                 proceedings of the Third International Workshop on
                 Software Engineering, Artificial Intelligence and
                 Expert Systems for High Energy and Nuclear Physics:
                 October 4--8, 1993, Oberammergau, Germany}",
  booktitle =    "{New computing techniques in physics research III:
                 proceedings of the Third International Workshop on
                 Software Engineering, Artificial Intelligence and
                 Expert Systems for High Energy and Nuclear Physics:
                 October 4--8, 1993, Oberammergau, Germany}",
  publisher =    pub-WORLD-SCI,
  address =      pub-WORLD-SCI:adr,
  year =         "1994",
  pages =        "xvii + 664",
  ISBN =         "981-02-1699-8",
  ISBN-13 =      "978-981-02-1699-3",
  LCCN =         "QC793.47.E4I58 1993",
  pubcountry =   "Singapore",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Bolding:1994:PCR,
  editor =       "Kevin Bolding and Lawrence Snyder",
  booktitle =    "{Parallel computer routing and communication: first
                 international workshop, PCRCW '94, Seattle, Washington,
                 USA, May 16--18, 1994: proceedings}",
  title =        "{Parallel computer routing and communication: first
                 international workshop, PCRCW '94, Seattle, Washington,
                 USA, May 16--18, 1994: proceedings}",
  volume =       "853",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "ix + 317",
  year =         "1994",
  ISBN =         "3-540-58429-3",
  ISBN-13 =      "978-3-540-58429-2",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58.P39 1994",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  conflocation = "Seattle, WA, USA; 16--18 May 1994",
  corpsource =   "Dept. of Comput. Sci. and Eng., Washington Univ.,
                 Seattle, WA, USA",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@Proceedings{Calmet:1994:RWC,
  editor =       "J. Calmet",
  title =        "{Rhine workshop on computer algebra --- March 22--24,
                 1994, Karlsruhe, Germany}",
  booktitle =    "{Rhine workshop on computer algebra --- March 22--24,
                 1994, Karlsruhe, Germany}",
  publisher =    "Universit{\"a}t Karlsruhe",
  address =      "Karlsruhe, Germany",
  year =         "1994",
  pages =        "v + 224",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  sponsor =      "University of Karlsruhe. Faculty of Informatics.
                 Institute of Algorithms and Cognitive Systems.",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Davidor:1994:PPS,
  editor =       "Yuval Davidor and Hans-Paul Schwefel and Reinhard
                 M{\"a}nner",
  booktitle =    "{Parallel problem solving from nature --- PPSN III:
                 International Conference on Evolutionary Computation,
                 the Third Conference on Parallel Problem Solving from
                 Nature, Jerusalem, Israel, October 9--14, 1994:
                 proceedings}",
  title =        "{Parallel problem solving from nature --- PPSN III:
                 International Conference on Evolutionary Computation,
                 the Third Conference on Parallel Problem Solving from
                 Nature, Jerusalem, Israel, October 9--14, 1994:
                 proceedings}",
  volume =       "866",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xv + 642",
  year =         "1994",
  ISBN =         "3-540-58484-6",
  ISBN-13 =      "978-3-540-58484-1",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 .I535 1994",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
}

@Proceedings{Decker:1994:PEM,
  editor =       "K. M. (Karsten M.) Decker and R. M. (Rene M.)
                 Rehmann",
  booktitle =    "{Programming environments for massively parallel
                 distributed systems: working conference of the IFIP
                 WG10.3, April 25--29, 1994, Ascona, Switzerland}",
  title =        "{Programming environments for massively parallel
                 distributed systems: working conference of the IFIP
                 WG10.3, April 25--29, 1994, Ascona, Switzerland}",
  publisher =    pub-BIRKHAUSER,
  address =      pub-BIRKHAUSER:adr,
  pages =        "xiv + 420",
  year =         "1994",
  ISBN =         "0-8176-5090-3 (Boston), 3-7643-5090-3 (Basel)",
  ISBN-13 =      "978-0-8176-5090-2 (Boston), 978-3-7643-5090-1
                 (Basel)",
  LCCN =         "QA76.58.P767 1994",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  sponsor =      "IFIP WG10.3.",
}

@Proceedings{deGloria:1994:TAS,
  editor =       "A. de Gloria and M. R. Jane and D. Marini",
  title =        "{Transputer Applications and Systems '94. Proceedings
                 of the 1994 World Transputer Congress}",
  booktitle =    "{Transputer Applications and Systems '94. Proceedings
                 of the 1994 World Transputer Congress}",
  publisher =    pub-IOS,
  address =      pub-IOS:adr,
  year =         "1994",
  pages =        "xi + 1009",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  confdate =     "5--7 Sept. 1994",
  conflocation = "Como, Italy",
  confsponsor =  "Transputer Consortium; SGS-Thomson Microelectron.;
                 Eur. Union; Italian Transputer User Group",
  pubcountry =   "Netherlands",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Dekker:1994:MPP,
  editor =       "L. (Leendert) Dekker and W. Smit and J. C.
                 Zuidervaart",
  title =        "{Massively parallel processing applications and
                 development: proceedings of the 1994 EUROSIM Conference
                 on Massively Parallel Processing Applications and
                 Development, Delft, The Netherlands, 21--23 June
                 1994}",
  booktitle =    "{Massively parallel processing applications and
                 development: proceedings of the 1994 EUROSIM Conference
                 on Massively Parallel Processing Applications and
                 Development, Delft, The Netherlands, 21--23 June
                 1994}",
  publisher =    pub-ELS,
  address =      pub-ELS:adr,
  year =         "1994",
  pages =        "xxii + 973",
  ISBN =         "0-444-81784-0",
  ISBN-13 =      "978-0-444-81784-6",
  LCCN =         "QA76.58.E98 1994",
  confsponsor =  "AKZO NOBEL; BSO; Convex Comput.; HPCN projects; IBM;
                 NOWESP; et al",
  pubcountry =   "Netherlands",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Dongarra:1994:PSC,
  editor =       "Jack Dongarra and Jerzy Wa{\'s}niewski",
  booktitle =    "{Parallel scientific computing: First International
                 Workshop, PARA '94, Lyngby, Denmark, June 20--23, 1994:
                 proceedings}",
  title =        "{Parallel scientific computing: First International
                 Workshop, PARA '94, Lyngby, Denmark, June 20--23, 1994:
                 proceedings}",
  volume =       "879",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xi + 566",
  year =         "1994",
  ISBN =         "3-540-58712-8 (Berlin), 0-387-58712-8 (New York)",
  ISBN-13 =      "978-3-540-58712-5 (Berlin), 978-0-387-58712-7 (New
                 York)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 .P35 1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  price =        "DM104.00",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  confsponsor =  "Danish Comput. Centre for Res. and Educ.; Inst. Math.
                 Modelling; Tech. Univ. Denmark",
  pubcountry =   "Germany",
  sponsor =      "Danish Computing Centre for Research and Education.
                 Technical University of Denmark; Institute for
                 Mathematical Modelling.",
}

@Proceedings{Dongarra:1994:PSW,
  editor =       "Jack J. Dongarra and Bernard Tourancheau",
  booktitle =    "{Proceedings of the Second Workshop on Environments
                 and Tools for Parallel Scientific Computing: Townsend,
                 TN, USA, 25--27 May 1994}",
  title =        "{Proceedings of the Second Workshop on Environments
                 and Tools for Parallel Scientific Computing: Townsend,
                 TN, USA, 25--27 May 1994}",
  publisher =    pub-SIAM,
  address =      pub-SIAM:adr,
  pages =        "x + 292",
  year =         "1994",
  ISBN =         "0-89871-343-9",
  ISBN-13 =      "978-0-89871-343-5",
  LCCN =         "QA76.58.I568 1994",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  conflocation = "Townsend, TN, USA; 25--27 May 1994",
  conftitle =    "Proceedings of the Second Workshop on Environments and
                 Tools for Parallel Scientific Computing",
  corpsource =   "IBM Thomas J. Watson Res. Center, Yorktown Heights,
                 NY, USA",
  treatment =    "P Practical",
}

@Proceedings{Gentzsch:1994:HPC,
  editor =       "Wolfgang Gentzsch and Uwe Harms",
  booktitle =    "{High-performance computing and networking:
                 international conference and exhibition, Munich,
                 Germany, April 18--20, 1994: proceedings}",
  title =        "{High-performance computing and networking:
                 international conference and exhibition, Munich,
                 Germany, April 18--20, 1994: proceedings}",
  volume =       "797",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xxii + 519",
  year =         "1994",
  ISBN =         "0-387-57981-8 (New York), 3-540-57981-8 (Berlin)",
  ISBN-13 =      "978-0-387-57981-8 (New York), 978-3-540-57981-6
                 (Berlin)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.88.I57 1994",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Two volumes.",
  price =        "DM96.00",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  conftitle =    "High-Performance Computing and Networking
                 International Conference. Proceedings, Volume II:
                 Networking and Tools",
  corpsource =   "German Nat. Res. Center for Comput. Sci., St.
                 Augustin, Germany",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@Proceedings{Gruber:1994:PJE,
  editor =       "Ralf Gruber and Marco Tomassini",
  title =        "{Proceedings of the 6th Joint EPS-APS International
                 Conference on Physics Computing: Physics Computing '94,
                 Palazzo dei Congressi, Lugano, Switzerland, 22--26
                 August 1994}",
  booktitle =    "{Proceedings of the 6th Joint EPS-APS International
                 Conference on Physics Computing: Physics Computing '94,
                 Palazzo dei Congressi, Lugano, Switzerland, 22--26
                 August 1994}",
  publisher =    "European Physical Society",
  address =      "Geneva, Switzerland",
  year =         "1994",
  pages =        "xvii + 730",
  ISBN =         "2-88270-011-3",
  ISBN-13 =      "978-2-88270-011-7",
  LCCN =         "QC20.7.E4I58 1994",
  pubcountry =   "Switzerland",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% NB: the first editor's surname is El-Rewini (given name Hesham).
%%% The citation label below keeps its "Hesham" prefix so that
%%% existing citations of this entry continue to resolve.
@Proceedings{Hesham:1994:PTS,
  editor =       "Hesham El-Rewini and B. D. Shriver",
  booktitle =    "{Proceedings of the Twenty-Seventh Hawaii
                 International Conference on System Sciences. Vol. II:
                 Software Technology, January 4--7, 1994, Wailea, HI,
                 USA}",
  title =        "{Proceedings of the Twenty-Seventh Hawaii
                 International Conference on System Sciences. Vol. II:
                 Software Technology, January 4--7, 1994, Wailea, HI,
                 USA}",
  volume =       "27",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xv + 681",
  year =         "1994",
  ISBN =         "0-8186-5060-5",
  ISBN-13 =      "978-0-8186-5060-4",
  ISSN =         "1060-3425",
  LCCN =         "????",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog no. 94TH0607-2.",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE; ACM; Univ. Hawaii; Univ. Hawaii Coll. Bus.
                 Admin",
}

@Proceedings{Horiguchi:1994:ISP,
  editor =       "S. Horiguchi and D. Frank Hsu and M. Kimura",
  title =        "{International Symposium on Parallel Architectures,
                 Algorithms, and Networks (ISPAN): proceedings of the
                 1994, December 14--16, 1994, Kanazawa, Japan}",
  booktitle =    "{International Symposium on Parallel Architectures,
                 Algorithms, and Networks (ISPAN): proceedings of the
                 1994, December 14--16, 1994, Kanazawa, Japan}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  year =         "1994",
  pages =        "xi + 452",
  ISBN =         "0-8186-6507-6 (case), 0-8186-6506-8 (microfiche)",
  ISBN-13 =      "978-0-8186-6507-3 (case), 978-0-8186-6506-6
                 (microfiche)",
  LCCN =         "QA76.58 .I5673 1994 Bar",
  note =         "IEEE catalog number 94TH0697-3.",
  keywords =     "parallel processing (electronic computers) --
                 congresses",
  bibdate =      "Wed Apr 16 07:34:31 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1994:FSF,
  editor =       "{IEEE}",
  title =        "{Frontiers'95, the 5th Symposium on the Frontiers of
                 Massively Parallel Computation: proceedings, February
                 6--9, 1995, McLean, Virginia}",
  booktitle =    "{Frontiers'95, the 5th Symposium on the Frontiers of
                 Massively Parallel Computation: proceedings, February
                 6--9, 1995, McLean, Virginia}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  year =         "1994",
  pages =        "xvi + 539",
  ISBN =         "0-8186-6965-9",
  ISBN-13 =      "978-0-8186-6965-1",
  LCCN =         "QA76.58.S95 1994",
  note =         "IEEE catalog no. 95TH8024.",
  confsponsor =  "IEEE Comput. Soc. Tech. Committee on Comput. Archit.;
                 NASA; Univ. Maryland Inst. Adv. Comput. Studies; George
                 Mason Univ",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1994:IPN,
  editor =       "{IEEE}",
  title =        "{ICIP '94: proceedings, November 13--16, 1994, Austin
                 Convention Center, Austin, Texas}",
  booktitle =    "{ICIP '94: proceedings, November 13--16, 1994, Austin
                 Convention Center, Austin, Texas}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  year =         "1994",
  pages =        "liii + 992 (vol. 1), 1064 (vol. 2), 1050 (vol. 3)",
  ISBN =         "0-8186-6952-7 (casebound), 0-8186-6950-0 (paperback),
                 0-8186-6951-9 (microfiche)",
  ISBN-13 =      "978-0-8186-6952-1 (casebound), 978-0-8186-6950-7
                 (paperback), 978-0-8186-6951-4 (microfiche)",
  LCCN =         "TA1637.I25 1994",
  note =         "Three volumes. IEEE catalog no. 94CH35708.",
  confsponsor =  "IEEE Signal Process. Soc",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1994:OOE,
  editor =       "{IEEE}",
  booktitle =    "{Oceans 94: Oceans engineering for today's technology
                 and tomorrow's preservation: proceedings, 13--16
                 September 1994, Brest, France}",
  title =        "{Oceans 94: Oceans engineering for today's technology
                 and tomorrow's preservation: proceedings, 13--16
                 September 1994, Brest, France}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xl + 905 (vol. 1), xl + 727 (vol. 2), xl + 630 (vol.
                 3)",
  year =         "1994",
  ISBN =         "0-7803-2057-3, 0-7803-2056-5, 0-7803-2058-1",
  ISBN-13 =      "978-0-7803-2057-4, 978-0-7803-2056-7,
                 978-0-7803-2058-1",
  ISSN =         "0197-7385",
  LCCN =         "TC 1505 O33197 1994",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Three volumes. IEEE catalog no. 94CH3472-8.",
  series =       "Oceans",
  acknowledgement = ack-nhfb,
  sponsor =      "IEEE; Ocean Engineering Society.",
}

@Proceedings{IEEE:1994:PIF,
  editor =       "{IEEE}",
  title =        "{Proceedings of the 1994 IEEE Frequency Control
                 Symposium (the 48th annual symposium), 1--3 June 1994,
                 Westin Hotel-Copley Place, Boston, Massachusetts,
                 USA}",
  booktitle =    "{Proceedings of the 1994 IEEE Frequency Control
                 Symposium (the 48th annual symposium), 1--3 June 1994,
                 Westin Hotel-Copley Place, Boston, Massachusetts,
                 USA}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  year =         "1994",
  pages =        "xvii + 817",
  ISBN =         "0-7803-1945-1",
  ISBN-13 =      "978-0-7803-1945-5",
  LCCN =         "TK 7872 O7 I34 1994",
  note =         "IEEE catalog no. 94CH3446-2.",
  confsponsor =  "IEEE Ultrasonics, Ferroelectr. and Frequency Control
                 Soc",
  numericalindex = "Frequency 1.0E+09 to 2.0E+09 Hz",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1994:PSI,
  editor =       "{IEEE}",
  title =        "{Proceedings / Second International Workshop on
                 Configurable Distributed Systems, March 21--23, 1994,
                 Carnegie Mellon University, Pittsburgh, Pennsylvania}",
  booktitle =    "{Proceedings / Second International Workshop on
                 Configurable Distributed Systems, March 21--23, 1994,
                 Carnegie Mellon University, Pittsburgh, Pennsylvania}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  year =         "1994",
  pages =        "ix + 215",
  ISBN =         "0-8186-5390-6",
  ISBN-13 =      "978-0-8186-5390-2",
  LCCN =         "QA76.9.D5I595 1994",
  note =         "IEEE catalog no. 94TH0651-0.",
  confsponsor =  "IEEE; Carnegie Mellon Univ",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1994:PSP,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of the Scalable Parallel Libraries
                 Conference, October 6--8, 1993, Mississippi State,
                 Mississippi}",
  title =        "{Proceedings of the Scalable Parallel Libraries
                 Conference, October 6--8, 1993, Mississippi State,
                 Mississippi}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "vii + 261",
  year =         "1994",
  ISBN =         "0-8186-4980-1",
  ISBN-13 =      "978-0-8186-4980-6",
  LCCN =         "QA76.58.S34 1993",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  conflocation = "Mississippi State, MS, USA; 6--8 Oct. 1993",
  confsponsor =  "Mississippi State Univ.; Nat. Sci. Found",
  conftitle =    "Proceedings of Scalable Parallel Libraries
                 Conference",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  sponsororg =   "Mississippi State Univ.; Nat. Sci. Found",
  treatment =    "P Practical",
}

%%% Supercomputing '94 proceedings (Washington, DC, November 1994).
%%% Three ISBNs are recorded for this volume.
@Proceedings{IEEE:1994:PSW,
  editor          = {{IEEE}},
  title           = {{Proceedings, Supercomputing '94: Washington, DC,
                    November 14--18, 1994}},
  booktitle       = {{Proceedings, Supercomputing '94: Washington, DC,
                    November 14--18, 1994}},
  series          = {Supercomputing},
  publisher       = pub-IEEE,
  address         = pub-IEEE:adr,
  year            = {1994},
  pages           = {xvii + 823},
  note            = {IEEE catalog number 94CH34819.},
  ISBN            = {0-8186-6607-2, 0-8186-6605-6, 0-8186-6606-4},
  ISBN-13         = {978-0-8186-6607-0, 978-0-8186-6605-6,
                    978-0-8186-6606-3},
  ISSN            = {1063-9535},
  LCCN            = {QA76.5 .S894 1994},
  sponsor         = {IEEE.},
  acknowledgement = ack-nhfb,
  bibdate         = {Mon Aug 26 10:38:41 MDT 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Third IEEE International Symposium on High Performance Distributed
%%% Computing (San Francisco, August 1994), proceedings volume.
@Proceedings{IEEE:1994:PTI,
  editor          = {{IEEE}},
  title           = {{Proceedings of the Third IEEE International
                    Symposium on High Performance Distributed Computing,
                    August 2--5, 1994, San Francisco, California}},
  booktitle       = {{Proceedings of the Third IEEE International
                    Symposium on High Performance Distributed Computing,
                    August 2--5, 1994, San Francisco, California}},
  publisher       = pub-IEEE,
  address         = pub-IEEE:adr,
  year            = {1994},
  pages           = {xiii + 304},
  note            = {IEEE catalog no. 94TH0667-6.},
  ISBN            = {0-8186-6395-2},
  ISBN-13         = {978-0-8186-6395-6},
  LCCN            = {QA76.9.D5I328 1994},
  confsponsor     = {IEEE Comput. Soc. Tech. Committee on Distributed
                    Process.; Northeast Parallel Archit. Center (NPAC)
                    at Syracuse Univ.; ACM SIGCOMM},
  acknowledgement = ack-nhfb,
  bibdate         = {Sun Dec 22 10:18:08 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% ParCo93 proceedings (Grenoble, September 1993), volume 9 of the
%%% series Advances in Parallel Computing.  The xxeditor field records
%%% an alternative editor ordering and is ignored by BibTeX styles.
@Proceedings{Joubert:1994:PCT,
  editor =       "G. R. Joubert and F. J. Peters and D. Trystram and D.
                 J. Evans",
  booktitle =    "{Parallel computing: trends and applications:
                 proceedings of the international conference ParCo93,
                 Grenoble, France, 7--10 September 1993}",
  title =        "{Parallel computing: trends and applications:
                 proceedings of the international conference ParCo93,
                 Grenoble, France, 7--10 September 1993}",
  volume =       "9",
  publisher =    pub-NH,
  address =      pub-NH:adr,
  pages =        "xvi + 728",
  year =         "1994",
  ISBN =         "0-444-81841-3",
  ISBN-13 =      "978-0-444-81841-6",
  ISSN =         "0927-5452",
  LCCN =         "QA76.58 .P3794 1993",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Advances in Parallel Computing",
  acknowledgement = ack-nhfb,
  confsponsor =  "ARCHIPEL; CNRS; Elsevier Sci. Publishers; IMAG; INPG;
                 INRIA; et al",
  pubcountry =   "Netherlands",
  xxeditor =     "G. R. Joubert and D. Trystram and F. J. Peters and D.
                 J. Evans",
}

%%% First International Workshop on Parallel Processing (IWPP-94),
%%% Bangalore, December 1994.  The short form in the title is IWPP,
%%% matching the parenthesised acronym and the workshop name.
@Proceedings{Kumar:1994:PPI,
  editor =       "V. K. Prasanna Kumar",
  booktitle =    "{Parallel processing: 1st IWPP: proceedings of the
                 First International Workshop on Parallel Processing
                 (IWPP-94), December 26--31, 1994, Bangalore, India}",
  title =        "{Parallel processing: 1st IWPP: proceedings of the
                 First International Workshop on Parallel Processing
                 (IWPP-94), December 26--31, 1994, Bangalore, India}",
  publisher =    "Tata McGraw-Hill Pub. Co",
  address =      "New Delhi, India",
  pages =        "xxiii + 736",
  year =         "1994",
  ISBN =         "0-07-462332-X",
  ISBN-13 =      "978-0-07-462332-9",
  LCCN =         "QA 76.58 I587 1994",
  bibdate =      "Tue May 12 08:53:36 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% WoTUG-17 proceedings (Bristol, April 1994), volume 38 of the IOS
%%% Press Transputer and occam series.  The language name occam is
%%% written in lowercase throughout, as in the title.
@Proceedings{Miles:1994:PTO,
  editor =       "Roger Miles and Alan Chalmers",
  booktitle =    "{Progress in Transputer and occam Research, WoTUG-17
                 Proceedings of the 17th World occam and Transputer User
                 Group Technical Meeting, April 10--13, 1994, Bristol,
                 UK}",
  title =        "{Progress in Transputer and occam Research, WoTUG-17
                 Proceedings of the 17th World occam and Transputer User
                 Group Technical Meeting, April 10--13, 1994, Bristol,
                 UK}",
  volume =       "38",
  publisher =    pub-IOS,
  address =      pub-IOS:adr,
  pages =        "vii + 221",
  year =         "1994",
  ISBN =         "90-5199-163-0",
  ISBN-13 =      "978-90-5199-163-5",
  ISSN =         "0925-4986",
  LCCN =         "????",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Transputer and occam Engineering Series",
  acknowledgement = ack-nhfb,
  pubcountry =   "Netherlands",
  sponsor =      "World occam and Transputer User Group.",
}

%%% ISSTA 1994 proceedings (Seattle, August 1994), issued as a special
%%% issue of ACM SIGSOFT Software Engineering Notes; journal-level
%%% identifiers (CODEN, ISSN, fjournal) are therefore present.
@Proceedings{Ostrand:1994:PIS,
  editor          = {Thomas Ostrand},
  title           = {{Proceedings of the 1994 International Symposium on
                    Software Testing and Analysis (ISSTA): August
                    17--19, 1994, Seattle, Washington, USA}},
  booktitle       = {{Proceedings of the 1994 International Symposium on
                    Software Testing and Analysis (ISSTA): August
                    17--19, 1994, Seattle, Washington, USA}},
  series          = j-SIGSOFT,
  fjournal        = {ACM SIGSOFT Software Engineering Notes},
  issue           = {spec. issue. p. 216-227},
  publisher       = pub-ACM,
  address         = pub-ACM:adr,
  year            = {1994},
  CODEN           = {SFENDP},
  ISBN            = {0-89791-683-2},
  ISBN-13         = {978-0-89791-683-7},
  ISSN            = {0163-5948},
  LCCN            = {QA76.76.T48 I58 1994},
  journal-URL     = {https://dl.acm.org/citation.cfm?id=J728},
  acknowledgement = ack-nhfb,
  bibdate         = {Sun Dec 22 10:18:08 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                    http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
}

%%% IFIP 13th World Computer Congress (Hamburg, 1994), three volumes.
%%% The page count of the full set is not known here, so pages holds
%%% the unknown-value marker rather than any single paper's range.
@Proceedings{Pehrson:1994:IPP,
  editor =       "Bj{\"o}rn Pehrson and Imre Simon and Klaus Brunnstein
                 and Eckart Raubold and Karen Duncan and Karl Krueger",
  booktitle =    "{Information processing '94: proceedings of the IFIP
                 13th World Computer Congress, Hamburg, Germany, 28
                 August--2 September, 1994}",
  title =        "{Information processing '94: proceedings of the IFIP
                 13th World Computer Congress, Hamburg, Germany, 28
                 August--2 September, 1994}",
  volume =       "A-51, A-52, A-53",
  publisher =    pub-NH,
  address =      pub-NH:adr,
  pages =        "????",
  year =         "1994",
  CODEN =        "ITATEC",
  ISBN =         "0-444-81990-8, 0-444-81989-4",
  ISBN-13 =      "978-0-444-81990-1, 978-0-444-81989-5",
  ISSN =         "0926-5473",
  LCCN =         "QA75.5.I3785 1994",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Three volumes.",
  series =       j-IFIP-TRANS-A,
  acknowledgement = ack-nhfb,
  pubcountry =   "Netherlands",
  sponsor =      "IFIP. Gesellschaft f{\"u}r Informatik.",
}

%%% Scalable High-Performance Computing Conference (Knoxville, May
%%% 1994).  The title carries explicit discretionary hyphens.
@Proceedings{Pierce:1994:PSH,
  editor          = {P. Pierce and G. Regnier},
  title           = {{Proceedings of the Scalable High-Per\-for\-mance
                    Computing Conference, May 23--25, 1994, Knoxville,
                    Tennessee}},
  booktitle       = {{Proceedings of the Scalable High-Per\-for\-mance
                    Computing Conference, May 23--25, 1994, Knoxville,
                    Tennessee}},
  publisher       = pub-IEEE,
  address         = pub-IEEE:adr,
  year            = {1994},
  pages           = {xviii + 852},
  note            = {IEEE catalog no. 94TH0637-9.},
  ISBN            = {0-8186-5680-8, 0-8186-5681-6},
  ISBN-13         = {978-0-8186-5680-4, 978-0-8186-5681-1},
  LCCN            = {QA76.58.S32 1994},
  sponsor         = {IEEE Computer Society; Technical Committee on
                    Supercomputing Applications.},
  acknowledgement = ack-nhfb,
  bibdate         = {Thu Feb 29 17:59:11 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% 26th Symposium on the Interface (Research Triangle Park, NC, June
%%% 1994).  Publisher name and publisher city are kept in separate
%%% fields; the date range uses an en-dash.
@Proceedings{Sall:1994:CIS,
  editor =       "J. Sall and A. Lehman",
  booktitle =    "{Computational intensive statistical methods: 26th
                 Symposium on the interface --- June 15--18, 1994,
                 Research Triangle Park, NC, USA}",
  title =        "{Computational intensive statistical methods: 26th
                 Symposium on the interface --- June 15--18, 1994,
                 Research Triangle Park, NC, USA}",
  volume =       "26",
  publisher =    "Interface Foundation of North America",
  address =      "Fairfax Station, VA, USA",
  pages =        "????",
  year =         "1994",
  ISBN =         "1-886658-00-5",
  ISBN-13 =      "978-1-886658-00-4",
  LCCN =         "QA276.4.S95 1994",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Computing Science and Statistics Conference",
  acknowledgement = ack-nhfb,
  sponsor =      "Interface Foundation of North America.",
}

%%% Eighth International Parallel Processing Symposium (Cancun, April
%%% 1994).  The citation key keeps the historical spelling Siegal so
%%% that existing cross-references still resolve; the editor field
%%% carries the spelling Siegel.  The xxnote field (ignored by styles)
%%% records the catalog-number conflict that was resolved here.
@Proceedings{Siegal:1994:PEI,
  editor =       "Howard Jay Siegel",
  booktitle =    "{Proceedings / Eighth International Parallel
                 Processing Symposium, April 26--29, 1994, Cancun,
                 Mexico}",
  title =        "{Proceedings / Eighth International Parallel
                 Processing Symposium, April 26--29, 1994, Cancun,
                 Mexico}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xxx + 966",
  year =         "1994",
  ISBN =         "0-8186-5602-6",
  ISBN-13 =      "978-0-8186-5602-6",
  LCCN =         "QA76.58.I58 1994",
  bibdate =      "Sun Dec 22 10:18:08 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog no. 94TH0652-8.",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE; ACM",
  xxnote =       "Catalog number was formerly given as 94CH34819, which
                 is the number recorded for Supercomputing '94 in entry
                 IEEE:1994:PSW; verify 94TH0652-8 against the printed
                 volume.",
}

%%% NATO Advanced Study Institute on alloy phase transformations
%%% (Rhodes, 1992), published 1994 as volume 319 of its series.
@Proceedings{Turchi:1994:SDA,
  editor          = {Patrice E. A. Turchi and Antonios Gonis},
  title           = {{Statics and dynamics of alloy phase
                    transformations: Proceedings of a NATO Advanced
                    Study Institute on Statics and Dynamics of Alloy
                    Phase Transformations, held June 21--July 3, 1992,
                    in Rhodes, Greece}},
  booktitle       = {{Statics and dynamics of alloy phase
                    transformations: Proceedings of a NATO Advanced
                    Study Institute on Statics and Dynamics of Alloy
                    Phase Transformations, held June 21--July 3, 1992,
                    in Rhodes, Greece}},
  series          = {NATO ASI Series B Physics},
  volume          = {319},
  publisher       = pub-PLENUM,
  address         = pub-PLENUM:adr,
  year            = {1994},
  pages           = {xiii + 737},
  ISBN            = {0-306-44626-X},
  ISBN-13         = {978-0-306-44626-9},
  ISSN            = {0258-1221},
  LCCN            = {TN690.S77 1994},
  acknowledgement = ack-nhfb,
  bibdate         = {Thu Feb 29 17:59:11 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% First USENIX OSDI symposium (Monterey, November 1994), proceedings.
@Proceedings{USENIX:1994:PFU,
  editor          = {{USENIX}},
  title           = {{Proceedings of the First USENIX Symposium on
                    Operating Systems Design and Implementation (OSDI),
                    November 14--17, 1994, Monterey, California, USA}},
  booktitle       = {{Proceedings of the First USENIX Symposium on
                    Operating Systems Design and Implementation (OSDI),
                    November 14--17, 1994, Monterey, California, USA}},
  publisher       = pub-USENIX,
  address         = pub-USENIX:adr,
  year            = {1994},
  pages           = {280},
  ISBN            = {1-880446-66-9},
  ISBN-13         = {978-1-880446-66-9},
  LCCN            = {QA 76.76 O63 U87 1994},
  confsponsor     = {ACM; IEEE},
  acknowledgement = ack-nhfb,
  bibdate         = {Sun Dec 22 10:20:45 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Second European Computational Fluid Dynamics Conference
%%% (Stuttgart, September 1994).  Editor names are given in plain
%%% First Last form so that BibTeX can split and abbreviate them;
%%% parenthesised expansions inside a name defeat its name parser.
@Proceedings{Wagner:1994:CFD,
  editor =       "Siegfried Wagner and Jacques P{\'e}riaux and
                 Ernst-Heinrich Hirschel",
  booktitle =    "{Computational fluid dynamics '94: proceedings of the
                 Second European Computational Fluid Dynamics
                 Conference, 5--8 September 1994, Stuttgart, Germany}",
  title =        "{Computational fluid dynamics '94: proceedings of the
                 Second European Computational Fluid Dynamics
                 Conference, 5--8 September 1994, Stuttgart, Germany}",
  publisher =    pub-WILEY,
  address =      pub-WILEY:adr,
  pages =        "xvi + 1029",
  year =         "1994",
  ISBN =         "0-471-95063-7",
  ISBN-13 =      "978-0-471-95063-9",
  LCCN =         "QA911.E95 1994",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  sponsor =      "European Committee on Computational Methods in Applied
                 Sciences.",
}

%%% 33rd ACM Southeast Conference (Clemson, SC, March 1995).
@Proceedings{ACM:1995:PAS,
  editor          = {{ACM}},
  title           = {{Proceedings of the 33rd annual southeast
                    conference [ACM]: Clemson, South Carolina, March
                    17--18, 1995}},
  booktitle       = {{Proceedings of the 33rd annual southeast
                    conference [ACM]: Clemson, South Carolina, March
                    17--18, 1995}},
  publisher       = pub-ACM,
  address         = pub-ACM:adr,
  year            = {1995},
  pages           = {290},
  ISBN            = {0-89791-747-2},
  ISBN-13         = {978-0-89791-747-6},
  LCCN            = {????},
  confdate        = {17--18 March 1995},
  conflocation    = {Clemson, SC, USA},
  confsponsor     = {ACM},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 16 13:28:48 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% SPAA '95 proceedings (Santa Barbara, July 1995).  Sponsors are
%%% listed one per semicolon-separated item; conference date and place
%%% are recorded separately in confdate and conflocation.
@Proceedings{ACM:1995:SAA,
  editor =       "{ACM}",
  booktitle =    "{SPAA '95, 7th Annual ACM Symposium on Parallel
                 Algorithms and Architectures: July 17--19, 1995, Santa
                 Barbara, CA, USA}",
  title =        "{SPAA '95, 7th Annual ACM Symposium on Parallel
                 Algorithms and Architectures: July 17--19, 1995, Santa
                 Barbara, CA, USA}",
  volume =       "7",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "viii + 308",
  year =         "1995",
  ISBN =         "0-89791-717-0",
  ISBN-13 =      "978-0-89791-717-9",
  LCCN =         "QA76.642 .A25 1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  confdate =     "17--19 July 1995",
  conflocation = "Santa Barbara, CA, USA",
  conftitle =    "Proceedings of Seventh Annual ACM Symposium on
                 Parallel Algorithms and Architectures",
  corpsource =   "California Inst. of Technol., Pasadena, CA, USA",
  sponsor =      "ACM Special Interest Group on Algorithms and
                 Computation Theory; ACM Special Interest Group on
                 Computer Architecture; European Association for
                 Theoretical Computer Science.",
  sponsororg =   "ACM; EATCS",
  treatment =    "P Practical",
}

%%% 1995 ICPP Workshop on Challenges for Parallel Processing
%%% (Raleigh, NC, August 1995).
@Proceedings{Agrawal:1995:PIW,
  editor          = {D. P. Agrawal},
  title           = {{Proceedings of the 1995 ICPP Workshop on
                    Challenges for Parallel Processing, August 14, 1995,
                    Raleigh, NC, USA}},
  booktitle       = {{Proceedings of the 1995 ICPP Workshop on
                    Challenges for Parallel Processing, August 14, 1995,
                    Raleigh, NC, USA}},
  publisher       = pub-CRC,
  address         = pub-CRC:adr,
  year            = {1995},
  pages           = {vi + 162},
  ISBN            = {0-8493-2618-4},
  ISBN-13         = {978-0-8493-2618-9},
  LCCN            = {QA76.58.I34 1995},
  confsponsor     = {Pennsylvania State Univ},
  acknowledgement = ack-nhfb,
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% First International Conference on Neural, Parallel and Scientific
%%% Computations (Atlanta, May 1995).  Hardback and paperback ISBNs
%%% are comma-separated, as in the other multi-ISBN entries.
@Proceedings{Aityan:1995:PFI,
  editor =       "S. K. Aityan and L. T. Grujic and R. J. Hathaway and
                 G. S. Ladde and N. Medhin and M. Sambandham",
  booktitle =    "{Proceedings of the First International Conference on
                 Neural, Parallel and Scientific Computations held at
                 Morehouse College, Atlanta, USA, May 28--31, 1995}",
  title =        "{Proceedings of the First International Conference on
                 Neural, Parallel and Scientific Computations held at
                 Morehouse College, Atlanta, USA, May 28--31, 1995}",
  publisher =    "Dynamic Publishers",
  address =      "Atlanta, GA, USA",
  pages =        "xi + 506",
  year =         "1995",
  ISBN =         "0-9640398-9-3 (hardback), 0-9640398-8-5 (paperback)",
  ISBN-13 =      "978-0-9640398-9-6 (hardback), 978-0-9640398-8-9
                 (paperback)",
  LCCN =         "QA76.87 .I58 1995",
  bibdate =      "Wed Apr 16 13:17:34 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Proceedings of Neural Parallel and Scientific
                 Computations 1995",
  acknowledgement = ack-nhfb,
}

%%% HiNet '95 workshop on high-speed network computing (Santa Barbara,
%%% April 1995).
@Proceedings{Alnuweiri:1995:PHF,
  editor          = {Hussein M. Alnuweiri and Mounir Hamdi},
  title           = {{Proceedings of HiNet '95: first international
                    workshop on high-speed network computing, April 25,
                    1995, Santa Barbara, California}},
  booktitle       = {{Proceedings of HiNet '95: first international
                    workshop on high-speed network computing, April 25,
                    1995, Santa Barbara, California}},
  publisher       = pub-IEEE,
  address         = pub-IEEE:adr,
  year            = {1995},
  pages           = {vii + 119},
  ISBN            = {0-8186-7124-6},
  ISBN-13         = {978-0-8186-7124-1},
  LCCN            = {TK5105.5 .H56 1995},
  sponsor         = {IEEE.},
  acknowledgement = ack-nhfb,
  bibdate         = {Thu Feb 29 17:59:11 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Third CLIPS conference (Houston, September 1994), NASA publication.
%%% Pages, ISBN and LCCN are unknown and carry the ???? placeholder.
@Proceedings{Anonymous:1995:CCS,
  editor          = {Anonymous},
  title           = {{3rd CLIPS conference --- September 1994, Houston,
                    TX}},
  booktitle       = {{3rd CLIPS conference --- September 1994, Houston,
                    TX}},
  series          = {NASA Publications N N95-19625-647, N95-19747-768},
  publisher       = pub-NASA,
  address         = pub-NASA:adr,
  year            = {1995},
  pages           = {????},
  ISBN            = {????},
  ISBN-13         = {????},
  LCCN            = {????},
  sponsor         = {United States. National Aeronautics and Space
                    Administration.},
  acknowledgement = ack-nhfb,
  bibdate         = {Thu Feb 29 17:59:11 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% 13th SPE Symposium on Reservoir Simulation (San Antonio, February
%%% 1995).  AIME in the series name stands for the American Institute
%%% of Mining, Metallurgical, and Petroleum Engineers, which is the
%%% form used in the sponsor field.
@Proceedings{Anonymous:1995:RSS,
  editor =       "Anonymous",
  booktitle =    "{Reservoir simulation: 13th Symposium --- February
                 1995, San Antonio, TX}",
  title =        "{Reservoir simulation: 13th Symposium --- February
                 1995, San Antonio, TX}",
  publisher =    pub-SPE,
  address =      pub-SPE:adr,
  pages =        "????",
  year =         "1995",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Papers --- Society of Petroleum Engineers of AIME",
  acknowledgement = ack-nhfb,
  sponsor =      "American Institute of Mining, Metallurgical, and
                 Petroleum Engineers; Society of Petroleum Engineers.",
}

%%% ANS international conference on mathematics and computations,
%%% reactor physics, and environmental analyses (Portland, OR, April
%%% 1995), two volumes.  The xxeditor-1 and xxeditor-2 fields hold
%%% alternative editor data and are ignored by BibTeX styles.
@Proceedings{ANS:1995:MCR,
  editor          = {{ANS}},
  title           = {{Mathematics and computations, reactor physics, and
                    environmental analyses: International conference ---
                    April 1995, Portland, OR}},
  booktitle       = {{Mathematics and computations, reactor physics, and
                    environmental analyses: International conference ---
                    April 1995, Portland, OR}},
  publisher       = {American Nuclear Society},
  address         = {La Grange Park, IL, USA},
  year            = {1995},
  pages           = {xvi + 1597},
  note            = {Two volumes.},
  ISBN            = {0-89448-198-3},
  ISBN-13         = {978-0-89448-198-7},
  LCCN            = {TK9006.M37 1995},
  sponsor         = {American Nuclear Society; Mathematics and
                    Computation Division.},
  xxeditor-1      = {A. Qaddouri and R. Roy and B. Goulard},
  xxeditor-2      = {Z. Stankovski},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Aug 14 09:02:28 MDT 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% NATUG-7 proceedings (October 1994), volume 42 of the IOS Press
%%% Transputer and occam series.  The group name is spelled out as
%%% North American Transputer Users Group, matching the acronym NATUG.
@Proceedings{Arabnia:1995:TRA,
  editor =       "Hamid Arabnia",
  booktitle =    "{Transputer research and applications 7: North
                 American Transputer Users Group, October 23--25, 1994,
                 Atlanta, GA (NATUG-7)}",
  title =        "{Transputer research and applications 7: North
                 American Transputer Users Group, October 23--25, 1994,
                 Atlanta, GA (NATUG-7)}",
  volume =       "42",
  publisher =    pub-IOS,
  address =      pub-IOS:adr,
  pages =        "ix + 349",
  year =         "1995",
  ISBN =         "90-5199-187-8 (IOS Press), 4-274-90017-7 (Ohmsha)",
  ISBN-13 =      "978-90-5199-187-1 (IOS Press), 978-4-274-90017-4
                 (Ohmsha)",
  ISSN =         "0925-4986",
  LCCN =         "????",
  bibdate =      "Mon Jan 15 18:41:48 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Transputer and occam engineering series",
  acknowledgement = ack-nhfb,
}

%%% Seventh SIAM Conference on Parallel Processing for Scientific
%%% Computing (San Francisco, February 1995).  The conference date
%%% lives only in confdate; conflocation holds the place alone.
@Proceedings{Bailey:1995:PSS,
  editor =       "D. H. Bailey and P. E. Bj{\o}rstad and J. R. Gilbert
                 and M. V. Mascagni and R. S. Schreiber and H. D. Simon
                 and V. J. Torczon and L. T. Watson",
  booktitle =    "{Proceedings of the Seventh SIAM Conference on
                 Parallel Processing for Scientific Computing (San
                 Francisco, CA, USA)}",
  title =        "{Proceedings of the Seventh SIAM Conference on
                 Parallel Processing for Scientific Computing (San
                 Francisco, CA, USA)}",
  publisher =    pub-SIAM,
  address =      pub-SIAM:adr,
  pages =        "xviii + 875",
  year =         "1995",
  ISBN =         "0-89871-344-7",
  ISBN-13 =      "978-0-89871-344-2",
  LCCN =         "QA76.58.S55 1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  confdate =     "15--17 Feb. 1995",
  conflocation = "San Francisco, CA, USA",
  conftitle =    "Proceedings of the Seventh SIAM Conference on Parallel
                 Processing for Scientific Computing",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  sponsor =      "Society for Industrial and Applied Mathematics.",
  treatment =    "P Practical",
}

%%% First European conference on computational chemistry (Nancy, May
%%% 1994), number 330 in AIP Conference Proceedings.
@Proceedings{Bernardi:1995:CCE,
  editor          = {Francesco Bernardi and Jean-Louis Rivail},
  title           = {{Computational chemistry: 1st European conference
                    on computational chemistry (May 1994, Nancy,
                    France)}},
  booktitle       = {{Computational chemistry: 1st European conference
                    on computational chemistry (May 1994, Nancy,
                    France)}},
  series          = {AIP Conference Proceedings},
  number          = {330},
  publisher       = pub-AIP,
  address         = pub-AIP:adr,
  year            = {1995},
  pages           = {various},
  ISBN            = {1-56396-457-0},
  ISBN-13         = {978-1-56396-457-2},
  ISSN            = {0094-243X (print), 1551-7616 (electronic),
                    1935-0465},
  LCCN            = {QD39.3.E46 E15 1995},
  sponsor         = {Federation of European Chemical Societies.},
  acknowledgement = ack-nhfb,
  bibdate         = {Thu Feb 29 17:59:11 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% 12th Australasian Fluid Mechanics Conference (Sydney, December
%%% 1995).  title and booktitle are kept character-for-character
%%% identical, as in the other proceedings entries of this file.
@Proceedings{Bilger:1995:AFM,
  editor =       "R. W. Bilger",
  booktitle =    "{12th Australasian fluid mechanics conference: ---
                 December 1995, Sydney, Australia}",
  title =        "{12th Australasian fluid mechanics conference: ---
                 December 1995, Sydney, Australia}",
  publisher =    "University of Sydney",
  address =      "Sydney, NSW, Australia",
  pages =        "????",
  year =         "1995",
  ISBN =         "0-86934-034-4",
  ISBN-13 =      "978-0-86934-034-9",
  LCCN =         "????",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Australasian Fluid Mechanics Conference 1995; EDIT
                 12//V2",
  acknowledgement = ack-nhfb,
  sponsor =      "University of Sydney.",
}

%%% EUROSIM '95 simulation congress (Vienna, September 1995).  The
%%% Library of Congress call number begins with class QA.
@Proceedings{Breitenecker:1995:ESC,
  editor =       "Felix Breitenecker and Irmgard Husinsky",
  booktitle =    "{EUROSIM '95: simulation congress: proceedings of the
                 EUROSIM Conference, EUROSIM '95, Vienna, Austria,
                 11--15 September 1995}",
  title =        "{EUROSIM '95: simulation congress: proceedings of the
                 EUROSIM Conference, EUROSIM '95, Vienna, Austria,
                 11--15 September 1995}",
  publisher =    pub-ELS,
  address =      pub-ELS:adr,
  pages =        "xxii + 1356",
  year =         "1995",
  ISBN =         "0-444-82241-0",
  ISBN-13 =      "978-0-444-82241-3",
  LCCN =         "QA76.9.C65E966 1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  pubcountry =   "Netherlands",
  sponsor =      "Federation of the European Simulation Societies.",
}

%%% CAMP '95, computer architectures for machine perception (Como,
%%% September 1995).
@Proceedings{Cantoni:1995:CCA,
  editor          = {Virginio Cantoni and L. Lombardi and M. Mosconi and
                    M. Savini and A. Setti},
  title           = {{CAMP '95, computer architectures for machine
                    perception: proceedings, September 18--20, 1995,
                    Como, Italy}},
  booktitle       = {{CAMP '95, computer architectures for machine
                    perception: proceedings, September 18--20, 1995,
                    Como, Italy}},
  publisher       = pub-IEEE,
  address         = pub-IEEE:adr,
  year            = {1995},
  pages           = {x + 461},
  note            = {IEEE catalog no. 95TB8093.},
  ISBN            = {0-8186-7134-3},
  ISBN-13         = {978-0-8186-7134-0},
  LCCN            = {QA76.9.A73W675 1995},
  confsponsor     = {Pavia Univ. --- Dipt. Inf. Sistemistica Centro di
                    Cultura Sci. `A. Volta'; IEEE Comput. Soc. Tech.
                    Committee on Comput. Archit.; IEEE Comput. Soc.
                    Tech. Committee on PAMI; ACM SIGART/SIGARCH; Int.
                    Assoc. Pattern Recognition},
  acknowledgement = ack-nhfb,
  bibdate         = {Sun Dec 22 10:19:23 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% 1995 World Transputer Congress (Harrogate, September 1995);
%%% co-published by IOS Press and Ohmsha, hence two ISBNs.
@Proceedings{Cook:1995:TAS,
  editor          = {B. M. Cook and M. R. Jane and P. Nixon and P. M.
                    Welch},
  title           = {{Transputer Applications and Systems '95.
                    Proceedings of the 1995 World Transputer Congress,
                    4--6 September 1995, Harrogate, North Yorkshire,
                    UK}},
  booktitle       = {{Transputer Applications and Systems '95.
                    Proceedings of the 1995 World Transputer Congress,
                    4--6 September 1995, Harrogate, North Yorkshire,
                    UK}},
  publisher       = pub-IOS,
  address         = pub-IOS:adr,
  year            = {1995},
  pages           = {614},
  ISBN            = {90-5199-235-1 (IOS Press), 4-274-90062-2 (Ohmsha)},
  ISBN-13         = {978-90-5199-235-9 (IOS Press), 978-4-274-90062-4
                    (Ohmsha)},
  LCCN            = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 16 12:07:36 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Advanced workshop on high performance computing (Cetraro, June
%%% 1994), volume 10 of Advances in Parallel Computing.  The editor
%%% list ends with the BibTeX token for additional unnamed editors.
@Proceedings{Dongarra:1995:HPC,
  editor          = {J. J. Dongarra and others},
  title           = {{High performance computing: technology, methods,
                    and applications (Advanced workshop, June 1994,
                    Cetraro, Italy)}},
  booktitle       = {{High performance computing: technology, methods,
                    and applications (Advanced workshop, June 1994,
                    Cetraro, Italy)}},
  series          = {Advances in Parallel Computing},
  volume          = {10},
  publisher       = pub-ELS,
  address         = pub-ELS:adr,
  year            = {1995},
  pages           = {viii + 427},
  ISBN            = {0-444-82163-5},
  ISBN-13         = {978-0-444-82163-8},
  ISSN            = {0927-5452},
  LCCN            = {QA76.88.H55 1995},
  acknowledgement = ack-nhfb,
  bibdate         = {Thu Feb 29 17:59:11 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% Twenty-Eighth Hawaii International Conference on System Sciences
%%% (Wailea, HI, January 1995); the pages field gives the pagination
%%% of each of the five volumes.
@Proceedings{El-Rewini:1995:PTE,
  editor          = {H. El-Rewini and B. D. Shriver},
  title           = {{Proceedings of the Twenty-Eighth Hawaii
                    International Conference on System Sciences}},
  booktitle       = {{Proceedings of the Twenty-Eighth Hawaii
                    International Conference on System Sciences}},
  publisher       = pub-IEEE,
  address         = pub-IEEE:adr,
  year            = {1995},
  pages           = {x + 361 (vol. 1), xv + 762 (vol. 2), xv + 600
                    (vol. 3), xx + 1042 (vol. 4), x + 362 (vol. 5)},
  ISBN            = {0-8186-6935-7},
  ISBN-13         = {978-0-8186-6935-4},
  LCCN            = {????},
  confdate        = {3--6 Jan. 1995},
  conflocation    = {Wailea, HI, USA},
  confsponsor     = {Univ. Hawaii; Univ. Hawaii Coll. Bus. Admin.; IEEE
                    Comput. Soc.; ACM; PRISM},
  acknowledgement = ack-nhfb,
  bibdate         = {Sun Dec 22 10:20:45 MST 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
}

%%% 2nd Austrian-Hungarian Workshop on Transputer Applications
%%% (Budapest, 1994), issued as a technical report of the Central
%%% Research Institute for Physics; no ISBN or LCCN is known.
@Proceedings{Ferenczi:1995:PAH,
  editor =       "Szabolcs Ferenczi and Peter Kacsuk",
  booktitle =    "{Proceedings of the 2nd Austrian-Hungarian Workshop on
                 Transputer Applications: September 29--October 1, 1994,
                 Budapest, Hungary}",
  title =        "{Proceedings of the 2nd Austrian-Hungarian Workshop on
                 Transputer Applications: September 29--October 1, 1994,
                 Budapest, Hungary}",
  publisher =    "Hungarian Academy of Sciences, Central Research
                 Institute for Physics",
  address =      "Budapest, Hungary",
  pages =        "vii + 282",
  year =         "1995",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Wed Apr 16 13:32:12 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Technical report KFKI-1995-2/M,N.",
  acknowledgement = ack-nhfb,
}

%%% IRREGULAR 95 workshop proceedings (Lyon, September 1995).  The
%%% accent in the second editor's given name is written as a braced
%%% LaTeX accent so that classic BibTeX treats it as one letter.
@Proceedings{Ferreira:1995:PAI,
  editor =       "Afonso Ferreira and Jos{\'e} Rolim",
  booktitle =    "{Parallel algorithms for irregularly structured
                 problems: second international workshop, IRREGULAR 95,
                 Lyon, France, September 4--6, 1995: proceedings}",
  title =        "{Parallel algorithms for irregularly structured
                 problems: second international workshop, IRREGULAR 95,
                 Lyon, France, September 4--6, 1995: proceedings}",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "x + 409",
  year =         "1995",
  ISBN =         "3-540-60321-2",
  ISBN-13 =      "978-3-540-60321-4",
  LCCN =         "QA76.642.I59 1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  confsponsor =  "IFIP",
  pubcountry =   "Germany",
}

@Proceedings{Fritzson:1995:PPA,
  editor =       "Peter Fritzson and Leif Finmo",
  booktitle =    "{Parallel programming and applications: proceedings of
                 the Workshop on Parallel Programming and Computation
                 (ZEUS '95) and the 4th Nordic Transputer Conference
                 (NTUG '95): Link{\"o}ping, Sweden}",
  title =        "{Parallel programming and applications: proceedings of
                 the Workshop on Parallel Programming and Computation
                 (ZEUS '95) and the 4th Nordic Transputer Conference
                 (NTUG '95): Link{\"o}ping, Sweden}",
  publisher =    pub-IOS,
  address =      pub-IOS:adr,
  pages =        "ix + 435",
  year =         "1995",
  ISBN =         "90-5199-229-7 (IOS Press), 4-274-90056-8 (Ohmsha)",
  ISBN-13 =      "978-90-5199-229-8 (IOS Press), 978-4-274-90056-3
                 (Ohmsha)",
  LCCN =         "????",
  bibdate =      "Wed Apr 16 13:23:58 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Gates:1995:PFI,
  editor =       "W. Lawrence (William Lawrence) Gates",
  booktitle =    "{Proceedings of the First International AMIP
                 Scientific Conference: Monterey, California, USA,
                 15--19 May 1995}",
  title =        "{Proceedings of the First International AMIP
                 Scientific Conference: Monterey, California, USA,
                 15--19 May 1995}",
  number =       "732",
  publisher =    "World Meteorological Organization",
  address =      "Geneva, Switzerland",
  pages =        "viii + 532",
  year =         "1995",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "SIO 1 WO326 v.92",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "World Meteorological Organization --- Publications ---
                 WMO TD 1995",
  acknowledgement = ack-nhfb,
  sponsor =      "Atmospheric Model Intercomparison Project.",
}

@Proceedings{Gray:1995:PCT,
  editor =       "J. P. Gray and F. Naghdy",
  booktitle =    "{Parallel Computing: Technology and Practice. PCAT-94.
                 Proceedings of the 7th Australian Transputer and Occam
                 User Group Conference: Wollongong, NSW, Australia,
                 8--9 November 1994}",
  title =        "{Parallel Computing: Technology and Practice. PCAT-94.
                 Proceedings of the 7th Australian Transputer and Occam
                 User Group Conference: Wollongong, NSW, Australia,
                 8--9 November 1994}",
  publisher =    pub-IOS,
  address =      pub-IOS:adr,
  pages =        "vii + 300",
  year =         "1995",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Wed Apr 16 12:10:49 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Grinstein:1995:VDE,
  editor =       "Georges G. Grinstein and Robert F. Erbacher",
  booktitle =    "{Visual data exploration and analysis II: 8--10
                 February 1995, San Jose, California}",
  title =        "{Visual data exploration and analysis II: 8--10
                 February 1995, San Jose, California}",
  volume =       "2410",
  publisher =    pub-SPIE,
  address =      pub-SPIE:adr,
  pages =        "viii + 482",
  year =         "1995",
  CODEN =        "PSISDG",
  ISBN =         "0-8194-1757-2",
  ISBN-13 =      "978-0-8194-1757-2",
  ISSN =         "0277-786X (print), 1996-756X (electronic)",
  LCCN =         "TS510.S63 v.2410",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       j-PROC-SPIE,
  acknowledgement = ack-nhfb,
  confsponsor =  "SPIE",
}

@Proceedings{Hamza:1995:PII,
  editor =       "M. H. Hamza",
  booktitle =    "{Proceedings of the IASTED International Conference.
                 Modelling and Simulation: Pittsburgh, PA, USA, 27--29
                 April 1995}",
  title =        "{Proceedings of the IASTED International Conference.
                 Modelling and Simulation: Pittsburgh, PA, USA, 27--29
                 April 1995}",
  publisher =    "IASTED-Acta Press",
  address =      "Anaheim, CA, USA",
  pages =        "598",
  year =         "1995",
  ISBN =         "0-88986-218-4",
  ISBN-13 =      "978-0-88986-218-0",
  LCCN =         "QA76.9.C65 I295 1995",
  bibdate =      "Fri Feb 01 06:58:29 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Haridi:1995:EPP,
  editor =       "Seif Haridi and Khayri Ali and Peter Magnusson",
  booktitle =    "{EURO-PAR '95 parallel processing: First International
                 EURO PAR Conference, Stockholm, Sweden, August 29--31,
                 1995: proceedings}",
  title =        "{EURO-PAR '95 parallel processing: First International
                 EURO PAR Conference, Stockholm, Sweden, August 29--31,
                 1995: proceedings}",
  number =       "966",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xv + 730",
  year =         "1995",
  ISBN =         "3-540-60247-X",
  ISBN-13 =      "978-3-540-60247-7",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58.I553 1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  corpsource =   "Centro Svizzero de Calcolo Sci., Eidgenossische Tech.
                 Hochschule, Manno, Switzerland",
  pubcountry =   "Germany",
  sponsor =      "Swedish Institute of Computer Science.",
  treatment =    "P Practical",
}

@Proceedings{Hassanzadeh:1995:MMG,
  editor =       "Siamak Hassanzadeh",
  booktitle =    "{Mathematical methods in geophysical imaging III:
                 12--13 July 1995, San Diego, California}",
  title =        "{Mathematical methods in geophysical imaging III:
                 12--13 July 1995, San Diego, California}",
  volume =       "2571",
  publisher =    pub-SPIE,
  address =      pub-SPIE:adr,
  pages =        "vii + 240",
  year =         "1995",
  CODEN =        "PSISDG",
  ISBN =         "0-8194-1930-3",
  ISBN-13 =      "978-0-8194-1930-9",
  ISSN =         "0277-786X (print), 1996-756X (electronic)",
  LCCN =         "TS510.S63 v.2571",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       j-PROC-SPIE,
  acknowledgement = ack-nhfb,
  confsponsor =  "SPIE",
}

@Proceedings{Hertzberger:1995:HPM,
  editor =       "Bob Hertzberger and Giuseppe Serazzi",
  booktitle =    "{High-Per\-for\-mance computing and networking:
                 International Conference and Exhibition, Milan, Italy,
                 May 3--5, 1995: proceedings}",
  title =        "{High-Per\-for\-mance computing and networking:
                 International Conference and Exhibition, Milan, Italy,
                 May 3--5, 1995: proceedings}",
  number =       "919",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xxiv + 957",
  year =         "1995",
  ISBN =         "3-540-59393-4",
  ISBN-13 =      "978-3-540-59393-5",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.88 .I57 1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  sponsor =      "High Performance Computing and Networking
                 Foundation.",
}

@Proceedings{Hoffmann:1995:CAP,
  editor =       "Geerd-R. Hoffmann and Norbert Kreitz",
  booktitle =    "{Coming of age: proceedings of the Sixth ECMWF
                 Workshop on the Use of Parallel Processors in
                 Meteorology, Reading, UK, November 21--25, 1994}",
  title =        "{Coming of age: proceedings of the Sixth ECMWF
                 Workshop on the Use of Parallel Processors in
                 Meteorology, Reading, UK, November 21--25, 1994}",
  publisher =    pub-WORLD-SCI,
  address =      pub-WORLD-SCI:adr,
  pages =        "x + 568",
  year =         "1995",
  ISBN =         "981-02-2211-4",
  ISBN-13 =      "978-981-02-2211-6",
  LCCN =         "QC866.E26 1994",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  pubcountry =   "Singapore",
}

@Proceedings{IEEE:1995:CPI,
  editor =       "{IEEE}",
  booktitle =    "{Conference proceedings of the 1995 IEEE Fourteenth
                 Annual International Phoenix Conference on Computers
                 and Communications: Scottsdale, Arizona, USA, March
                 28--31, 1995}",
  title =        "{Conference proceedings of the 1995 IEEE Fourteenth
                 Annual International Phoenix Conference on Computers
                 and Communications: Scottsdale, Arizona, USA, March
                 28--31, 1995}",
  volume =       "14",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xvii + 742",
  year =         "1995",
  ISBN =         "0-7803-2493-5, 0-7803-2492-7, 0-7803-2494-3",
  ISBN-13 =      "978-0-7803-2493-0, 978-0-7803-2492-3,
                 978-0-7803-2494-7",
  LCCN =         "TK7885.A1 I567 1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog no. 95CH35751.",
  acknowledgement = ack-nhfb,
  sponsor =      "IEEE.",
}

@Proceedings{IEEE:1995:DPT,
  editor =       "{IEEE}",
  booktitle =    "{Digest of papers / the Twenty-fifth International
                 Symposium on Fault-Tolerant Computing, June 27--30,
                 1995, Pasadena, California}",
  title =        "{Digest of papers / the Twenty-fifth International
                 Symposium on Fault-Tolerant Computing, June 27--30,
                 1995, Pasadena, California}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xxiii + 547",
  year =         "1995",
  ISBN =         "0-8186-7079-7",
  ISBN-13 =      "978-0-8186-7079-4",
  LCCN =         "QA 76.9 F38 I57 1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog no. 95CB35823.",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE Comput. Soc. Tech. Committee on Fault-Tolerant
                 Comput.; LAAS-CNRS, France; Univ. Illinois at
                 Urbana-Champaign; Univ. California at Los Angeles; Jet
                 Propulsion Lab.; IFIP WG 10.4",
}

@Proceedings{IEEE:1995:IIC,
  editor =       "{IEEE}",
  booktitle =    "{1995 IEEE International Conference on Systems, Man,
                 and Cybernetics: intelligent systems for the 21st
                 century: Vancouver, British Columbia, Canada, October
                 22--25, 1995}",
  title =        "{1995 IEEE International Conference on Systems, Man,
                 and Cybernetics: intelligent systems for the 21st
                 century: Vancouver, British Columbia, Canada, October
                 22--25, 1995}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "4711",
  year =         "1995",
  ISBN =         "0-7803-2559-1",
  ISBN-13 =      "978-0-7803-2559-3",
  LCCN =         "TA168.I19 1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Five volumes. IEEE catalog no. 95CH3576-7.",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1995:IPR,
  editor =       "{IEEE}",
  booktitle =    "{IEEE Pacific Rim Conference on Communications,
                 Computers, and Signal Processing: proceedings / May
                 17--19, 1995, Victoria Conference Centre, Victoria,
                 British Columbia, Canada}",
  title =        "{IEEE Pacific Rim Conference on Communications,
                 Computers, and Signal Processing: proceedings / May
                 17--19, 1995, Victoria Conference Centre, Victoria,
                 British Columbia, Canada}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiv + 638",
  year =         "1995",
  ISBN =         "0-7803-2553-2",
  ISBN-13 =      "978-0-7803-2553-1",
  LCCN =         "TK 5101 A1 I34 1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog no. 95CH35765.",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE Victoria Sect.; IEEE Canada; Dept. Comput. Sci.
                 and the Fac. Eng., Univ. Victoria",
}

@Proceedings{IEEE:1995:ISE,
  editor =       "{IEEE}",
  booktitle =    "{Ideas in Science and Electronics Exposition and
                 Symposium. Proceedings: Albuquerque, NM, USA, 9--11 May
                 1995}",
  title =        "{Ideas in Science and Electronics Exposition and
                 Symposium. Proceedings: Albuquerque, NM, USA, 9--11 May
                 1995}",
  volume =       "17",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "64",
  year =         "1995",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Annual Ideas in Science and Electronics Exposition and
                 Symposium Conference",
  acknowledgement = ack-nhfb,
  sponsor =      "IEEE.",
}

@Proceedings{IEEE:1995:PEW,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings: Euromicro Workshop on Parallel and
                 Distributed Processing, San Remo, Italy, January
                 25--27, 1995}",
  title =        "{Proceedings: Euromicro Workshop on Parallel and
                 Distributed Processing, San Remo, Italy, January
                 25--27, 1995}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiii + 582",
  year =         "1995",
  ISBN =         "0-8186-7031-2, 0-8186-7032-0",
  ISBN-13 =      "978-0-8186-7031-2, 978-0-8186-7032-9",
  LCCN =         "QA76.58 .E97 1995",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Euromicro Workshop on Parallel and Distributed
                 Processing 1995; 3rd",
  acknowledgement = ack-nhfb,
  xxeditor1 =    "I. Martin and J. C. Fabero and F. Tirado and A.
                 Bautista",
  xxeditor2 =    "V. Gianuzzi and F. Merani",
}

@Proceedings{IEEE:1995:PFI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of the Fourth IEEE International
                 Symposium on High Performance Distributed Computing,
                 August 2--4, 1995, Washington, DC, USA}",
  title =        "{Proceedings of the Fourth IEEE International
                 Symposium on High Performance Distributed Computing,
                 August 2--4, 1995, Washington, DC, USA}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiv + 246",
  year =         "1995",
  ISBN =         "0-8186-7088-6",
  ISBN-13 =      "978-0-8186-7088-6",
  LCCN =         "QA76.9.D5 I328 1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog no. 95TB8075.",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE Tech. Committee on Distrib. Process.; Northeast
                 Parallel Architectures Centre (NPAC) at Syracuse Univ.;
                 ACM SIGCOMM; Rome Lab",
  sponsor =      "IEEE. Computer Society. Technical Committee on
                 Distributed Processing Northeast Parallel Architectures
                 Center.",
}

@Proceedings{IEEE:1995:PIC,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of the 15th International Conference on
                 Distributed Computing Systems: Vancouver, BC, Canada,
                 30 May--2 June 1995}",
  title =        "{Proceedings of the 15th International Conference on
                 Distributed Computing Systems: Vancouver, BC, Canada,
                 30 May--2 June 1995}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xx + 537",
  year =         "1995",
  ISBN =         "0-8186-7025-8",
  ISBN-13 =      "978-0-8186-7025-1",
  LCCN =         "????",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 95CH35784.",
  acknowledgement = ack-nhfb,
  corpsource =   "IBM Thomas J. Watson Res. Center, Yorktown Heights,
                 NY, USA",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "A Application; P Practical",
}

@Proceedings{IEEE:1995:PIP,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings / 9th International Parallel Processing
                 Symposium, April 25--28, 1995, Santa Barbara,
                 California}",
  title =        "{Proceedings / 9th International Parallel Processing
                 Symposium, April 25--28, 1995, Santa Barbara,
                 California}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xxiii + 851",
  year =         "1995",
  ISBN =         "0-8186-7074-6",
  ISBN-13 =      "978-0-8186-7074-9",
  LCCN =         "QA 76.58 I56 1995",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog no. 95TH8052.",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE Comput. Soc. Tech. Committee on Parallel
                 Process",
}

@Proceedings{IEEE:1995:PNA,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings: the nineteenth annual International
                 Computer Software and Applications Conference (COMPSAC
                 '95): August 9--11, 1995, Dallas, Texas}",
  title =        "{Proceedings: the nineteenth annual International
                 Computer Software and Applications Conference (COMPSAC
                 '95): August 9--11, 1995, Dallas, Texas}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xviii + 431",
  year =         "1995",
  ISBN =         "0-8186-7119-X",
  ISBN-13 =      "978-0-8186-7119-7",
  LCCN =         "QA 76.6 C6295 1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog no. 95CB35838.",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE Comput. Soc",
}

@Proceedings{IEEE:1995:PSI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings / Seventh IEEE Symposium on Parallel and
                 Distributed Processing, October 25--28, 1995, San
                 Antonio, Texas}",
  title =        "{Proceedings / Seventh IEEE Symposium on Parallel and
                 Distributed Processing, October 25--28, 1995, San
                 Antonio, Texas}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xvii + 724",
  year =         "1995",
  ISBN =         "0-8186-7195-5",
  ISBN-13 =      "978-0-8186-7195-1",
  LCCN =         "QA 76.58 I42 1995",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 95TB8131.",
  acknowledgement = ack-nhfb,
  conflocation = "San Antonio, TX, USA; 25--28 Oct. 1995",
  confsponsor =  "IEEE Comput. Soc. Tech. Committee on Comput.
                 Architecture; IEEE Comput. Soc. Tech. Committee on
                 Distributed Process.; IEEE Comput. Soc. Dallas
                 Chapter",
  conftitle =    "Proceedings of Seventh IEEE Symposium on Parallel and
                 Distributed Processing",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Comput.
                 Architecture; IEEE Comput. Soc. Tech. Committee on
                 Distributed Process.; IEEE Comput. Soc. Dallas
                 Chapter",
  treatment =    "P Practical",
}

@Proceedings{IEEE:1995:PSP,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of the 1994 Scalable Parallel Libraries
                 Conference: October 12--14, 1994, Mississippi State
                 University, Mississippi}",
  title =        "{Proceedings of the 1994 Scalable Parallel Libraries
                 Conference: October 12--14, 1994, Mississippi State
                 University, Mississippi}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "vii + 191",
  year =         "1995",
  ISBN =         "0-8186-6895-4",
  ISBN-13 =      "978-0-8186-6895-1",
  LCCN =         "QA76.58 .S34 1994",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  conflocation = "Mississippi State, MS, USA; 12--14 Oct. 1994",
  confsponsor =  "Mississippi State Univ.; NSF",
  conftitle =    "Proceedings Scalable Parallel Libraries Conference",
  corpsource =   "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  sponsororg =   "Mississippi State Univ.; NSF",
  treatment =    "P Practical",
}

@Proceedings{IFIP:1995:KWC,
  editor =       "{IFIP Working Group 2.5}",
  booktitle =    "{Kyoto Workshop 1995: Current Directions in Numerical
                 Software and High Performance Computing, 19--20 October
                 1995, Kyoto, Japan}",
  title =        "{Kyoto Workshop 1995: Current Directions in Numerical
                 Software and High Performance Computing, 19--20 October
                 1995, Kyoto, Japan}",
  publisher =    "????",
  address =      "????",
  pages =        "????",
  year =         "1995",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Wed Jan 24 06:55:27 2001",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.nsc.liu.se/~boein/ifip/kyoto/kyoto.html#reid;
                 http://www.nsc.liu.se/~boein/ifip/kyoto/workshop-info/proceedings/",
  acknowledgement = ack-nhfb,
}

@Proceedings{Levelt:1995:IIS,
  editor =       "A. H. M. Levelt",
  booktitle =    "{ISSAC '95: International symposium on symbolic and
                 algebraic computation --- July 10--12, 1995,
                 Montr{\'e}al, Canada}",
  title =        "{ISSAC '95: International symposium on symbolic and
                 algebraic computation --- July 10--12, 1995,
                 Montr{\'e}al, Canada}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xviii + 314",
  year =         "1995",
  ISBN =         "0-89791-699-9",
  ISBN-13 =      "978-0-89791-699-8",
  LCCN =         "QA 76.95 I59 1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "ISSAC --- Proceedings",
  acknowledgement = ack-nhfb,
}

@Proceedings{Malyshkin:1995:PCT,
  editor =       "Victor Malyshkin",
  booktitle =    "{Parallel computing technologies: third international
                 conference, PaCT-95, St. Petersburg, Russia, September
                 12--15, 1995: proceedings}",
  title =        "{Parallel computing technologies: third international
                 conference, PaCT-95, St. Petersburg, Russia, September
                 12--15, 1995: proceedings}",
  number =       "964",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xii + 495",
  year =         "1995",
  ISBN =         "3-540-60222-4",
  ISBN-13 =      "978-3-540-60222-4",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58.I547 1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  sponsor =      "Russian Academy of Sciences. Computing Center
                 Electrotechnical University of St. Petersburg.",
}

@Proceedings{Nadeau:1995:SVR,
  editor =       "David R. Nadeau and John L. Moreland",
  booktitle =    "{1995 Symposium on the Virtual Reality Modeling
                 Language, VRML '95, San Diego, California, December
                 14--15, 1995}",
  title =        "{1995 Symposium on the Virtual Reality Modeling
                 Language, VRML '95, San Diego, California, December
                 14--15, 1995}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "139",
  year =         "1995",
  ISBN =         "0-89791-818-5",
  ISBN-13 =      "978-0-89791-818-3",
  LCCN =         "QA76.76.H94 S95 1995",
  bibdate =      "Fri Sep 11 08:29:11 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "ACM order number 434953.",
  acknowledgement = ack-nhfb,
  confsponsor =  "San Diego Supercomput. Center; ACM",
  keywords =     "SGML; Virtual reality --- Congresses; VRML (Computer
                 program language) --- Congresses",
}

@Proceedings{Narashimhan:1995:IIF,
  editor =       "V. L. Narasimhan",
  booktitle =    "{ICAPP 95. IEEE First International Conference on
                 Algorithms and Architectures for Parallel Processing,
                 Brisbane, Australia, 19--21 April, 1995}",
  title =        "{ICAPP 95. IEEE First International Conference on
                 Algorithms and Architectures for Parallel Processing,
                 Brisbane, Australia, 19--21 April, 1995}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xvii + 961",
  year =         "1995",
  ISBN =         "0-7803-2018-2 (paperback), 0-7803-2019-0
                 (microfiche)",
  ISBN-13 =      "978-0-7803-2018-5 (paperback), 978-0-7803-2019-2
                 (microfiche)",
  LCCN =         "QA76.6.I15 1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Two volumes. IEEE catalog no. 95TH0682-5.",
  acknowledgement = ack-nhfb,
  confsponsor =  "Parallel Algorithms, Archit. and Software Eng. Res.
                 Lab.; IEEE; IEEE Comput. Soc.; ACM; Euromicro; IBM;
                 Instn. Eng. Australia; Inst. Radio and Electron. Eng.
                 Soc.; Australian Comput. Soc",
}

@Proceedings{Pahl:1995:CCB,
  editor =       "Peter Jan Pahl and Heinrich Werner",
  booktitle =    "{Computing in civil and building engineering: 6th
                 International conference --- July 1995, Berlin}",
  title =        "{Computing in civil and building engineering: 6th
                 International conference --- July 1995, Berlin}",
  publisher =    "A. A. Balkema",
  address =      "Brookfield, VT, USA",
  pages =        "xxiv + 1641",
  year =         "1995",
  ISBN =         "90-5410-556-9, 90-5410-557-7",
  ISBN-13 =      "978-90-5410-556-5, 978-90-5410-557-2",
  LCCN =         "TA345 .I565 1995 v.1-2",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Two volumes.",
  series =       "Computing in Civil and Building Engineering 6th",
  acknowledgement = ack-nhfb,
  sponsor =      "Arbeitskreis Bauinformatik
                 Technologie-Vermittlungs-Agentur Berlin e.V.",
}

@Proceedings{Pingali:1995:LCP,
  editor =       "K. Pingali and U. Banerjee and D. Gelernter and A.
                 Nicolau and D. Padua",
  booktitle =    "{Languages and compilers for parallel computing: 7th
                 International Workshop, Ithaca, NY, USA, August 8--10,
                 1994: proceedings}",
  title =        "{Languages and compilers for parallel computing: 7th
                 International Workshop, Ithaca, NY, USA, August 8--10,
                 1994: proceedings}",
  volume =       "892",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xl + 496",
  year =         "1995",
  ISBN =         "3-540-58868-X",
  ISBN-13 =      "978-3-540-58868-9",
  LCCN =         "QA76.58 .W656 1994",
  bibdate =      "Sun Dec 22 10:20:45 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  pubcountry =   "Germany",
}

@Proceedings{Prasanna:1995:FIP,
  editor =       "Viktor K. Prasanna and V. P. Bhatkar and L. M. Patnaik
                 and S. K. Tripathi",
  booktitle =    "{First IWPP parallel processing: proceedings of the
                 First International Workshop on Parallel Processing
                 (IWPP-94): December 26--31, 1994, Bangalore, India}",
  title =        "{First IWPP parallel processing: proceedings of the
                 First International Workshop on Parallel Processing
                 (IWPP-94): December 26--31, 1994, Bangalore, India}",
  publisher =    "Tata McGraw-Hill Pub. Co",
  address =      "New Delhi; New York",
  pages =        "xxiii + 736",
  year =         "1995",
  ISBN =         "0-07-462332-X",
  ISBN-13 =      "978-0-07-462332-9",
  LCCN =         "QA 76.58 I587 1994",
  bibdate =      "Wed Apr 16 14:07:03 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Satofuka:1995:PCF,
  editor =       "N. Satofuka and Jacques Periaux and Akin Ecer",
  booktitle =    "{Parallel computational fluid dynamics: new algorithms
                 and applications: proceedings of the Parallel CFD '94
                 Conference, Kyoto, Japan, 16--19 May 1994}",
  title =        "{Parallel computational fluid dynamics: new algorithms
                 and applications: proceedings of the Parallel CFD '94
                 Conference, Kyoto, Japan, 16--19 May 1994}",
  publisher =    pub-ELS,
  address =      pub-ELS:adr,
  pages =        "xi + 457",
  year =         "1995",
  ISBN =         "0-444-82317-4",
  ISBN-13 =      "978-0-444-82317-5",
  LCCN =         "QA911 .P35 1994",
  bibdate =      "Wed Apr 16 07:34:31 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  keywords =     "fluid dynamics -- data processing -- congresses;
                 parallel processing (electronic computers) --
                 congresses; supercomputers -- congresses",
}

@Proceedings{Shaw:1995:ADA,
  editor =       "R. A. (Richard A.) Shaw and H. E. (Harry E.) Payne and
                 J. J. E. (Jeffrey J. E.) Hayes",
  booktitle =    "{Astronomical data analysis software and systems IV:
                 meeting held at Baltimore, Maryland, 25--28 September
                 1994}",
  title =        "{Astronomical data analysis software and systems IV:
                 meeting held at Baltimore, Maryland, 25--28 September
                 1994}",
  volume =       "77",
  publisher =    "Astronomical Society of the Pacific",
  address =      "San Francisco, CA, USA",
  pages =        "xxxvi + 533",
  year =         "1995",
  ISBN =         "0-937707-96-1",
  ISBN-13 =      "978-0-937707-96-8",
  ISSN =         "1080-7926",
  LCCN =         "QB51.3.E43 A87 1994",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Astronomical Society of the Pacific Conference
                 Series",
  acknowledgement = ack-nhfb,
  sponsor =      "Astronomical Society of the Pacific.",
}

@Proceedings{Tentner:1995:HPC,
  editor =       {A. Tentner},
  booktitle =    {{High Performance Computing Symposium 1995 `Grand
                 Challenges in Computer Simulation'. Proceedings of the
                 1995 Simulation Multiconference: Phoenix, AZ, USA,
                 9--13 April 1995}},
  title =        {{High Performance Computing Symposium 1995 `Grand
                 Challenges in Computer Simulation'. Proceedings of the
                 1995 Simulation Multiconference: Phoenix, AZ, USA,
                 9--13 April 1995}},
  publisher =    {Society for Computer Simulation},
  address =      {San Diego, CA, USA},
  pages =        {xxiii + 566},
  year =         {1995},
  ISBN =         {1-56555-078-1},
  ISBN-13 =      {978-1-56555-078-0},
  LCCN =         {????},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  corpsource =   {Oak Ridge Nat. Lab., TN, USA},
  sponsororg =   {SCS},
  treatment =    {P Practical},
}

@Proceedings{Uselton:1995:PRS,
  editor =       {Samuel P. Uselton and Michael Brian Cox and Craig M.
                 Wittenbrink},
  booktitle =    {{1995 Parallel Rendering Symposium (PRS 95): Atlanta,
                 Georgia, October 30--31, 1995}},
  title =        {{1995 Parallel Rendering Symposium (PRS 95): Atlanta,
                 Georgia, October 30--31, 1995}},
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        {107},
  year =         {1995},
  ISBN =         {0-89791-774-1 (softbound) [invalid checksum],
                 0-7803-3120-6 (microfiche)},
  ISBN-13 =      {978-0-89791-774-2 (softbound), 978-0-7803-3120-4
                 (microfiche)},
  LCCN =         {QA76.58.P3778 1995},
  bibdate =      {Sun Dec 22 10:19:23 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {ACM order number 428957. IEEE Computer Society Press
                 order number 95TB8134.},
  acknowledgement = ack-nhfb,
  confsponsor =  {IEEE Comput. Soc. Techn. Committee on Comput.
                 Graphics; ACM SIGGRAPH},
}

@Proceedings{USENIX:1995:PUT,
  editor =       {{USENIX}},
  booktitle =    {{Proceedings of the 1995 USENIX Technical Conference,
                 January 16--20, 1995, New Orleans, Louisiana, USA}},
  title =        {{Proceedings of the 1995 USENIX Technical Conference,
                 January 16--20, 1995, New Orleans, Louisiana, USA}},
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        {325},
  year =         {1995},
  ISBN =         {1-880446-67-7},
  ISBN-13 =      {978-1-880446-67-6},
  LCCN =         {QA 76.76 O63 U88 1995},
  bibdate =      {Sun Dec 22 10:19:23 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Proceedings{Vandoni:1995:CSC,
  editor =       {C. E. Vandoni and C. Verkerk},
  booktitle =    {{1994 CERN School of Computing: Sopron, Hungary, 28
                 August--10 September 1994: proceedings}},
  title =        {{1994 CERN School of Computing: Sopron, Hungary, 28
                 August--10 September 1994: proceedings}},
  publisher =    {CERN},
  address =      {Geneva, Switzerland},
  pages =        {ix + 336},
  year =         {1995},
  ISBN =         {92-9083-069-7},
  ISBN-13 =      {978-92-9083-069-6},
  bibdate =      {Sun Dec 22 10:20:45 MST 1996},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {CERN report 95-01.},
  acknowledgement = ack-nhfb,
  pubcountry =   {Switzerland},
}

@Proceedings{VanKatwijk:1995:AAC,
  editor =       "Jan {Van Katwijk}",
  booktitle =    "{ASCI '95: 1st Annual conference --- May 1995,
                 Heijen, The Netherlands}",
  title =        "{ASCI '95: 1st Annual conference --- May 1995,
                 Heijen, The Netherlands}",
  publisher =    "ASCI",
  address =      "Delft, The Netherlands",
  pages =        "xi + 450",
  year =         "1995",
  ISBN =         "90-90-08344-8",
  ISBN-13 =      "978-90-90-08344-5",
  LCCN =         "QA75.5 .A38x 1995",
  bibdate =      "Thu Feb 29 17:59:11 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Proceedings of the Annual Conference --- Advanced
                 School for Computing and Imaging, 1st",
  acknowledgement = ack-nhfb,
  sponsor =      "Advanced School for Computing and Imaging",
}

@Proceedings{Abrahart:1996:GIC,
  editor =       {R. J. Abrahart},
  booktitle =    {{GeoComputation 96. 1st International Conference on
                 GeoComputation: Leeds, UK, 17--19 September 1996}},
  title =        {{GeoComputation 96. 1st International Conference on
                 GeoComputation: Leeds, UK, 17--19 September 1996}},
  publisher =    {????},
  address =      {????},
  pages =        {????},
  year =         {1996},
  ISBN =         {????},
  ISBN-13 =      {????},
  LCCN =         {????},
  bibdate =      {Wed Apr 16 14:19:17 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:1996:FCP,
  editor =       "{ACM}",
  booktitle =    "{FCRC '96: Conference proceedings of the 1996
                 International Conference on Supercomputing:
                 Philadelphia, Pennsylvania, USA, May 25--28, 1996}",
  title =        "{FCRC '96: Conference proceedings of the 1996
                 International Conference on Supercomputing:
                 Philadelphia, Pennsylvania, USA, May 25--28, 1996}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xii + 406",
  year =         "1996",
  ISBN =         "0-89791-803-7",
  ISBN-13 =      "978-0-89791-803-9",
  LCCN =         "QA76.5 I61 1996",
  bibdate =      "Wed Mar 18 12:33:29 MST 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "ACM order number 415961.",
  acknowledgement = ack-nhfb,
  keywords =     "supercomputers -- congresses",
}

@Proceedings{ACM:1996:SCP,
  editor =       {{ACM}},
  booktitle =    {{Supercomputing '96 Conference Proceedings: November
                 17--22, Pittsburgh, PA}},
  title =        {{Supercomputing '96 Conference Proceedings: November
                 17--22, Pittsburgh, PA}},
  publisher =    pub-ACM # { and } # pub-IEEE,
  address =      pub-ACM:adr # { and } # pub-IEEE:adr,
  pages =        {????},
  year =         {1996},
  ISBN =         {0-89791-854-1},
  ISBN-13 =      {978-0-89791-854-1},
  LCCN =         {QA 76.88 S8573 1996},
  bibdate =      {Tue May 12 08:55:21 1998},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {ACM Order Number: 415962, IEEE Computer Society Press
                 Order Number: RS00126.},
  URL =          {http://www.supercomp.org/sc96/proceedings/},
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:1996:SVR,
  editor =       "{ACM}",
  booktitle =    "{1995 Symposium on the Virtual Reality Modeling
                 Language (VRML '95)}",
  title =        "{1995 Symposium on the Virtual Reality Modeling
                 Language (VRML '95)}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "139",
  year =         "1996",
  ISBN =         "0-89791-818-5",
  ISBN-13 =      "978-0-89791-818-3",
  LCCN =         "????",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.acm.org/pubs/contents/proceedings/graph/217306/",
  acknowledgement = ack-nhfb,
  conflocation = "San Diego, CA, USA; 14-15 Dec. 1995",
  conftitle =    "Proceedings of 1995 VRML Workshop",
  corpsource =   "Visual Comput. Lab., California Univ., San Diego, La
                 Jolla, CA, USA",
  sponsororg =   "San Diego Supercomput. Center; ACM",
  treatment =    "P Practical",
}

@Proceedings{Bode:1996:PVM,
  editor =       {Arndt Bode and Jack Dongarra and T. Ludwig and V.
                 Sunderam},
  booktitle =    {{Parallel virtual machine, EuroPVM '96: third European
                 PVM conference, Munich, Germany, October 7--9, 1996:
                 proceedings}},
  title =        {{Parallel virtual machine, EuroPVM '96: third European
                 PVM conference, Munich, Germany, October 7--9, 1996:
                 proceedings}},
  volume =       {1156},
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        {xiv + 362},
  year =         {1996},
  ISBN =         {3-540-61779-5},
  ISBN-13 =      {978-3-540-61779-2},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  LCCN =         {QA76.58.E975 1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  conflocation = {Munich, Germany; 7-9 Oct. 1996},
  conftitle =    {Parallel Virtual Machine - EuroPVM '96. Third European
                 PVM Conference. Proceedings},
  corpsource =   {Computations and Commun. Res. Labs., NEC Europe Ltd.,
                 Sankt Augustin, Germany},
  keywords =     {Parallel computers -- Congresses; Virtual computer
                 systems -- Congresses.},
  pubcountry =   {Germany},
  treatment =    {P Practical},
}

@Proceedings{Boszormenyi:1996:PCT,
  editor =       "L{\'a}szl{\'o} B{\"o}sz{\"o}rm{\'e}nyi",
  booktitle =    "{Parallel computation: Third International ACPC
                 Conference with special emphasis on parallel databases
                 and parallel I/O, Klagenfurt, Austria, September
                 23--25, 1996: proceedings}",
  title =        "{Parallel computation: Third International ACPC
                 Conference with special emphasis on parallel databases
                 and parallel I/O, Klagenfurt, Austria, September
                 23--25, 1996: proceedings}",
  volume =       "1127",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xi + 234",
  year =         "1996",
  ISBN =         "3-540-61695-0",
  ISBN-13 =      "978-3-540-61695-5",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA267.A1 L43 no.1127",
  bibdate =      "Wed Apr 16 07:34:31 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  keywords =     "parallel processing (electronic computers) --
                 congresses",
}

@Proceedings{Bouge:1996:EPP,
  editor =       "Luc Boug{\'e} and P. Fraigniaud and A. Mignotte and Y.
                 Robert",
  booktitle =    "{Euro-Par '96 parallel processing: second
                 International Euro-Par Conference, Lyon, France, August
                 26--29, 1996: proceedings}",
  title =        "{Euro-Par '96 parallel processing: second
                 International Euro-Par Conference, Lyon, France, August
                 26--29, 1996: proceedings}",
  volume =       "1123--1124",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xxxiii + 842 (vol. 1), 926 (vol. 2)",
  year =         "1996",
  ISBN =         "3-540-61626-8 (vol. 1), 3-540-61627-6 (vol. 2)",
  ISBN-13 =      "978-3-540-61626-9 (vol. 1), 978-3-540-61627-6 (vol.
                 2)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58.I554 1996, QA267.A1 L43 no.1123-1124",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Two volumes.",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  conflocation = "Lyon, France; 26-29 Aug. 1996",
  conftitle =    "Proceedings of European Conference on Parallel
                 Processing EURO-PAR '96",
  corpsource =   "Oak Ridge Nat. Lab., TN, USA",
  keywords =     "parallel processing (electronic computers) --
                 congresses",
  pubcountry =   "Germany",
  treatment =    "P Practical",
}

@Proceedings{Ciancarini:1996:CLM,
  editor =       "Paolo Ciancarini and Chris Hankin",
  booktitle =    "{Coordination languages and models: First
                 International Conference COORDINATION '96, Cesena,
                 Italy, April 15--17, 1996: proceedings}",
  title =        "{Coordination languages and models: First
                 International Conference COORDINATION '96, Cesena,
                 Italy, April 15--17, 1996: proceedings}",
  volume =       "1061",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xi + 443",
  year =         "1996",
  ISBN =         "3-540-61052-9",
  ISBN-13 =      "978-3-540-61052-6",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58.I52 1996",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
}

@Proceedings{Dongarra:1996:APC,
  editor =       "Jack J. Dongarra and Kay Madsen and Jerzy
                 Wa{\'s}niewski",
  booktitle =    "{Applied parallel computing: computations in physics,
                 chemistry, and engineering science: second
                 international workshop, PARA '95, Lyngby, Denmark,
                 August 21--24, 1995: proceedings}",
  title =        "{Applied parallel computing: computations in physics,
                 chemistry, and engineering science: second
                 international workshop, PARA '95, Lyngby, Denmark,
                 August 21--24, 1995: proceedings}",
  volume =       "1041",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "562",
  year =         "1996",
  ISBN =         "3-540-60902-4",
  ISBN-13 =      "978-3-540-60902-5",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58.P35 1995",
  bibdate =      "Wed Aug 14 10:49:23 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  sponsor =      "Danish Computing Centre for Research and Education;
                 Technical University of Denmark. Institute of
                 Mathematical Modeling; Danish Natural Science Research
                 Council",
}

@Proceedings{El-Rewini:1996:PTN,
  editor =       {Hesham El-Rewini and Bruce D. Shriver},
  booktitle =    {{Proceedings of the Twenty-Ninth Hawaii International
                 Conference on System Sciences (HICSS-29): Wailea, HI,
                 USA, 3--6 January 1996}},
  title =        {{Proceedings of the Twenty-Ninth Hawaii International
                 Conference on System Sciences (HICSS-29): Wailea, HI,
                 USA, 3--6 January 1996}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {various},
  year =         {1996},
  ISBN =         {0-8186-7324-9},
  ISBN-13 =      {978-0-8186-7324-5},
  ISSN =         {1060-3425},
  LCCN =         {????},
  bibdate =      {Wed Apr 16 14:12:08 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {Five volumes.},
  acknowledgement = ack-nhfb,
}

@Proceedings{Grangeat:1996:PTI,
  editor =       {Pierre Grangeat and Jean-Louis Amans},
  booktitle =    {{Proceedings of the Third International Meeting on
                 Fully Three-Dimensional Image Reconstruction in
                 Radiology and Nuclear Medicine, held July 4--6, 1995 at
                 Domaine d'Aix-Marlioz, Aix-les-Bains, France}},
  title =        {{Proceedings of the Third International Meeting on
                 Fully Three-Dimensional Image Reconstruction in
                 Radiology and Nuclear Medicine, held July 4--6, 1995 at
                 Domaine d'Aix-Marlioz, Aix-les-Bains, France}},
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        {x + 315},
  year =         {1996},
  ISBN =         {0-7923-4129-5},
  ISBN-13 =      {978-0-7923-4129-1},
  LCCN =         {R857.T47 T485 1996},
  bibdate =      {Wed Apr 16 10:20:43 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Proceedings{Grinstein:1996:VDE,
  editor =       "Georges G. Grinstein and Robert F. Erbacher",
  booktitle =    "{Visual data exploration and analysis III: 31
                 January--2 February, 1996, San Jose, California}",
  title =        "{Visual data exploration and analysis III: 31
                 January--2 February, 1996, San Jose, California}",
  volume =       "2656",
  publisher =    pub-SPIE,
  address =      pub-SPIE:adr,
  pages =        "ix + 404",
  year =         "1996",
  CODEN =        "PSISDG",
  ISBN =         "0-8194-2030-1",
  ISBN-13 =      "978-0-8194-2030-5",
  ISSN =         "0277-786X (print), 1996-756X (electronic)",
  LCCN =         "TS510.S63 v.2656",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       j-PROC-SPIE,
  acknowledgement = ack-nhfb,
  confsponsor =  "SPIE; Soc. Imaging Sci. and Technol",
}

@Proceedings{IEEE:1996:EIS,
  editor =       {{IEEE}},
  booktitle =    {{Eighth IEEE Symposium on Parallel and Distributed
                 Processing: October 23--26, 1996, New Orleans,
                 Louisiana}},
  title =        {{Eighth IEEE Symposium on Parallel and Distributed
                 Processing: October 23--26, 1996, New Orleans,
                 Louisiana}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xv + 618},
  year =         {1996},
  ISBN =         {0-8186-7683-3, 0-8186-7685-X (microfiche)},
  ISBN-13 =      {978-0-8186-7683-3, 978-0-8186-7685-7 (microfiche)},
  LCCN =         {QA76.58 .I42 1996},
  bibdate =      {Wed Apr 16 07:34:31 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE Computer Society Press order number PR07683. IEEE
                 Order Plan catalog number 96TB100088.},
  acknowledgement = ack-nhfb,
  keywords =     {electronic data processing -- distributed processing
                 -- congresses; parallel processing (electronic
                 computers) -- congresses},
}

@Proceedings{IEEE:1996:FSS,
  editor =       "{IEEE}",
  booktitle =    "{Frontiers'96, the Sixth Symposium on the Frontiers of
                 Massively Parallel Computation: October 27--31, 1996,
                 Annapolis, Maryland: proceedings}",
  title =        "{Frontiers'96, the Sixth Symposium on the Frontiers of
                 Massively Parallel Computation: October 27--31, 1996,
                 Annapolis, Maryland: proceedings}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiv + 372",
  year =         "1996",
  ISBN =         "0-8186-7551-9",
  ISBN-13 =      "978-0-8186-7551-5",
  LCCN =         "QA76.58 .S95 1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 96TB100062.",
  acknowledgement = ack-nhfb,
  corpsource =   "Numerical Aerodynamic Simulation, NASA Ames Res.
                 Center, Moffett Field, CA, USA; Centro Svizzero di
                 Calcolo Sci., Manno, Switzerland",
  sponsororg =   "IEEE Comput. Soc.; NASA Goddard Space Flight Center;
                 USRA/CESDIS",
  treatment =    "P Practical",
}

@Proceedings{IEEE:1996:ICH,
  editor =       {{IEEE}},
  booktitle =    {{3rd International Conference on High Performance
                 Computing: proceedings, December 19--22, 1996,
                 Trivandrum, India}},
  title =        {{3rd International Conference on High Performance
                 Computing: proceedings, December 19--22, 1996,
                 Trivandrum, India}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xvi + 476},
  year =         {1996},
  ISBN =         {0-8186-7557-8},
  ISBN-13 =      {978-0-8186-7557-7},
  LCCN =         {QA76.88.I575 1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE catalog number 96TB100074.},
  acknowledgement = ack-nhfb,
  conflocation = {Trivandrum, India; 19-22 Dec. 1996},
  conftitle =    {Proceedings of 3rd International Conference on High
                 Performance Computing (HiPC)},
  corpsource =   {Software Technol. Group, Swiss Center for Sci.
                 Comput., Manno, Switzerland; Div. of Math. and Comput.
                 Sci., Argonne Nat. Lab., IL, USA},
  sponsororg =   {IEEE Comput. Soc.; IEEE Comput. Soc. Tech. Committee
                 on Parallel Process.; ACM SIGARCH},
  treatment =    {P Practical},
}

@Proceedings{IEEE:1996:PFE,
  editor =       {{IEEE}},
  booktitle =    {{Proceedings of the fourth Euromicro Workshop on
                 Parallel and Distributed Processing (PDP '96): January
                 24--26, 1996, Braga, Portugal}},
  title =        {{Proceedings of the fourth Euromicro Workshop on
                 Parallel and Distributed Processing (PDP '96): January
                 24--26, 1996, Braga, Portugal}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xv + 551},
  year =         {1996},
  ISBN =         {0-8186-7376-1},
  ISBN-13 =      {978-0-8186-7376-4},
  LCCN =         {QA76.58 .E97 1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE order number PR07376.},
  acknowledgement = ack-nhfb,
  conflocation = {Braga, Portugal; 24-26 Jan. 1996},
  conftitle =    {Proceedings of 4th Euromicro Workshop on Parallel and
                 Distributed Processing},
  corpsource =   {Coimbra Univ., Portugal},
  keywords =     {electronic data processing -- distributed processing
                 -- congresses; parallel processing (electronic
                 computers) -- congresses; parallel programming
                 (computer science) -- congresses},
  treatment =    {P Practical},
}

@Proceedings{IEEE:1996:PFI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of the Fifth IEEE International Symposium
                 on High Performance Distributed Computing, Syracuse,
                 NY, USA, 6--9 August 1996}",
  title =        "{Proceedings of the Fifth IEEE International Symposium
                 on High Performance Distributed Computing, Syracuse,
                 NY, USA, 6--9 August 1996}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xviii + 642",
  year =         "1996",
  ISBN =         "0-8186-7582-9",
  ISBN-13 =      "978-0-8186-7582-9",
  LCCN =         "QA 76.88 I52 1996",
  bibdate =      "Tue May 12 08:55:41 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 96TB100069.",
  acknowledgement = ack-nhfb,
  corpsource =   "NSF Eng. Res. Center for Comput. Field Simulation,
                 Mississippi State Univ., MS, USA",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process.; Northeast Parallel Architectures Center; New
                 York State Center for Adv. Technol. Comput.
                 Applications and Software Eng. (CASE Center) at
                 Syracuse Univ.; Rome Lab",
  treatment =    "P Practical",
}

@Proceedings{IEEE:1996:PII,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of IPPS '96. The 10th International
                 Parallel Processing Symposium: Honolulu, HI, USA,
                 15--19 April 1996}",
  title =        "{Proceedings of IPPS '96. The 10th International
                 Parallel Processing Symposium: Honolulu, HI, USA,
                 15--19 April 1996}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xxviii + 903",
  year =         "1996",
  ISBN =         "0-8186-7255-2",
  ISBN-13 =      "978-0-8186-7255-2",
  LCCN =         "QA76.58 .I565 1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 96TB100038. IEEE Computer Society
                 Press order number PR07255.",
  acknowledgement = ack-nhfb,
  conflocation = "Honolulu, HI, USA; 15-19 April 1996",
  conftitle =    "Proceedings of International Conference on Parallel
                 Processing",
  corpsource =   "Mississippi State Univ., MS, USA; Inst. fur Inf.,
                 Tech. Univ. Munchen, Germany",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Parallel
                 Process.; ACM SIGARCH",
  treatment =    "P Practical",
}

@Proceedings{IEEE:1996:PIS,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of 1996 IEEE Second International
                 Conference on Algorithms and Architectures for Parallel
                 Processing, ICA$^3$PP '96: June 11--13, 1996,
                 Singapore}",
  title =        "{Proceedings of 1996 IEEE Second International
                 Conference on Algorithms and Architectures for Parallel
                 Processing, ICA$^3$PP '96: June 11--13, 1996,
                 Singapore}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xix + 547",
  year =         "1996",
  ISBN =         "0-7803-3529-5 (softbound), 0-7803-3530-9
                 (microfiche)",
  ISBN-13 =      "978-0-7803-3529-5 (softbound), 978-0-7803-3530-1
                 (microfiche)",
  LCCN =         "QA76.58.I33 1996",
  bibdate =      "Wed Apr 16 07:34:31 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 96TH8204.",
  acknowledgement = ack-nhfb,
  keywords =     "electronic digital computers -- programming --
                 congresses; multiprocessors -- programming --
                 congresses; parallel processing (electronic computers)
                 -- congresses",
}

@Proceedings{IEEE:1996:PSI,
  editor =       {{IEEE}},
  booktitle =    {{Proceedings of the Seventh Israeli Conference on
                 Computer Systems and Software Engineering: June 12--13,
                 1996, Herzliya, Israel}},
  title =        {{Proceedings of the Seventh Israeli Conference on
                 Computer Systems and Software Engineering: June 12--13,
                 1996, Herzliya, Israel}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {viii + 151},
  year =         {1996},
  ISBN =         {0-8186-7536-5},
  ISBN-13 =      {978-0-8186-7536-2},
  LCCN =         {QA75.5 .I75 1996},
  bibdate =      {Wed Apr 16 07:34:31 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE Computer Society Press Order Number PR07536.},
  acknowledgement = ack-nhfb,
  keywords =     {software engineering -- Israel -- congresses; system
                 design -- congresses},
}

@Proceedings{IEEE:1996:PSM,
  editor =       {{IEEE}},
  booktitle =    {{Proceedings. Second MPI Developer's Conference: Notre
                 Dame, IN, USA, 1--2 July 1996}},
  title =        {{Proceedings. Second MPI Developer's Conference: Notre
                 Dame, IN, USA, 1--2 July 1996}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {ix + 207},
  year =         {1996},
  ISBN =         {0-8186-7533-0},
  ISBN-13 =      {978-0-8186-7533-1},
  LCCN =         {QA76.642 .M67 1996},
  bibdate =      {Tue May 12 08:56:04 1998},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
  sponsororg =   {IEEE Comput. Soc. Tech. Committee on Distributed
                 Process},
}

@Proceedings{Jacoby:1996:ADA,
  editor =       "George H. Jacoby and Jeannette V. Barnes",
  booktitle =    "{Astronomical data analysis software and systems V:
                 meeting held at Tucson, Arizona, 23--25 October 1995}",
  title =        "{Astronomical data analysis software and systems V:
                 meeting held at Tucson, Arizona, 23--25 October 1995}",
  volume =       "101",
  publisher =    "Astronomical Society of the Pacific",
  address =      "San Francisco, CA, USA",
  pages =        "xxxvii + 607",
  year =         "1996",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "1080-7926",
  LCCN =         "QB51.3.E43 A87 1995",
  bibdate =      "Wed Apr 16 14:14:55 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Astronomical Society of the Pacific Conference
                 Series",
  acknowledgement = ack-nhfb,
}

@Proceedings{Jain:1996:IOP,
  editor =       {Ravi Jain and John Werth and James C. Browne},
  booktitle =    {{Input\slash output and parallel and distributed
                 computer systems}},
  title =        {{Input\slash output and parallel and distributed
                 computer systems}},
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        {xiv + 395},
  year =         {1996},
  ISBN =         {0-7923-9735-5},
  ISBN-13 =      {978-0-7923-9735-9},
  LCCN =         {QA76.58.I485 1996},
  bibdate =      {Mon Apr 21 11:26:01 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

@Proceedings{Li:1996:PSI,
  editor =       {G.-J. Li and D. F. Hsu and S. Horiguchi and B. Maggs},
  booktitle =    {{Proceedings. Second International Symposium on
                 Parallel Architectures, Algorithms, and Networks
                 (I-SPAN '96): June 12--14, 1996, Beijing, China}},
  title =        {{Proceedings. Second International Symposium on
                 Parallel Architectures, Algorithms, and Networks
                 (I-SPAN '96): June 12--14, 1996, Beijing, China}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        {xiii + 567},
  year =         {1996},
  ISBN =         {0-8186-7460-1},
  ISBN-13 =      {978-0-8186-7460-0},
  LCCN =         {QA76.58.I5673 1996},
  bibdate =      {Sat Apr 19 16:34:54 MDT 1997},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  note =         {IEEE catalog number 96TB100044.},
  acknowledgement = ack-nhfb,
  corpsource =   {Dept. of Comput. Sci., Australian Nat. Univ.,
                 Canberra, ACT, Australia},
  sponsororg =   {Chinese Nat. Res. Center for Intelligent Comput.
                 Syst.; IEEE Comput. Soc.; IEEE Comput. Soc. Tech.
                 Committee on Parallel Process.; Steering Committee of
                 the Chinese Nat. Hi-Tech Programme; Inf. Process. Soc.
                 Japan; Chinese Comput. Federation; IEICE Inf. and Syst.
                 Soc},
  treatment =    {P Practical},
}

@Proceedings{Li:1996:SIS,
  editor =       "Guo-Jie Li",
  booktitle =    "{Second International Symposium on Parallel
                 Architectures, Algorithms, and Networks (I-SPAN '96):
                 proceedings, June 12--14, 1996, Beijing, China}",
  title =        "{Second International Symposium on Parallel
                 Architectures, Algorithms, and Networks (I-SPAN '96):
                 proceedings, June 12--14, 1996, Beijing, China}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xi + 567",
  year =         "1996",
  ISBN =         "0-8186-7460-1",
  ISBN-13 =      "978-0-8186-7460-0",
  LCCN =         "QA76.58.I565 1996",
  bibdate =      "Sat Oct 21 15:20:00 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 94TH0697-3.",
  acknowledgement = ack-nhfb,
  keywords =     "computer algorithms -- congresses; computer
                 architecture -- congresses; computer networks --
                 congresses; parallel processing (electronic computers)
                 -- congresses",
}

%%% Whole-proceedings record for HPCN Europe 1996 (Brussels, April 15--19,
%%% 1996), Lecture Notes in Computer Science volume 1067.  The conference
%%% year in the title is 1996, in agreement with the year and conftitle
%%% fields.
@Proceedings{Liddell:1996:HPC,
  editor =       "Heather Mary Liddell and A. Colbrook and B.
                 Hertzberger and P. Sloot",
  booktitle =    "{High-performance computing and networking:
                 international conference and exhibition, HPCN EUROPE
                 1996, Brussels, Belgium, April 15--19, 1996:
                 proceedings}",
  title =        "{High-performance computing and networking:
                 international conference and exhibition, HPCN EUROPE
                 1996, Brussels, Belgium, April 15--19, 1996:
                 proceedings}",
  volume =       "1067",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xxv + 1040",
  year =         "1996",
  ISBN =         "3-540-61142-8 (paperback)",
  ISBN-13 =      "978-3-540-61142-4 (paperback)",
  LCCN =         "QA76.88 .H52 1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  conflocation = "Brussels, Belgium; 15-19 April 1996",
  conftitle =    "High-Performance Computing and Networking.
                 International Conference and Exhibition HPCN Europe
                 1996",
  corpsource =   "Zentrum fur Paralleles Rechnen, Koln Univ., Germany;
                 German Nat. Res. Center for Inf. Technol., St.
                 Augustin, Germany; Dept. of Electron. and Comput. Sci.,
                 Southampton Univ., UK; Dept. of Inf., Basel Univ.,
                 Switzerland",
  keywords =     "computer networks -- congresses; supercomputers --
                 congresses",
  pubcountry =   "Germany",
  treatment =    "T Theoretical or Mathematical; P Practical",
}

%%% Three-volume IEEE proceedings (1996) edited by A. Reeves; the conftitle
%%% field identifies the meeting as the 25th International Conference on
%%% Parallel Processing, and xxeditor keeps an alternate editor name.
@Proceedings{Reeves:1996:PIC,
  title =        "{Proceedings of the 1996 International Conference on
                 Challenges for Parallel Processing, Ithaca, NY, USA,
                 August 12, 1996}",
  booktitle =    "{Proceedings of the 1996 International Conference on
                 Challenges for Parallel Processing, Ithaca, NY, USA,
                 August 12, 1996}",
  editor =       "A. Reeves",
  volume =       "1",
  year =         "1996",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xvi + 278 (vol. 1), xv + 173 (vol. 2), 230 (vol. 3)",
  ISBN =         "0-8186-7623-X",
  ISBN-13 =      "978-0-8186-7623-9",
  LCCN =         "QA76.58 .I34 1996",
  note =         "Three volumes.",
  conftitle =    "Proceedings of 25th International Conference on
                 Parallel Processing",
  corpsource =   "Comput. Sci. Div., Berkeley Univ., CA, USA",
  sponsororg =   "Int. Assoc. Comput. and Commun.; Pennsylvania State
                 Univ",
  treatment =    "P Practical",
  xxeditor =     "Howard Jay Segal",
  acknowledgement = ack-nhfb,
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for Electrosoft '96 (Pisa, Italy), published
%%% by Computational Mechanics Publications in 1996.
@Proceedings{Silvester:1996:SEE,
  title =        "{Software for electrical engineering analysis and
                 design: Third International Conference on Software for
                 Electrical Engineering Analysis and Design, Electrosoft
                 '96, Pisa, Italy}",
  booktitle =    "{Software for electrical engineering analysis and
                 design: Third International Conference on Software for
                 Electrical Engineering Analysis and Design, Electrosoft
                 '96, Pisa, Italy}",
  editor =       "P. P. Silvester",
  year =         "1996",
  publisher =    "Computational Mechanics Publications",
  address =      "Boston, MA, USA",
  pages =        "509",
  ISBN =         "1-85312-395-1",
  ISBN-13 =      "978-1-85312-395-5",
  LCCN =         "TK5.I59 1996",
  keywords =     "electric engineering -- computer programs --
                 congresses",
  acknowledgement = ack-nhfb,
  bibdate =      "Wed Apr 16 07:34:31 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Kluwer proceedings volume; the title dates the meeting to 22--24 May
%%% 1995 (Troy, NY), while the publication year recorded here is 1996.
@Proceedings{Szymanski:1996:LCR,
  title =        "{Languages, Compilers and Run-Time Systems for
                 Scalable Computers, 22--24 May 1995, Troy, NY, USA}",
  booktitle =    "{Languages, Compilers and Run-Time Systems for
                 Scalable Computers, 22--24 May 1995, Troy, NY, USA}",
  editor =       "Boleslaw K. Szymanski and Balaram Sinharoy",
  year =         "1996",
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        "xiv + 335",
  ISBN =         "0-7923-9635-9",
  ISBN-13 =      "978-0-7923-9635-2",
  LCCN =         "QA76.58.L37 1996",
  acknowledgement = ack-nhfb,
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for the Second International
%%% Eurospace-Ada-Europe Symposium (Frankfurt/Main, October 2--6, 1995).
%%% The series number is stored in the volume field, matching the other
%%% ser-LNCS records in this file.
@Proceedings{Toussaint:1996:AES,
  editor =       "Marcel Toussaint",
  booktitle =    "{Ada in Europe: Second International
                 Eurospace-Ada-Europe Symposium, Frankfurt\slash Main,
                 Germany, October 2--6, 1995: proceedings}",
  title =        "{Ada in Europe: Second International
                 Eurospace-Ada-Europe Symposium, Frankfurt\slash Main,
                 Germany, October 2--6, 1995: proceedings}",
  volume =       "1031",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xi + 455",
  year =         "1996",
  ISBN =         "3-540-60757-9",
  ISBN-13 =      "978-3-540-60757-1",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.73.A35I57 1995",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  sponsor =      "Eurospace Ada-Europe.",
}

%%% Whole-proceedings record for PARA '96 (Lyngby, Denmark, August 18--21,
%%% 1996), series volume 1184.  The series name comes from the shared
%%% ser-LNCS macro so that it is spelled identically across the file.
@Proceedings{Wasniewski:1996:APC,
  editor =       "Jerzy Wasniewski",
  booktitle =    "{Applied parallel computing: industrial computation
                 and optimization: Third International Workshop, PARA
                 '96, Lyngby, Denmark, August 18--21, 1996:
                 proceedings}",
  title =        "{Applied parallel computing: industrial computation
                 and optimization: Third International Workshop, PARA
                 '96, Lyngby, Denmark, August 18--21, 1996:
                 proceedings}",
  volume =       "1184",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xiii + 722",
  year =         "1996",
  ISBN =         "3-540-62095-8",
  ISBN-13 =      "978-3-540-62095-2",
  LCCN =         "QA76.58 .P35 1996",
  bibdate =      "Wed Apr 16 07:34:31 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  keywords =     "parallel processing (electronic computers) --
                 congresses",
}

%%% Whole-proceedings record for PDCS '96 (Dijon, France, 25--27 September
%%% 1996).  Pages, ISBN, ISBN-13, and LCCN are still unknown and carry the
%%% "????" placeholder.
@Proceedings{Yetongnon:1996:PII,
  title =        "{Proceedings of the ISCA International Conference.
                 Parallel and Distributed Computing Systems: Dijon,
                 France, 25--27 September 1996 (PDCS '96: 9th)}",
  booktitle =    "{Proceedings of the ISCA International Conference.
                 Parallel and Distributed Computing Systems: Dijon,
                 France, 25--27 September 1996 (PDCS '96: 9th)}",
  editor =       "K. Yetongnon and S. Hariri",
  year =         "1996",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  acknowledgement = ack-nhfb,
  bibdate =      "Wed Apr 16 14:20:56 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Kluwer volume edited by Zaky and Lewis (1996).  Unlike most records in
%%% this file, booktitle and title intentionally differ: booktitle holds
%%% the short book title, title the longer session description.  The
%%% booktitle value is brace-protected like the other title-like values
%%% here, so that styles cannot recase it.
@Proceedings{Zaky:1996:PDT,
  editor =       "Amr Zaky and Ted Lewis",
  booktitle =    "{Tools and environments for parallel and distributed
                 systems}",
  title =        "{Program development tools and environments for
                 parallel and distributed systems: Session; 28th Hawaii
                 international conference on system sciences --- 1995}",
  volume =       "2",
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        "viii + 305",
  year =         "1996",
  ISBN =         "0-7923-9675-8",
  ISBN-13 =      "978-0-7923-9675-8",
  LCCN =         "QA76.58.T65 1996",
  bibdate =      "Wed Aug 14 09:02:28 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "Kluwer International Series in Software Engineering",
  acknowledgement = ack-nhfb,
}

%%% Whole-proceedings record for PASCO '97 (Maui, HI, July 20--22, 1997).
%%% The corporate editor is a stand-in; see xxnote.  Pages, ISBN, ISBN-13,
%%% and LCCN are unknown ("????").
@Proceedings{ACM:1997:PPS,
  title =        "{PASCO '97. Proceedings of the second international
                 symposium on parallel symbolic computation, July
                 20--22, 1997, Maui, HI}",
  booktitle =    "{PASCO '97. Proceedings of the second international
                 symposium on parallel symbolic computation, July
                 20--22, 1997, Maui, HI}",
  editor =       "{ACM}",
  year =         "1997",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "????",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  xxnote =       "Check editor; proceedings not yet in LC, UC-Melvyl, or
                 OCLC??",
  acknowledgement = ack-nhfb,
  bibdate =      "Thu Mar 12 07:30:53 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for SC'97 (San Jose, CA, November 15--21,
%%% 1997), published jointly by ACM and IEEE.  The ISBN is flagged as
%%% doubtful in xxnote.
@Proceedings{ACM:1997:SHP,
  title =        "{SC'97: High Performance Networking and Computing:
                 Proceedings of the 1997 ACM\slash IEEE SC97 Conference:
                 November 15--21, 1997, San Jose, California, USA}",
  booktitle =    "{SC'97: High Performance Networking and Computing:
                 Proceedings of the 1997 ACM\slash IEEE SC97 Conference:
                 November 15--21, 1997, San Jose, California, USA}",
  editor =       "{ACM}",
  year =         "1997",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "vii + 159",
  ISBN =         "0-89791-985-8",
  ISBN-13 =      "978-0-89791-985-2",
  LCCN =         "QA76.9.A25 A265 1997",
  URL =          "http://www.acm.org/pubs/contents/proceedings/commsec/266741/;
                 http://www.supercomp.org/sc97/proceedings/",
  note =         "ACM SIGARCH order number 415972. IEEE Computer Society
                 Press order number RS00160.",
  xxnote =       "Check ISBN: UC/Melvyl has this one for ``Proceedings /
                 Second ACM Workshop on Role-Based Access Control,
                 Fairfax, Virginia, USA, November 6--7, 1997''.",
  acknowledgement = ack-nhfb,
  bibdate =      "Sat Mar 21 09:10:00 1998",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for the IFIP TC2/WG2.5 working conference on
%%% the quality of numerical software (Oxford, 8--12 July 1996), published
%%% in 1997.
@Proceedings{Boisvert:1997:QNS,
  title =        "{Quality of numerical software: assessment and
                 enhancement / proceedings of the IFIP TC2/WG2.5 Working
                 Conference on the Quality of Numerical Software,
                 Assessment and Enhancement, Oxford, United Kingdom,
                 8--12 July 1996}",
  booktitle =    "{Quality of numerical software: assessment and
                 enhancement / proceedings of the IFIP TC2/WG2.5 Working
                 Conference on the Quality of Numerical Software,
                 Assessment and Enhancement, Oxford, United Kingdom,
                 8--12 July 1996}",
  editor =       "R. F. Boisvert",
  year =         "1997",
  publisher =    pub-CHAPMAN-HALL,
  address =      pub-CHAPMAN-HALL:adr,
  pages =        "vii + 384",
  ISBN =         "0-412-80530-8",
  ISBN-13 =      "978-0-412-80530-1",
  LCCN =         "QA297 .I35 1996",
  sponsor =      "IFIP; Technical Committee 2/Working Group 2.5.",
  acknowledgement = ack-nhfb,
  bibdate =      "Thu Sep 16 09:48:36 MDT 1999",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for the 4th European PVM/MPI users' group
%%% meeting (Cracow, Poland, November 3--5, 1997), ser-LNCS volume 1332.
@Proceedings{Bubak:1997:RAP,
  title =        "{Recent advances in parallel virtual machine and
                 message passing interface: 4th European PVM\slash MPI
                 user's group meeting Cracow, Poland, November 3--5,
                 1997: proceedings}",
  booktitle =    "{Recent advances in parallel virtual machine and
                 message passing interface: 4th European PVM\slash MPI
                 user's group meeting Cracow, Poland, November 3--5,
                 1997: proceedings}",
  editor =       "Marian Bubak and J. J. Dongarra and Jerzy Wasniewski",
  series =       ser-LNCS,
  volume =       "1332",
  year =         "1997",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xv + 518",
  CODEN =        "LNCSD9",
  ISBN =         "3-540-63697-8 (paperback)",
  ISBN-13 =      "978-3-540-63697-7 (paperback)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58.E973 1997",
  keywords =     "Computer networks -- Congresses.; Parallel computers
                 -- Congresses.",
  acknowledgement = ack-nhfb,
  bibdate =      "Mon Nov 24 09:49:54 MST 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for "Advances in parallel and distributed
%%% computing" (Shanghai, March 19--21, 1997); the ISBN fields list both
%%% the paperback/case and the microfiche editions.
@Proceedings{IEEE:1997:APD,
  title =        "{Advances in parallel and distributed computing: March
                 19--21, 1997, Shanghai, China: proceedings}",
  booktitle =    "{Advances in parallel and distributed computing: March
                 19--21, 1997, Shanghai, China: proceedings}",
  editor =       "{IEEE}",
  year =         "1997",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 426",
  ISBN =         "0-8186-7876-3 (paperback and case), 0-8186-7878-X
                 (microfiche)",
  ISBN-13 =      "978-0-8186-7876-9 (paperback and case),
                 978-0-8186-7878-3 (microfiche)",
  LCCN =         "QA76.58 .A4 1997",
  keywords =     "electronic data processing -- distributed processing
                 -- congresses; parallel processing (electronic
                 computers) -- congresses",
  acknowledgement = ack-nhfb,
  bibdate =      "Wed Apr 16 07:34:31 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for the 11th International Parallel
%%% Processing Symposium (Geneva, Switzerland, April 1--5, 1997).
%%% NOTE(review): sponsororg now reads "SPEEDUP Soc" on the assumption that
%%% the earlier "SPPEDUP" was a typo for the Swiss SPEEDUP Society --
%%% confirm against the proceedings front matter.
@Proceedings{IEEE:1997:PIP,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings. 11th International Parallel Processing
                 Symposium, April 1--5, 1997, Geneva, Switzerland}",
  title =        "{Proceedings. 11th International Parallel Processing
                 Symposium, April 1--5, 1997, Geneva, Switzerland}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xxi + 765",
  year =         "1997",
  ISBN =         "0-8186-7793-7",
  ISBN-13 =      "978-0-8186-7793-9",
  LCCN =         "QA76.58 .I56 1997",
  bibdate =      "Thu May 21 19:02:04 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 97TB100107. IEEE Computer Society
                 Press order number PR07792.",
  acknowledgement = ack-nhfb,
  conftitle =    "Proceedings 11th International Parallel Processing
                 Symposium",
  corpsource =   "Dept. of Comput. Sci., Utah Univ., Salt Lake City, UT,
                 USA",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Parallel
                 Process.; ACM SIGARCH; Eur. Assoc. Theor. Comput. Sci.
                 (EATCS); Swiss Special Interest Group on Parallelism
                 (SIPAR); SPEEDUP Soc",
  treatment =    "P Practical",
}

%%% Whole-proceedings record for the Third International Symposium on
%%% High-Performance Computer Architecture (San Antonio, TX, February
%%% 1--5, 1997).  The sponsor abbreviation follows the "IEEE Comput. Soc."
%%% form used by the other sponsororg fields in this file.
@Proceedings{IEEE:1997:TIS,
  editor =       "{IEEE}",
  booktitle =    "{Third International Symposium on High-Performance
                 Computer Architecture: proceedings, February 1--5,
                 1997, San Antonio, Texas}",
  title =        "{Third International Symposium on High-Performance
                 Computer Architecture: proceedings, February 1--5,
                 1997, San Antonio, Texas}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xi + 353",
  year =         "1997",
  ISBN =         "0-8186-7764-3",
  ISBN-13 =      "978-0-8186-7764-9",
  LCCN =         "QA76.9.A73I566 1997",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE catalog number 97TB100094.",
  acknowledgement = ack-nhfb,
  corpsource =   "Hong Kong Univ., Hong Kong",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Comput.
                 Archit",
  treatment =    "P Practical",
}

%%% Record for the ACM 1998 Workshop on Java for High-Performance Network
%%% Computing; per the note field it may exist only electronically, so
%%% pages, ISBN, ISBN-13, and LCCN are "????".
@Proceedings{ACM:1998:AWJ,
  title =        "{ACM 1998 Workshop on Java for High-Performance
                 Network Computing}",
  booktitle =    "{ACM 1998 Workshop on Java for High-Performance
                 Network Computing}",
  editor =       "{ACM}",
  year =         "1998",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "????",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  URL =          "http://www.cs.ucsb.edu/conferences/java98/program.html",
  note =         "Possibly unpublished, except electronically.",
  acknowledgement = ack-nhfb,
  bibdate =      "Thu Apr 27 10:40:59 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for SC'98 (Orlando, FL, November 7--13,
%%% 1998), ACM and IEEE jointly; pages, ISBN, ISBN-13, and LCCN are
%%% unknown ("????").
@Proceedings{ACM:1998:SHP,
  title =        "{SC'98: High Performance Networking and Computing:
                 Proceedings of the 1998 ACM\slash IEEE SC98 Conference:
                 Orange County Convention Center, Orlando, Florida, USA,
                 November 7--13, 1998}",
  booktitle =    "{SC'98: High Performance Networking and Computing:
                 Proceedings of the 1998 ACM\slash IEEE SC98 Conference:
                 Orange County Convention Center, Orlando, Florida, USA,
                 November 7--13, 1998}",
  editor =       "{ACM}",
  year =         "1998",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  URL =          "http://www.supercomp.org/sc98/papers/",
  acknowledgement = ack-nhfb,
  bibdate =      "Wed Oct 07 08:51:34 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for the 5th European PVM/MPI User's Group
%%% Meeting (Liverpool, UK, September 7--9, 1998), ser-LNCS volume 1497.
@Proceedings{Alexandrov:1998:RAP,
  title =        "{Recent advances in parallel virtual machine and
                 message passing interface: 5th European PVM\slash MPI
                 User's Group Meeting, Liverpool, UK, September 7--9,
                 1998: proceedings}",
  booktitle =    "{Recent advances in parallel virtual machine and
                 message passing interface: 5th European PVM\slash MPI
                 User's Group Meeting, Liverpool, UK, September 7--9,
                 1998: proceedings}",
  editor =       "Vassil Alexandrov and J. J. Dongarra",
  series =       ser-LNCS,
  volume =       "1497",
  year =         "1998",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xii + 412",
  ISBN =         "3-540-65041-5 (softcover)",
  ISBN-13 =      "978-3-540-65041-6 (softcover)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA267.A1 L43 no.1497",
  note =         "Jointly sponsored by the Computer Science Dept.,
                 University of Liverpool and Oak Ridge National
                 Laboratory.",
  keywords =     "data transmission systems -- congresses; parallel
                 computers -- congresses; virtual computer systems --
                 congresses",
  acknowledgement = ack-nhfb,
  bibdate =      "Mon May 3 11:00:13 MDT 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for SC'99 (Portland, OR, November 11--18,
%%% 1999), ACM and IEEE jointly.  ISBN, ISBN-13, and LCCN are not yet
%%% known; they carry the same "????" placeholder as pages, which is how
%%% unknown values are marked elsewhere in this file.
@Proceedings{ACM:1999:SPO,
  editor =       "{ACM}",
  booktitle =    "{SC'99: Oregon Convention Center 777 NE Martin Luther
                 King Jr. Boulevard, Portland, Oregon, November 11--18,
                 1999}",
  title =        "{SC'99: Oregon Convention Center 777 NE Martin Luther
                 King Jr. Boulevard, Portland, Oregon, November 11--18,
                 1999}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "1999",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu Feb 24 09:35:00 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% Whole-proceedings record for the 6th European PVM/MPI Users' Group
%%% Meeting (Barcelona, Spain, September 26--29, 1999), ser-LNCS volume
%%% 1697; alttitle gives the short name "PVM/MPI '99".
@Proceedings{Dongarra:1999:RAP,
  title =        "{Recent advances in parallel virtual machine and
                 message passing interface: 6th European PVM\slash MPI
                 Users' Group Meeting, Barcelona, Spain, September
                 26--29, 1999: proceedings}",
  booktitle =    "{Recent advances in parallel virtual machine and
                 message passing interface: 6th European PVM\slash MPI
                 Users' Group Meeting, Barcelona, Spain, September
                 26--29, 1999: proceedings}",
  editor =       "J. J. Dongarra and E. Luque and Tomas Margalef",
  series =       ser-LNCS,
  volume =       "1697",
  year =         "1999",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xvii + 551",
  ISBN =         "3-540-66549-8 (softcover)",
  ISBN-13 =      "978-3-540-66549-6 (softcover)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 E973 1999",
  alttitle =     "PVM/MPI '99",
  keywords =     "Data transmission systems; Parallel computers; Virtual
                 computer systems",
  acknowledgement = ack-nhfb,
  bibdate =      "Wed Dec 8 06:34:56 MST 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for SC2000 (Dallas, TX, November 4--10,
%%% 2000), ACM and IEEE jointly.  ISBN, ISBN-13, and LCCN are not yet
%%% known; they carry the same "????" placeholder as pages, which is how
%%% unknown values are marked elsewhere in this file.
@Proceedings{ACM:2000:SHP,
  editor =       "{ACM}",
  booktitle =    "{SC2000: High Performance Networking and Computing.
                 Dallas Convention Center, Dallas, TX, USA, November
                 4--10, 2000}",
  title =        "{SC2000: High Performance Networking and Computing.
                 Dallas Convention Center, Dallas, TX, USA, November
                 4--10, 2000}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2000",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu Feb 24 09:35:00 2000",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sc2000.org/proceedings/info/fp.pdf",
  acknowledgement = ack-nhfb,
}

%%% Whole-proceedings record for the 7th European PVM/MPI Users' Group
%%% Meeting (September 10--13, 2000), ser-LNCS volume 1908.  The Hungarian
%%% place name is written with its umlaut as a TeX accent so that it
%%% typesets correctly while the file stays ASCII.
@Proceedings{Dongarra:2000:RAP,
  editor =       "J. J. Dongarra and Peter Kacsuk and Norbert
                 Podhorszki",
  booktitle =    "{Recent advances in parallel virtual machine and
                 message passing interface: 7th European PVM\slash MPI
                 Users' Group Meeting, Balatonf{\"u}red, Hungary,
                 September 10--13, 2000: proceedings}",
  title =        "{Recent advances in parallel virtual machine and
                 message passing interface: 7th European PVM\slash MPI
                 Users' Group Meeting, Balatonf{\"u}red, Hungary,
                 September 10--13, 2000: proceedings}",
  volume =       "1908",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xv + 364",
  year =         "2000",
  ISBN =         "3-540-41010-4 (softcover)",
  ISBN-13 =      "978-3-540-41010-2 (softcover)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  bibdate =      "Mon Oct 16 18:31:56 MDT 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  keywords =     "data transmission systems -- congresses; parallel
                 computers -- congresses; virtual computer systems --
                 congresses",
}

%%% Whole-proceedings record for the seventh annual conference of
%%% Parallelldatorcentrum, KTH (Stockholm, December 1999), ser-LNCSE
%%% volume 13.  The two subject headings in keywords are separated by a
%%% semicolon, the delimiter used by the other keywords fields here.
@Proceedings{Engquist:2000:SVG,
  editor =       "Bj{\"o}rn Engquist",
  booktitle =    "{Simulation and visualization on the grid:
                 Parallelldatorcentrum, Kungl. Tekniska H{\"o}gskolan,
                 seventh annual conference, Stockholm, Sweden, December
                 1999: proceedings}",
  title =        "{Simulation and visualization on the grid:
                 Parallelldatorcentrum, Kungl. Tekniska H{\"o}gskolan,
                 seventh annual conference, Stockholm, Sweden, December
                 1999: proceedings}",
  volume =       "13",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xiii + 300",
  year =         "2000",
  ISBN =         "3-540-67264-8",
  ISBN-13 =      "978-3-540-67264-7",
  ISSN =         "1439-7358",
  LCCN =         "QA76.9.C65 S535 2000",
  bibdate =      "Wed Oct 18 10:32:22 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCSE,
  acknowledgement = ack-nhfb,
  keywords =     "Computer simulation -- Congresses; Visualization --
                 Congresses",
}

%%% Edited book (Morgan Kaufmann, 2000); booktitle duplicates title, as in
%%% the proceedings records of this file.
@Book{Koniges:2000:ISP,
  title =        "{Industrial Strength Parallel Computing}",
  booktitle =    "{Industrial Strength Parallel Computing}",
  editor =       "Alice E. Koniges",
  year =         "2000",
  publisher =    pub-MORGAN-KAUFMANN,
  address =      pub-MORGAN-KAUFMANN:adr,
  pages =        "xxv + 597",
  ISBN =         "1-55860-540-1",
  ISBN-13 =      "978-1-55860-540-4",
  LCCN =         "QA76.58 .I483 2000",
  acknowledgement = ack-nhfb,
  bibdate =      "Fri Feb 04 18:30:40 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for ICS '00 (Santa Fe, NM, May 8--11, 2000),
%%% ACM.  The page count is stored in both bookpages and pages, and the
%%% DOI is recorded as a full resolver URL.
@Proceedings{Reynders:2000:IPI,
  title =        "{ICS '00: Proceedings of the 14th international
                 conference on Supercomputing: Santa Fe, New Mexico,
                 USA, May 8--11, 2000}",
  booktitle =    "{ICS '00: Proceedings of the 14th international
                 conference on Supercomputing: Santa Fe, New Mexico,
                 USA, May 8--11, 2000}",
  editor =       "John Reynders and Alexander V. Veidenbaum",
  year =         "2000",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xi + 509",
  bookpages =    "xi + 509",
  DOI =          "https://doi.org/10.1145/335231",
  ISBN =         "1-58113-270-0",
  ISBN-13 =      "978-1-58113-270-0",
  LCCN =         "QA76.88 .I573 2000",
  URL =          "https://dl.acm.org/doi/proceedings/10.1145/335231",
  keywords =     "AS/400; ESA/390; IA-64; Java Virtual Machine (JVM);
                 RS/6000",
  acknowledgement = ack-nhfb,
  bibdate =      "Fri Jul 27 05:22:06 2001",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/supercomputing/335231/;
                 http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for the 4th Annual Linux Showcase and
%%% Conference (Atlanta, GA, October 10--14, 2000), USENIX; the LCCN is
%%% unknown ("????").
@Proceedings{USENIX:2000:PAL,
  title =        "{Proceedings of the 4th Annual Linux Showcase and
                 Conference, Atlanta, October 10--14, 2000, Atlanta,
                 Georgia, USA}",
  booktitle =    "{Proceedings of the 4th Annual Linux Showcase and
                 Conference, Atlanta, October 10--14, 2000, Atlanta,
                 Georgia, USA}",
  editor =       "{USENIX}",
  year =         "2000",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "394",
  ISBN =         "1-880446-17-0",
  ISBN-13 =      "978-1-880446-17-1",
  LCCN =         "????",
  URL =          "http://www.usenix.org/publications/library/proceedings/als2000/",
  acknowledgement = ack-nhfb,
  bibdate =      "Wed Oct 16 06:06:36 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for SC2001 (Denver, CO, November 10--16,
%%% 2001), ACM and IEEE jointly; pages and LCCN are unknown ("????").
@Proceedings{ACM:2001:SHP,
  title =        "{SC2001: High Performance Networking and Computing.
                 Denver, CO, November 10--16, 2001}",
  booktitle =    "{SC2001: High Performance Networking and Computing.
                 Denver, CO, November 10--16, 2001}",
  editor =       "{ACM}",
  year =         "2001",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  ISBN =         "1-58113-293-X",
  ISBN-13 =      "978-1-58113-293-9",
  LCCN =         "????",
  acknowledgement = ack-nhfb,
  bibdate =      "Thu Feb 21 18:29:36 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for the 8th European PVM/MPI Users' Group
%%% Meeting (Santorini/Thera, Greece, September 23--26, 2001), volume
%%% 2131.
%%% NOTE(review): series is now ser-LNCS alone, matching the other PVM/MPI
%%% meeting volumes in this file; the earlier value also concatenated
%%% ser-LNAI -- confirm against the publisher's catalogue.
@Proceedings{Cotronis:2001:RAP,
  editor =       "Yiannis Cotronis and J. J. Dongarra",
  booktitle =    "{Recent advances in parallel virtual machine and
                 message passing interface: 8th European PVM\slash MPI
                 Users' Group Meeting, Santorini\slash Thera, Greece,
                 September 23--26, 2001: proceedings}",
  title =        "{Recent advances in parallel virtual machine and
                 message passing interface: 8th European PVM\slash MPI
                 Users' Group Meeting, Santorini\slash Thera, Greece,
                 September 23--26, 2001: proceedings}",
  volume =       "2131",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xv + 438",
  year =         "2001",
  ISBN =         "3-540-42609-4 (paperback)",
  ISBN-13 =      "978-3-540-42609-7 (paperback)",
  LCCN =         "QA76.58 E975 2001; QA267.A1 L43 no.2131",
  bibdate =      "Thu Jan 17 11:49:19 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm",
  acknowledgement = ack-nhfb,
  keywords =     "data transmission systems -- congresses; parallel
                 computers -- congresses; virtual computer systems --
                 congresses",
}

%%% Whole-proceedings record for WOMPAT 2001 (West Lafayette, IN, July
%%% 30--31, 2001), ser-LNCS volume 2104; the DOI is still unknown
%%% ("????").
@Proceedings{Eigenmann:2001:OSM,
  title =        "{OpenMP shared memory parallel programming:
                 International Workshop on OpenMP Applications and
                 Tools, WOMPAT 2001, West Lafayette, IN, USA, July
                 30--31, 2001: Proceedings}",
  booktitle =    "{OpenMP shared memory parallel programming:
                 International Workshop on OpenMP Applications and
                 Tools, WOMPAT 2001, West Lafayette, IN, USA, July
                 30--31, 2001: Proceedings}",
  editor =       "Rudolf Eigenmann and Michael J. Voss",
  series =       ser-LNCS,
  volume =       "2104",
  year =         "2001",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "x + 184",
  CODEN =        "LNCSD9",
  DOI =          "????",
  ISBN =         "3-540-42346-X (paperback)",
  ISBN-13 =      "978-3-540-42346-1 (paperback)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.642 .I589 2001; QA267.A1 L43 no.2104",
  URL =          "http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm",
  keywords =     "parallel programming (computer science) --
                 congresses",
  acknowledgement = ack-nhfb,
  bibdate =      "Thu Jan 17 11:49:19 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

%%% Whole-proceedings record for SC2002 (Baltimore, MD, November 16--22,
%%% 2002), IEEE; pages and LCCN are unknown ("????").
@Proceedings{IEEE:2002:STI,
  title =        "{SC2002: From Terabytes to Insight. Proceedings of the
                 IEEE ACM SC 2002 Conference, November 16--22, 2002,
                 Baltimore, MD, USA}",
  booktitle =    "{SC2002: From Terabytes to Insight. Proceedings of the
                 IEEE ACM SC 2002 Conference, November 16--22, 2002,
                 Baltimore, MD, USA}",
  editor =       "{IEEE}",
  year =         "2002",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  ISBN =         "0-7695-1524-X",
  ISBN-13 =      "978-0-7695-1524-3",
  LCCN =         "????",
  acknowledgement = ack-nhfb,
  bibdate =      "Thu Feb 21 18:29:36 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
}

@Proceedings{Kranzlmuller:2002:RAP,
  editor =       "Dieter Kranzlm{\"u}ller",
  booktitle =    "{Recent advances in parallel virtual machine and
                 message passing interface: 9th European PVM\slash MPI
                 Users' Group Meeting, Linz, Austria, September
                 29--October 2, 2002: proceedings}",
  title =        "{Recent advances in parallel virtual machine and
                 message passing interface: 9th European PVM\slash MPI
                 Users' Group Meeting, Linz, Austria, September
                 29--October 2, 2002: proceedings}",
  volume =       "2474",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xvi + 462",
  year =         "2002",
  ISBN =         "3-540-44296-0 (softcover)",
  ISBN-13 =      "978-3-540-44296-7 (softcover)",
  LCCN =         "QA76.58 .E975 2002",
  bibdate =      "Sun Dec 1 08:06:09 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Also available via the World Wide Web",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  keywords =     "data transmission systems --- congresses; parallel
                 computers --- congresses; virtual computer systems ---
                 congresses",
}

@Proceedings{Oldehoeft:2002:SIS,
  editor =       "Rod Oldehoeft",
  booktitle =    "{Special issue on software for high-performance
                 systems: papers from the symposium of the Los Alamos
                 Computer Science Institute, held in Santa Fe, NM, USA
                 on October 15--18, 2001}",
  title =        "{Special issue on software for high-performance
                 systems: papers from the symposium of the Los Alamos
                 Computer Science Institute, held in Santa Fe, NM, USA
                 on October 15--18, 2001}",
  volume =       "23(1)",
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        "128",
  year =         "2002",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jan 14 07:13:03 2004",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "The journal of supercomputing",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2003:SII,
  editor =       "{ACM}",
  booktitle =    "{SC2003: Igniting Innovation. Phoenix, AZ, November
                 15--21, 2003}",
  title =        "{SC2003: Igniting Innovation. Phoenix, AZ, November
                 15--21, 2003}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2003",
  ISBN =         "1-58113-695-1",
  ISBN-13 =      "978-1-58113-695-1",
  LCCN =         "????",
  bibdate =      "Thu Feb 21 18:29:36 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Dongarra:2003:RAP,
  editor =       "Jack Dongarra and Domenico Laforenza and Salvatore
                 Orlando",
  booktitle =    "{Recent advances in parallel virtual machine and
                 message passing interface: 10th European PVM\slash MPI
                 User's Group Meeting, Venice, Italy, September
                 29--October 2, 2003: Proceedings}",
  title =        "{Recent advances in parallel virtual machine and
                 message passing interface: 10th European PVM\slash MPI
                 User's Group Meeting, Venice, Italy, September
                 29--October 2, 2003: Proceedings}",
  volume =       "2840",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xviii + 693",
  year =         "2003",
  CODEN =        "LNCSD9",
  ISBN =         "3-540-20149-1",
  ISBN-13 =      "978-3-540-20149-6",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 .E973 2003",
  bibdate =      "Tue Jan 13 19:17:43 2004",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://link.springer-ny.com/link/service/series/0558/tocs/t2840.htm",
  acknowledgement = ack-nhfb,
}

@Proceedings{Voss:2003:OSM,
  editor =       "Michael J. Voss",
  booktitle =    "{OpenMP shared memory parallel programming:
                 International Workshop on OpenMP Applications and
                 Tools, WOMPAT 2003, Toronto, Canada, June 26--27, 2003:
                 Proceedings}",
  title =        "{OpenMP shared memory parallel programming:
                 International Workshop on OpenMP Applications and
                 Tools, WOMPAT 2003, Toronto, Canada, June 26--27, 2003:
                 Proceedings}",
  volume =       "2716",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "viii + 270",
  year =         "2003",
  CODEN =        "LNCSD9",
  DOI =          "????",
  ISBN =         "3-540-40435-X (softcover)",
  ISBN-13 =      "978-3-540-40435-4 (softcover)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.642 .I589 2003",
  bibdate =      "Thu Aug 21 09:09:03 MDT 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://link.springer-ny.com/link/service/series/0558/tocs/t2716.htm;
                 http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=2716",
  acknowledgement = ack-nhfb,
  keywords =     "parallel programming (computer science) ---
                 congresses",
}

@Proceedings{ACM:2004:SHP,
  editor =       "{ACM}",
  booktitle =    "{SC 2004: High Performance Computing, Networking and
                 Storage: Bridging communities: Proceedings of the
                 IEEE\slash ACM Supercomputing 2004 Conference,
                 Pittsburgh, PA, November 6--12, 2004}",
  title =        "{SC 2004: High Performance Computing, Networking and
                 Storage: Bridging communities: Proceedings of the
                 IEEE\slash ACM Supercomputing 2004 Conference,
                 Pittsburgh, PA, November 6--12, 2004}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2004",
  ISBN =         "0-7695-2153-3",
  ISBN-13 =      "978-0-7695-2153-4",
  LCCN =         "????",
  bibdate =      "Tue Dec 27 08:08:01 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Kranzlmuller:2004:RAP,
  editor =       "Dieter Kranzlm{\"u}ller and P{\'e}ter Kacsuk and Jack
                 J. Dongarra",
  booktitle =    "{Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface: 11th European PVM/MPI Users'
                 Group Meeting, Budapest, Hungary, September 19--22,
                 2004: proceedings}",
  title =        "{Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface: 11th European PVM/MPI Users'
                 Group Meeting, Budapest, Hungary, September 19--22,
                 2004: proceedings}",
  volume =       "3241",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xiii + 452",
  year =         "2004",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/b100820",
  ISBN =         "3-540-23163-3",
  ISBN-13 =      "978-3-540-23163-9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 .E973 2004",
  bibdate =      "Sat Jun 4 05:55:05 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=3241;
                 http://www.springerlink.com/openurl.asp?genre=volume&id=doi:10.1007/b100820",
  acknowledgement = ack-nhfb,
  meetingname =  "European PVM/MPI Users' Group Meeting (11th: 2004:
                 Budapest, Hungary)",
  subject =      "Parallel computers; Congresses; Virtual computer
                 systems; Congresses; Data transmission systems;
                 Congresses",
}

@Proceedings{ACM:2005:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 2005 ACM\slash IEEE conference on
                 Supercomputing 2005, Seattle, WA, November 12--18,
                 2005}",
  title =        "{Proceedings of the 2005 ACM\slash IEEE conference on
                 Supercomputing 2005, Seattle, WA, November 12--18,
                 2005}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2005",
  ISBN =         "1-59593-061-2",
  ISBN-13 =      "978-1-59593-061-3",
  LCCN =         "????",
  bibdate =      "Tue Dec 27 08:08:01 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Beyer:2005:GEC,
  editor =       "Hans-Georg Beyer and others",
  booktitle =    "{Genetic and Evolutionary Computation Conference:
                 GECCO 2005, June 25--29, 2005 (Saturday--Wednesday)
                 Washington, DC, USA}",
  title =        "{Genetic and Evolutionary Computation Conference:
                 GECCO 2005, June 25--29, 2005 (Saturday--Wednesday)
                 Washington, DC, USA}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "????",
  year =         "2005",
  ISBN =         "1-59593-010-8 (paperback)",
  ISBN-13 =      "978-1-59593-010-1 (paperback)",
  LCCN =         "QA76.623 .G44 2005",
  bibdate =      "Tue Mar 6 06:24:38 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  note =         "ACM order number 910050.",
  acknowledgement = ack-nhfb,
  subject =      "Genetic algorithms; Data processing; Congresses;
                 Parallel processing (Electronic computers)",
}

@Proceedings{Chapman:2005:SMP,
  editor =       "Barbara M. Chapman",
  booktitle =    "{Shared memory parallel programming with OpenMP: 5th
                 International Workshop on OpenMP Applications and
                 Tools, WOMPAT 2004, Houston, TX, USA, May 17--18, 2004:
                 Revised selected papers}",
  title =        "{Shared memory parallel programming with OpenMP: 5th
                 International Workshop on OpenMP Applications and
                 Tools, WOMPAT 2004, Houston, TX, USA, May 17--18, 2004:
                 Revised selected papers}",
  volume =       "3349",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "x + 147",
  year =         "2005",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/b105895",
  ISBN =         "3-540-24560-X",
  ISBN-13 =      "978-3-540-24560-5",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76 .A1 L42 NO.3349",
  bibdate =      "Thu Jun 2 07:26:02 MDT 2005",
  bibsource =    "clavis.ucalgary.ca:2200/UNICORN;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=3349;
                 http://www.springerlink.com/openurl.asp?genre=volume&id=doi:10.1007/b105895",
  acknowledgement = ack-nhfb,
  meetingname =  "International Workshop on OpenMP Applications and
                 Tools (2004: Houston, Tex.)",
  subject =      "Parallel programming (Computer science); Congresses",
}

@Proceedings{DiMartino:2005:RAP,
  editor =       "Beniamino {Di Martino} and Dieter Kranzlm{\"u}ller and
                 J. J. Dongarra",
  booktitle =    "{Recent advances in parallel virtual machine and
                 message passing interface: 12th European PVM/MPI User's
                 Group Meeting, Sorrento, Italy, September 18--21, 2005:
                 proceedings}",
  title =        "{Recent advances in parallel virtual machine and
                 message passing interface: 12th European PVM/MPI User's
                 Group Meeting, Sorrento, Italy, September 18--21, 2005:
                 proceedings}",
  volume =       "3666",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xvii + 546",
  year =         "2005",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/11557265",
  ISBN =         "3-540-29009-5 (paperback)",
  ISBN-13 =      "978-3-540-29009-4 (paperback)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 .E973 2005",
  bibdate =      "Wed Apr 5 19:31:25 MDT 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 z3950.loc.gov:7090/Voyager",
  series =       ser-LNCS,
  URL =          "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0302-9743&volume=3666",
  acknowledgement = ack-nhfb,
  meetingname =  "European PVM/MPI Users' Group Meeting (12th: 2005:
                 Sorrento, Italy)",
  subject =      "Parallel computers; Congresses; Virtual computer
                 systems; Data transmission systems",
}

@Proceedings{IEEE:2005:IPD,
  editor =       "{IEEE}",
  booktitle =    "{19th International Parallel and Distributed
                 Processing Symposium: proceedings: April 4--8, 2005,
                 Denver, Colorado}",
  title =        "{19th International Parallel and Distributed
                 Processing Symposium: proceedings: April 4--8, 2005,
                 Denver, Colorado}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "lv + 311",
  year =         "2005",
  ISBN =         "0-7695-2312-9",
  ISBN-13 =      "978-0-7695-2312-5",
  LCCN =         "????",
  bibdate =      "Fri May 27 14:11:22 2005",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "IEEE Computer Society Order Number P2312.",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2006:PCC,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 3rd conference on Computing
                 Frontiers, May 3--5, 2006, Ischia, Italy}",
  title =        "{Proceedings of the 3rd conference on Computing
                 Frontiers, May 3--5, 2006, Ischia, Italy}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  year =         "2006",
  ISBN =         "1-59593-302-6",
  ISBN-13 =      "978-1-59593-302-7",
  LCCN =         "????",
  bibdate =      "Tue Jun 20 06:45:04 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "ACM order number 104060.",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2006:PST,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 37th SIGCSE technical symposium on
                 Computer science education 2006, Houston, Texas, USA,
                 March 03--05, 2006}",
  title =        "{Proceedings of the 37th SIGCSE technical symposium on
                 Computer science education 2006, Houston, Texas, USA,
                 March 03--05, 2006}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "????",
  year =         "2006",
  ISBN =         "1-59593-259-3",
  ISBN-13 =      "978-1-59593-259-4",
  LCCN =         "????",
  bibdate =      "Tue Jun 20 06:53:22 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "ACM order number 457060.",
  acknowledgement = ack-nhfb,
}

@Proceedings{Mohr:2006:RAP,
  editor =       "Bernd Mohr and Jesper Larsson Tr{\"a}ff and Joachim
                 Worringen and Jack Dongarra",
  booktitle =    "{Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface: 13th European PVM\slash MPI
                 User's Group Meeting, Bonn, Germany, September 17--20,
                 2006. Proceedings}",
  title =        "{Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface: 13th European PVM\slash MPI
                 User's Group Meeting, Bonn, Germany, September 17--20,
                 2006. Proceedings}",
  volume =       "4192",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "104 (est.)",
  year =         "2006",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/11846802",
  ISBN =         "3-540-39110-X (print), 3-540-39112-6 (e-book)",
  ISBN-13 =      "978-3-540-39110-4 (print), 978-3-540-39112-8
                 (e-book)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  LCCN =         "????",
  bibdate =      "Wed Dec 19 15:21:40 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/content/978-3-540-39112-8",
  acknowledgement = ack-nhfb,
}

@Proceedings{Cappello:2007:RAP,
  editor =       "Franck Cappello and Thomas Herault and Jack Dongarra",
  booktitle =    "{Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface: 14th European PVM\slash MPI
                 User's Group Meeting, Paris, France, September
                 30--October 3, 2007. Proceedings}",
  title =        "{Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface: 14th European PVM\slash MPI
                 User's Group Meeting, Paris, France, September
                 30--October 3, 2007. Proceedings}",
  volume =       "4757",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "116 (est.)",
  year =         "2007",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-540-75416-9",
  ISBN =         "3-540-75415-6 (print), 3-540-75416-4 (e-book)",
  ISBN-13 =      "978-3-540-75415-2 (print), 978-3-540-75416-9
                 (e-book)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  LCCN =         "????",
  bibdate =      "Wed Dec 19 15:25:09 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/content/978-3-540-75416-9",
  acknowledgement = ack-nhfb,
}

@Proceedings{Simos:2007:CMS,
  editor =       "Theodore E. Simos and George Maroulis",
  booktitle =    "{Computation in Modern Science and Engineering:
                 Proceedings of the [Fifth] International Conference on
                 Computational Methods in Science and Engineering 2007
                 (ICCMSE 2007), Corfu, Greece, 25--30 September 2007}",
  title =        "{Computation in Modern Science and Engineering:
                 Proceedings of the [Fifth] International Conference on
                 Computational Methods in Science and Engineering 2007
                 (ICCMSE 2007), Corfu, Greece, 25--30 September 2007}",
  volume =       "2A, 2B",
  publisher =    pub-AIP,
  address =      pub-AIP:adr,
  bookpages =    "xxvi + 730 + 10 (vol. 2A)",
  pages =        "xxvi + 730 + 10 (vol. 2A)",
  year =         "2007",
  ISBN =         "0-7354-0476-3 (set), 0-7354-0477-1 (vol. 1),
                 0-7354-0478-X (vol. 2)",
  ISBN-13 =      "978-0-7354-0476-2 (set), 978-0-7354-0477-9 (vol. 1),
                 978-0-7354-0478-6 (vol. 2)",
  ISSN =         "0094-243X (print), 1551-7616 (electronic), 1935-0465",
  LCCN =         "Q183.9 .I524 2007",
  bibdate =      "Thu Feb 21 14:15:15 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       "AIP Conference Proceedings (\#963)",
  URL =          "http://www.springer.com/physics/atoms/book/978-0-7354-0478-6",
  abstract =     "All papers have been peer-reviewed. The aim of ICCMSE
                 2007 is to bring together computational scientists and
                 engineers from several disciplines in order to share
                 methods, methodologies and ideas. The potential readers
                 of these proceedings are all the scientists with
                 interest in the following fields: Computational
                 Mathematics, Theoretical Physics, Computational
                 Physics, Theoretical Chemistry, Computational
                 Chemistry, Mathematical Chemistry, Computational
                 Engineering, Computational Mechanics, Computational
                 Biology and Medicine, Scientific Computation, High
                 Performance Computing, Parallel and Distributed
                 Computing, Visualization, Problem Solving Environments,
                 Software Tools, Advanced Numerical Algorithms, Modeling
                 and Simulation of Complex Systems, Web-based Simulation
                 and Computing, Grid-based Simulation and Computing,
                 Computational Grids, and Computer Science.",
  acknowledgement = ack-nhfb,
  remark =       "Two volumes.",
}

@Proceedings{Bischof:2008:AAD,
  editor =       "Christian H. Bischof and H. Martin B{\"u}cker and Paul
                 Hovland and Uwe Naumann and Jean Utke",
  booktitle =    "Advances in Automatic Differentiation",
  title =        "Advances in Automatic Differentiation",
  volume =       "64",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  bookpages =    "xviii + 362",
  pages =        "xviii + 362",
  year =         "2008",
  CODEN =        "LNCSA6",
  DOI =          "https://doi.org/10.1007/978-3-540-68942-3",
  ISBN =         "3-540-68935-4 (print), 3-540-68942-7 (e-book)",
  ISBN-13 =      "978-3-540-68935-5 (print), 978-3-540-68942-3
                 (e-book)",
  ISSN =         "1439-7358",
  ISSN-L =       "1439-7358",
  LCCN =         "QA304 .I58 2008",
  bibdate =      "Thu Dec 20 14:35:07 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCSE,
  URL =          "http://link.springer.com/book/10.1007/978-3-540-68942-3;
                 http://www.springerlink.com/content/978-3-540-68942-3",
  acknowledgement = ack-nhfb,
  remark =       "The Fifth International Conference on Automatic
                 Differentiation held from August 11 to 15, 2008 in
                 Bonn, Germany, is the most recent one in a series that
                 began in Breckenridge, USA, in 1991 and continued in
                 Santa Fe, USA, in 1996, Nice, France, in 2000 and
                 Chicago, USA, in 2004.",
  series-URL =   "http://link.springer.com/bookseries/3527",
}

@Proceedings{Chapman:2008:PPM,
  editor =       "Barbara Chapman and Weiming Zheng and Guang R. Gao and
                 Mitsuhisa Sato and Eduard Ayguad{\'e} and Dongsheng
                 Wang",
  booktitle =    "{A Practical Programming Model for the Multi-Core Era:
                 3rd International Workshop on OpenMP, IWOMP 2007,
                 Beijing, China, June 3--7, 2007. Proceedings}",
  title =        "{A Practical Programming Model for the Multi-Core Era:
                 3rd International Workshop on OpenMP, IWOMP 2007,
                 Beijing, China, June 3--7, 2007. Proceedings}",
  volume =       "4935",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "184 (est.)",
  year =         "2008",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-540-69303-1",
  ISBN =         "3-540-69302-5 (print), 3-540-69303-3 (e-book)",
  ISBN-13 =      "978-3-540-69302-4 (print), 978-3-540-69303-1
                 (e-book)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  LCCN =         "????",
  bibdate =      "Wed Dec 19 15:20:29 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/content/978-3-540-69303-1",
  acknowledgement = ack-nhfb,
}

@Proceedings{Eigenmann:2008:ONE,
  editor =       "Rudolf Eigenmann and Bronis R. de Supinski",
  booktitle =    "{OpenMP in a New Era of Parallelism: 4th International
                 Workshop, IWOMP 2008, West Lafayette, IN, USA, May
                 12--14, 2008. Proceedings}",
  title =        "{OpenMP in a New Era of Parallelism: 4th International
                 Workshop, IWOMP 2008, West Lafayette, IN, USA, May
                 12--14, 2008. Proceedings}",
  volume =       "5004",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "190 (est.)",
  year =         "2008",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-540-79561-2",
  ISBN =         "3-540-79560-X (print), 3-540-79561-8 (e-book)",
  ISBN-13 =      "978-3-540-79560-5 (print), 978-3-540-79561-2
                 (e-book)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  LCCN =         "????",
  bibdate =      "Wed Dec 19 15:21:59 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/content/978-3-540-79561-2",
  acknowledgement = ack-nhfb,
}

@Proceedings{Lastovetsky:2008:RAP,
  editor =       "Alexey Lastovetsky and Tahar Kechadi and Jack
                 Dongarra",
  booktitle =    "{Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface: 15th European PVM\slash MPI
                 Users' Group Meeting, Dublin, Ireland, September 7--10,
                 2008. Proceedings}",
  title =        "{Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface: 15th European PVM\slash MPI
                 Users' Group Meeting, Dublin, Ireland, September 7--10,
                 2008. Proceedings}",
  volume =       "5205",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "129 (est.)",
  year =         "2008",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-540-87475-1",
  ISBN =         "3-540-87474-7 (print), 3-540-87475-5 (e-book)",
  ISBN-13 =      "978-3-540-87474-4 (print), 978-3-540-87475-1
                 (e-book)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  LCCN =         "????",
  bibdate =      "Wed Dec 19 15:17:37 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/content/978-3-540-87475-1",
  acknowledgement = ack-nhfb,
}

@Proceedings{Mueller:2008:OSM,
  editor =       "Matthias S. Mueller and Barbara M. Chapman and Bronis
                 R. de Supinski and Allen D. Malony and Michael Voss",
  booktitle =    "{OpenMP Shared Memory Parallel Programming:
                 International Workshops, IWOMP 2005 and IWOMP 2006,
                 Eugene, OR, USA, June 1--4, 2005, Reims, France, June
                 12--15, 2006. Proceedings}",
  title =        "{OpenMP Shared Memory Parallel Programming:
                 International Workshops, IWOMP 2005 and IWOMP 2006,
                 Eugene, OR, USA, June 1--4, 2005, Reims, France, June
                 12--15, 2006. Proceedings}",
  volume =       "4315",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "252 (est.)",
  year =         "2008",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-540-68555-5",
  ISBN =         "3-540-68554-5 (print), 3-540-68555-3 (e-book)",
  ISBN-13 =      "978-3-540-68554-8 (print), 978-3-540-68555-5
                 (e-book)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  LCCN =         "????",
  bibdate =      "Wed Dec 19 15:24:26 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/content/978-3-540-68555-5",
  acknowledgement = ack-nhfb,
}

@Book{Nguyen:2008:GG,
  editor =       "Hubert Nguyen",
  booktitle =    "{GPU} gems 3",
  title =        "{GPU} gems 3",
  volume =       "3",
  publisher =    pub-AW,
  address =      pub-AW:adr,
  pages =        "l + 942",
  year =         "2008",
  ISBN =         "0-321-51526-9",
  ISBN-13 =      "978-0-321-51526-1",
  LCCN =         "T385 .G6882 2008",
  bibdate =      "Thu Jul 29 13:36:54 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/unix.bib;
                 z3950.loc.gov:7090/Voyager",
  series =       "GPU gems",
  URL =          "http://www.loc.gov/catdir/toc/ecip0720/2007023985.html",
  acknowledgement = ack-nhfb,
  keywords =     "CUDA; nVIDIA",
  subject =      "Computer graphics; Real-time programming",
}

@Proceedings{Muller:2009:EOA,
  editor =       "Matthias S. M{\"u}ller and Bronis R. de Supinski and
                 Barbara M. Chapman",
  booktitle =    "{Evolving OpenMP in an Age of Extreme Parallelism: 5th
                 International Workshop on OpenMP, IWOMP 2009, Dresden,
                 Germany, June 3--5, 2009. Proceedings}",
  title =        "{Evolving OpenMP in an Age of Extreme Parallelism: 5th
                 International Workshop on OpenMP, IWOMP 2009, Dresden,
                 Germany, June 3--5, 2009. Proceedings}",
  volume =       "5568",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "182 (est.)",
  year =         "2009",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-02303-3",
  ISBN =         "3-642-02284-7 (print), 3-642-02303-7 (e-book)",
  ISBN-13 =      "978-3-642-02284-5 (print), 978-3-642-02303-3
                 (e-book)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  LCCN =         "????",
  bibdate =      "Wed Dec 19 15:25:20 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/content/978-3-642-02303-3",
  acknowledgement = ack-nhfb,
}

@Proceedings{Ropo:2009:RAP,
  editor =       "Matti Ropo and Jan Westerholm and Jack Dongarra",
  booktitle =    "{Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface: 16th European PVM\slash MPI
                 Users' Group Meeting, Espoo, Finland, September 7--10,
                 2009. Proceedings}",
  title =        "{Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface: 16th European PVM\slash MPI
                 Users' Group Meeting, Espoo, Finland, September 7--10,
                 2009. Proceedings}",
  volume =       "5759",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "142 (est.)",
  year =         "2009",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/978-3-642-03770-2",
  ISBN =         "3-642-03769-0 (print), 3-642-03770-4 (e-book)",
  ISBN-13 =      "978-3-642-03769-6 (print), 978-3-642-03770-2
                 (e-book)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  LCCN =         "????",
  bibdate =      "Wed Dec 19 15:20:58 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/content/978-3-642-03770-2",
  acknowledgement = ack-nhfb,
}

%%% Parallel CFD 2007 proceedings (volume 67 of the ser-LNCSE series).
%%% In tableofcontents, each chapter title is one \\-separated item.
@Proceedings{Tuncer:2009:PCF,
  editor =       "Ismail H. Tuncer and {\"U}lgen G{\"u}lcat and David R.
                 Emerson and Kenichi Matsuno",
  booktitle =    "{Parallel Computational Fluid Dynamics 2007:
                 Implementations and Experiences on Large Scale and Grid
                 Computing}",
  title =        "{Parallel Computational Fluid Dynamics 2007:
                 Implementations and Experiences on Large Scale and Grid
                 Computing}",
  volume =       "67",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  bookpages =    "xi + 480",
  pages =        "xi + 480",
  year =         "2009",
  CODEN =        "LNCSA6",
  DOI =          "https://doi.org/10.1007/978-3-540-92744-0",
  ISBN =         "3-540-92743-3 (print), 3-540-92744-1 (e-book)",
  ISBN-13 =      "978-3-540-92743-3 (print), 978-3-540-92744-0
                 (e-book)",
  ISSN =         "1439-7358",
  ISSN-L =       "1439-7358",
  LCCN =         "????",
  bibdate =      "Thu Dec 20 14:35:19 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Parallel CFD 2007 was held in Antalya, Turkey, from
                 May 21 to 24, 2007.",
  series =       ser-LNCSE,
  URL =          "http://link.springer.com/book/10.1007/978-3-540-92744-0;
                 http://www.springerlink.com/content/978-3-540-92744-0",
  acknowledgement = ack-nhfb,
  series-URL =   "http://link.springer.com/bookseries/3527",
  tableofcontents = "Numerical Simulation of a Spinning Projectile Using
                 Parallel and Vectorized Unstructured Flow Solver \\
                 Development of a Framework for Parallel Simulators with
                 Various Physics and its Performance \\
                 Experience in Parallel Computational Mechanics on
                 Marenostrum \\
                 New Approaches to Modeling Rarefied Gas Flow in the
                 Slip and Transition Regime \\
                 A Parallel Scientific Software for Heterogeneous
                 Hydrogeology \\
                 Aerodynamic Shape Optimization Methods on
                 Multiprocessor Platforms \\
                 Non-Sinusoidal Path Optimization of Dual Airfoils
                 Flapping in a Biplane Configuration \\
                 Parallel Computation of $3$-D Viscous Flows on Hybrid
                 Grids \\
                 Implementation of Parallel DSMC Method to Adiabatic
                 Piston Problem \\
                 Efficient Parallel Algorithm for Multiconstrained
                 Optimization of Wing-Body Configurations \\
                 Parallel Three Dimensional Direct Simulation Monte
                 Carlo for Simulating Micro Flows \\
                 A Study on the Prediction of the Aerodynamic
                 Characteristics of an Orbital Block of a Launch Vehicle
                 in the Rarefied Flow Regime Using the DSMC Approach and
                 the Parallel Computation \\
                 Parallel Solution of a $3$-D Mixed Convection Problem \\
                 Computation of Hypersonic Flow of a Diatomic Gas in
                 Rotational Non-Equilibrium Past a Blunt Body Using the
                 Generalized Boltzmann Equation \\
                 Application of Parallel Processing to Numerical
                 Modeling of Two-Phase Deflagration-to-Detonation (DDT)
                 Phenomenon \\
                 Highly Scalable Multiphysics Computational Framework
                 for Propulsive Energetic Systems \\
                 A Parallel Aitken-Additive Schwarz Waveform Relaxation
                 Method for Parabolic Problems \\
                 Parallel Computation of Incompressible Flows Driven by
                 Moving Multiple Obstacles Using a New Moving
                 Embedded-Grid Method \\
                 Parallel Computing on Network of Windows Based PCs \\
                 Parallel Computations of Droplet Oscillations \\
                 Cyclic Distribution of Pipelined Parallel Deferred
                 Correction Method for ODE/DAE \\
                 Hybrid Parallelization Techniques for Lattice Boltzmann
                 Free Surface Flows \\
                 Flow-Structure Interaction and Flow Analysis of
                 Hydraulic Machinery on a Computational Grid \\
                 Parallel Computation of Incompressible Flow Using
                 Building-Cube Method \\
                 $3$D Model of Pollution Distribution in City Air and
                 its Parallel Realization \\
                 Parallel Navier-Stokes Solution of a Wing-Flap
                 Configuration on Structured Multi-Block Oversetting
                 Grids \\
                 Parallel Navier-Stokes Solutions of NASA 65$^\circ$
                 Delta-Wing \\
                 Parallel Turbulent Navier-Stokes Solutions of Wing
                 alone Geometries for Drag Prediction \\
                 Adaptive Aitken-Schwarz for Darcy $3$D Flow on
                 Heterogeneous Media \\
                 Numerical Simulation of Compressible Flow using
                 Three-Dimensional Unstructured Added/Eliminated Grid
                 Method \\
                 Technology of Parallelization for $2$D and $3$D CFD/CAA
                 Codes based on High-Accuracy Explicit Methods on
                 Unstructured Meshes \\
                 Separate Treatment of Momentum and Heat Flows in
                 Parallel Environment \\
                 DNS of Turbulent Natural Convection Flows on the Mare
                 Nostrum Supercomputer \\
                 Termo Fluids: A New Parallel Unstructured CFD Code for
                 the Simulation of Turbulent Industrial Problems on Low
                 Cost PC Cluster",
}

%%% Proceedings of PDGC 2010 (Jaypee University of Information
%%% Technology, 28--30 October 2010).  The last editor's name was
%%% corrected from the OCR-damaged form "Oeepak Oahiya".
@Proceedings{Chaudhuri:2010:PIC,
  editor =       "Pranay Chaudhuri and Sukumar Ghosh and Raj Kumar Buyya
                 and Jian-Nong Cao and Deepak Dahiya",
  booktitle =    "{Proceedings of the 2010 1st International Conference
                 on Parallel Distributed and Grid Computing (PDGC),
                 Jaypee University of Information Technology Waknaghat,
                 Solan, HP, India, 28--30 October, 2010}",
  title =        "{Proceedings of the 2010 1st International Conference
                 on Parallel Distributed and Grid Computing (PDGC),
                 Jaypee University of Information Technology Waknaghat,
                 Solan, HP, India, 28--30 October, 2010}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  bookpages =    "xiii + 382",
  pages =        "xiii + 382",
  year =         "2010",
  ISBN =         "1-4244-7675-5",
  ISBN-13 =      "978-1-4244-7675-6",
  LCCN =         "????",
  bibdate =      "Thu Apr 21 10:51:00 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
}

%%% Proceedings volume of EuroMPI 2010 (17th European MPI Users' Group
%%% Meeting, Stuttgart); title and booktitle carry the same text.
@Proceedings{Keller:2010:RAM,
  editor =       {Rainer Keller and Edgar Gabriel and Michael Resch and Jack
                 Dongarra},
  title =        {{Recent Advances in the Message Passing Interface: 17th
                 European MPI Users' Group Meeting, EuroMPI 2010, Stuttgart,
                 Germany, September 12--15, 2010. Proceedings}},
  booktitle =    {{Recent Advances in the Message Passing Interface: 17th
                 European MPI Users' Group Meeting, EuroMPI 2010, Stuttgart,
                 Germany, September 12--15, 2010. Proceedings}},
  series =       ser-LNCS,
  volume =       {6305},
  publisher =    pub-SV,
  address =      pub-SV:adr,
  year =         {2010},
  pages =        {197 (est.)},
  ISBN =         {3-642-15645-2 (print), 3-642-15646-0 (e-book)},
  ISBN-13 =      {978-3-642-15645-8 (print), 978-3-642-15646-5 (e-book)},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  CODEN =        {LNCSD9},
  LCCN =         {????},
  DOI =          {https://doi.org/10.1007/978-3-642-15646-5},
  URL =          {http://www.springerlink.com/content/978-3-642-15646-5},
  bibdate =      {Wed Dec 19 15:24:14 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% Proceedings volume of IWOMP 2010 (6th International Workshop on
%%% OpenMP, Tsukuba); title and booktitle carry the same text.
@Proceedings{Sato:2010:BLL,
  editor =       {Mitsuhisa Sato and Toshihiro Hanawa and Matthias S.
                 M{\"u}ller and Barbara M. Chapman and Bronis R. de
                 Supinski},
  title =        {{Beyond Loop Level Parallelism in OpenMP: Accelerators,
                 Tasking and More: 6th International Workshop on OpenMP,
                 IWOMP 2010, Tsukuba, Japan, June 14--16, 2010
                 Proceedings}},
  booktitle =    {{Beyond Loop Level Parallelism in OpenMP: Accelerators,
                 Tasking and More: 6th International Workshop on OpenMP,
                 IWOMP 2010, Tsukuba, Japan, June 14--16, 2010
                 Proceedings}},
  series =       ser-LNCS,
  volume =       {6132},
  publisher =    pub-SV,
  address =      pub-SV:adr,
  year =         {2010},
  pages =        {173 (est.)},
  ISBN =         {3-642-13216-2 (print), 3-642-13217-0 (e-book)},
  ISBN-13 =      {978-3-642-13216-2 (print), 978-3-642-13217-9 (e-book)},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  CODEN =        {LNCSD9},
  LCCN =         {????},
  DOI =          {https://doi.org/10.1007/978-3-642-13217-9},
  URL =          {http://www.springerlink.com/content/978-3-642-13217-9},
  bibdate =      {Wed Dec 19 15:20:26 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% SC '11 State of the Practice Reports; the editor is the corporate
%%% name ACM, kept inside an extra brace pair so it is not name-parsed.
@Proceedings{ACM:2011:SSP,
  editor =       {{ACM}},
  title =        {{SC '11 State of the Practice Reports}},
  booktitle =    {{SC '11 State of the Practice Reports}},
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  year =         {2011},
  ISBN =         {1-4503-1139-3},
  ISBN-13 =      {978-1-4503-1139-7},
  LCCN =         {????},
  bibdate =      {Fri Dec 16 11:20:09 2011},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib},
  acknowledgement = ack-nhfb,
}

%%% Proceedings volume of IWOMP 2011 (7th International Workshop on
%%% OpenMP, Chicago); title and booktitle carry the same text.
@Proceedings{Chapman:2011:OPE,
  editor =       {Barbara M. Chapman and William D. Gropp and Kalyan Kumaran
                 and Matthias S. M{\"u}ller},
  title =        {{OpenMP in the Petascale Era: 7th International Workshop
                 on OpenMP, IWOMP 2011, Chicago, IL, USA, June 13--15, 2011.
                 Proceedings}},
  booktitle =    {{OpenMP in the Petascale Era: 7th International Workshop
                 on OpenMP, IWOMP 2011, Chicago, IL, USA, June 13--15, 2011.
                 Proceedings}},
  series =       ser-LNCS,
  volume =       {6665},
  publisher =    pub-SV,
  address =      pub-SV:adr,
  year =         {2011},
  pages =        {178 (est.)},
  ISBN =         {3-642-21486-X (print), 3-642-21487-8 (e-book)},
  ISBN-13 =      {978-3-642-21486-8 (print), 978-3-642-21487-5 (e-book)},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  CODEN =        {LNCSD9},
  LCCN =         {????},
  DOI =          {https://doi.org/10.1007/978-3-642-21487-5},
  URL =          {http://www.springerlink.com/content/978-3-642-21487-5},
  bibdate =      {Wed Dec 19 15:23:23 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% Proceedings volume of EuroMPI 2011 (18th European MPI Users' Group
%%% Meeting, Santorini); title and booktitle carry the same text.
@Proceedings{Cotronis:2011:RAM,
  editor =       {Yiannis Cotronis and Anthony Danalis and Dimitrios S.
                 Nikolopoulos and Jack Dongarra},
  title =        {{Recent Advances in the Message Passing Interface: 18th
                 European MPI Users' Group Meeting, EuroMPI 2011, Santorini,
                 Greece, September 18--21, 2011. Proceedings}},
  booktitle =    {{Recent Advances in the Message Passing Interface: 18th
                 European MPI Users' Group Meeting, EuroMPI 2011, Santorini,
                 Greece, September 18--21, 2011. Proceedings}},
  series =       ser-LNCS,
  volume =       {6960},
  publisher =    pub-SV,
  address =      pub-SV:adr,
  year =         {2011},
  pages =        {177 (est.)},
  ISBN =         {3-642-24448-3 (print), 3-642-24449-1 (e-book)},
  ISBN-13 =      {978-3-642-24448-3 (print), 978-3-642-24449-0 (e-book)},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  CODEN =        {LNCSD9},
  LCCN =         {????},
  DOI =          {https://doi.org/10.1007/978-3-642-24449-0},
  URL =          {http://www.springerlink.com/content/978-3-642-24449-0},
  bibdate =      {Wed Dec 19 15:21:14 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% SC'11 main proceedings (Seattle, WA, November 2011).  Publisher and
%%% address are built by concatenating the ACM and IEEE string macros;
%%% xxeditor keeps an alternative corporate editor value.
@Proceedings{Lathrop:2011:SPI,
  editor =       {Scott Lathrop and Jim Costa and William Kramer},
  title =        {{SC'11: Proceedings of 2011 International Conference for
                 High Performance Computing, Networking, Storage and
                 Analysis, Seattle, WA, November 12--18 2011}},
  booktitle =    {{SC'11: Proceedings of 2011 International Conference for
                 High Performance Computing, Networking, Storage and
                 Analysis, Seattle, WA, November 12--18 2011}},
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  year =         {2011},
  pages =        {????},
  ISBN =         {1-4503-0771-X},
  ISBN-13 =      {978-1-4503-0771-0},
  LCCN =         {????},
  bibdate =      {Fri Dec 16 11:11:35 2011},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib},
  acknowledgement = ack-nhfb,
  xxeditor =     {{ACM}},
}

%%% Parallel CFD 2008 proceedings (volume 74 of the ser-LNCSE series).
%%% tableofcontents lists the front matter and part headings only.
@Proceedings{Tromeur-Dervout:2011:PCF,
  editor =       "Damien Tromeur-Dervout and Gunther Brenner and David
                 R. Emerson and Jocelyne Erhel",
  booktitle =    "{Parallel Computational Fluid Dynamics 2008: Parallel
                 Numerical Methods, Software Development and
                 Applications}",
  title =        "{Parallel Computational Fluid Dynamics 2008: Parallel
                 Numerical Methods, Software Development and
                 Applications}",
  volume =       "74",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  bookpages =    "xi + 432",
  pages =        "xi + 432",
  year =         "2011",
  CODEN =        "LNCSA6",
  DOI =          "https://doi.org/10.1007/978-3-642-14438-7",
  ISBN =         "3-642-14437-3 (print), 3-642-14438-1 (e-book)",
  ISBN-13 =      "978-3-642-14437-0 (print), 978-3-642-14438-7
                 (e-book)",
  ISSN =         "1439-7358",
  ISSN-L =       "1439-7358",
  LCCN =         "????",
  bibdate =      "Thu Dec 20 14:35:30 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  note =         "Proceedings of the twentieth meeting, Parallel CFD
                 2008, held May 19--22, 2008 in Lyon, France.",
  series =       ser-LNCSE,
  URL =          "http://link.springer.com/book/10.1007/978-3-642-14438-7;
                 http://www.springerlink.com/content/978-3-642-14438-7",
  acknowledgement = ack-nhfb,
  series-URL =   "http://link.springer.com/bookseries/3527",
  tableofcontents = "Preface \\
                 Contents \\
                 Part I: Invited speakers \\
                 Part II: Optimisation in Aerodynamics Design \\
                 Part III: Grid methods \\
                 Part IV: Boundary methods \\
                 Part V: High Order methods \\
                 Part VI: Parallel Algorithms and Solvers \\
                 Part VII: Lattice Boltzmann and SPH Methods \\
                 Part VIII: Software Framework and Component
                 Architecture \\
                 Part IX: Parallel Performance \\
                 Part X: Environment and biofluids applications \\
                 Part XI: General fluid \\
                 Editorial Policy",
}

%%% Proceedings volume of IWOMP 2012 (8th International Workshop on
%%% OpenMP, Rome); title and booktitle carry the same text.
@Proceedings{Chapman:2012:OHW,
  editor =       {Barbara M. Chapman and Federico Massaioli and Matthias S.
                 M{\"u}ller and Marco Rorro},
  title =        {{OpenMP in a Heterogeneous World: 8th International
                 Workshop on OpenMP, IWOMP 2012, Rome, Italy, June 11--13,
                 2012. Proceedings}},
  booktitle =    {{OpenMP in a Heterogeneous World: 8th International
                 Workshop on OpenMP, IWOMP 2012, Rome, Italy, June 11--13,
                 2012. Proceedings}},
  series =       ser-LNCS,
  volume =       {7312},
  publisher =    pub-SV,
  address =      pub-SV:adr,
  year =         {2012},
  pages =        {257 (est.)},
  ISBN =         {3-642-30960-7 (print), 3-642-30961-5 (e-book)},
  ISBN-13 =      {978-3-642-30960-1 (print), 978-3-642-30961-8 (e-book)},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  CODEN =        {LNCSD9},
  LCCN =         {????},
  DOI =          {https://doi.org/10.1007/978-3-642-30961-8},
  URL =          {http://www.springerlink.com/content/978-3-642-30961-8},
  bibdate =      {Wed Dec 19 15:19:49 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% SC '12 main proceedings (Salt Lake City, UT, November 2012); title
%%% and booktitle carry the same text.
@Proceedings{Hollingsworth:2012:SPI,
  editor =       {Jeffrey Hollingsworth},
  title =        {{SC '12: Proceedings of the International Conference on
                 High Performance Computing, Networking, Storage and
                 Analysis, Salt Lake Convention Center, Salt Lake City, UT,
                 USA, November 10--16, 2012}},
  booktitle =    {{SC '12: Proceedings of the International Conference on
                 High Performance Computing, Networking, Storage and
                 Analysis, Salt Lake Convention Center, Salt Lake City, UT,
                 USA, November 10--16, 2012}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  year =         {2012},
  ISBN =         {1-4673-0804-8},
  ISBN-13 =      {978-1-4673-0804-5},
  bibdate =      {Thu Nov 15 07:35:55 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib},
  acknowledgement = ack-nhfb,
}

%%% Proceedings volume of EuroMPI 2012 (19th European MPI Users' Group
%%% Meeting, Vienna); title and booktitle carry the same text.
@Proceedings{Traff:2012:RAM,
  editor =       {Jesper Larsson Tr{\"a}ff and Siegfried Benkner and Jack J.
                 Dongarra},
  title =        {{Recent Advances in the Message Passing Interface: 19th
                 European MPI Users' Group Meeting, EuroMPI 2012, Vienna,
                 Austria, September 23--26, 2012. Proceedings}},
  booktitle =    {{Recent Advances in the Message Passing Interface: 19th
                 European MPI Users' Group Meeting, EuroMPI 2012, Vienna,
                 Austria, September 23--26, 2012. Proceedings}},
  series =       ser-LNCS,
  volume =       {7490},
  publisher =    pub-SV,
  address =      pub-SV:adr,
  year =         {2012},
  pages =        {162 (est.)},
  ISBN =         {3-642-33517-9 (print), 3-642-33518-7 (e-book)},
  ISBN-13 =      {978-3-642-33517-4 (print), 978-3-642-33518-1 (e-book)},
  ISSN =         {0302-9743 (print), 1611-3349 (electronic)},
  ISSN-L =       {0302-9743},
  CODEN =        {LNCSD9},
  LCCN =         {????},
  DOI =          {https://doi.org/10.1007/978-3-642-33518-1},
  URL =          {http://www.springerlink.com/content/978-3-642-33518-1},
  bibdate =      {Wed Dec 19 15:23:42 MST 2012},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/lncs.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib},
  acknowledgement = ack-nhfb,
}

%%% Domain Decomposition Methods XXI (volume 98 of the ser-LNCSE
%%% series).  The URL is the public Springer landing page for the DOI
%%% rather than an institution-specific library proxy address.
@Proceedings{Erhel:2014:DDM,
  editor =       "Jocelyne Erhel and Martin J. Gander and Laurence
                 Halpern and G{\'e}raldine Pichot and Taoufik Sassi and
                 Olof Widlund",
  booktitle =    "Domain Decomposition Methods in Science and
                 Engineering {XXI}",
  title =        "Domain Decomposition Methods in Science and
                 Engineering {XXI}",
  volume =       "98",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xx + 973 + 220",
  year =         "2014",
  DOI =          "https://doi.org/10.1007/978-3-319-05789-7",
  ISBN =         "3-319-05788-X (paperback), 3-319-05789-8 (e-book)",
  ISBN-13 =      "978-3-319-05788-0 (paperback), 978-3-319-05789-7
                 (e-book)",
  ISSN =         "1439-7358 (print), 2197-7100 (electronic)",
  ISSN-L =       "1439-7358",
  LCCN =         "QA71-90",
  bibdate =      "Sat Dec 12 10:43:35 MST 2015",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCSE,
  URL =          "http://link.springer.com/book/10.1007/978-3-319-05789-7",
  abstract =     "This volume contains a selection of papers presented
                 at the 21st international conference on domain
                 decomposition methods in science and engineering held
                 in Rennes, France, June 25--29, 2012. Domain
                 decomposition is an active and interdisciplinary
                 research discipline, focusing on the development,
                 analysis and implementation of numerical methods for
                 massively parallel computers. Domain decomposition
                 methods are among the most efficient solvers for large
                 scale applications in science and engineering. They are
                 based on a solid theoretical foundation and shown to be
                 scalable for many important applications. Domain
                 decomposition techniques can also naturally take into
                 account multiscale phenomena. This book contains the
                 most recent results in this important field of
                 research, both mathematically and algorithmically and
                 allows the reader to get an overview of this exciting
                 branch of numerical analysis and scientific
                 computing.",
  acknowledgement = ack-nhfb,
  tableofcontents = "Preface \\
                 Part I: Plenary Presentations \\
                 Part II: Minisymposia \\
                 Part III: Contributed Presentations",
}

%%% CE2014 workshop volume (volume 105 of the ser-LNCSE series).  The
%%% DOI is formed from the e-book ISBN-13 as in the neighbouring Springer
%%% entries; ISSN and ISSN-L are the series values used by the other
%%% ser-LNCSE entry with print and electronic ISSNs.
@Book{Mehl:2015:RTC,
  editor =       "Miriam Mehl and Manfred Bischoff and Michael
                 Sch{\"a}fer",
  booktitle =    "Recent Trends in Computational Engineering ---
                 {CE2014}: Optimization, Uncertainty, Parallel
                 Algorithms, Coupled and Complex Problems",
  title =        "Recent Trends in Computational Engineering ---
                 {CE2014}: Optimization, Uncertainty, Parallel
                 Algorithms, Coupled and Complex Problems",
  volume =       "105",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "317 (est.)",
  year =         "2015",
  DOI =          "https://doi.org/10.1007/978-3-319-22997-3",
  ISBN =         "3-319-22996-6, 3-319-22997-4 (e-book)",
  ISBN-13 =      "978-3-319-22996-6, 978-3-319-22997-3 (e-book)",
  ISSN =         "1439-7358 (print), 2197-7100 (electronic)",
  ISSN-L =       "1439-7358",
  LCCN =         "QA71-90; TA329",
  bibdate =      "Sat Dec 12 10:43:43 MST 2015",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 http://www.math.utah.edu/pub/tex/bib/lncse.bib;
                 http://www.math.utah.edu/pub/tex/bib/matlab.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  series =       ser-LNCSE,
  URL =          "http://www.springerlink.com/content/978-3-319-22997-3",
  acknowledgement = ack-nhfb,
  meetingname =  "International Workshop on Computational Engineering
                 (3rd : 2014 : Stuttgart, Germany)",
  subject =      "Engineering mathematics; Congresses; TECHNOLOGY and
                 ENGINEERING / Engineering (General); TECHNOLOGY and
                 ENGINEERING / Reference; Engineering mathematics.",
}